diff --git a/distill-qwen-7b_skywork/global_step_100/.DS_Store b/distill-qwen-7b_skywork/global_step_100/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_100/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_100/config.json b/distill-qwen-7b_skywork/global_step_100/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_100/generation_config.json b/distill-qwen-7b_skywork/global_step_100/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_100/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00001-of-00004.safetensors deleted file mode 100644 index 215a01ad2f49a36bcf4c85648771ecb726af0d5e..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9dc6f289f92f0cf5876ce95f24d859949319a7911d598ad87d37b1402d4a834c -size 4962027176 diff --git a/distill-qwen-7b_skywork/global_step_100/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00002-of-00004.safetensors deleted file mode 100644 index 1172736396cca2d7e2d6224c8304d1c4e89f1a34..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:126f733aed57974fe690e965c481471ae138ea13cb789293d05472159ca85294 -size 4984129904 diff --git a/distill-qwen-7b_skywork/global_step_100/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00003-of-00004.safetensors deleted file mode 100644 index 40bd630c6a43ed55d96565a48c3b2ad94e54d9cb..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:74a953b91caacafdd5c100d42dd8c230816a00a3aa84fac7a68893c0c12dbd8b -size 4929112864 diff --git a/distill-qwen-7b_skywork/global_step_100/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_100/model-00004-of-00004.safetensors deleted file mode 100644 index 57ea8c08f126a5050a1221c7fabe470a4161c477..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b9f92c6ca72e5aa27596e8cd575a7d4ae7e0f14fe6eaf89883082f545c268fa -size 356001928 diff --git a/distill-qwen-7b_skywork/global_step_100/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_100/model.safetensors.index.json deleted file mode 100644 index af3b6bcb876abad01cd17b8994977cfa314efb11..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00001-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00003-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_100/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_100/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_100/tokenizer.json b/distill-qwen-7b_skywork/global_step_100/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_100/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_100/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_100/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_1000/.DS_Store b/distill-qwen-7b_skywork/global_step_1000/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_1000/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_1000/config.json b/distill-qwen-7b_skywork/global_step_1000/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_1000/generation_config.json b/distill-qwen-7b_skywork/global_step_1000/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00001-of-00004.safetensors deleted file mode 100644 index 65967d7e6d5aa7dc4451633b6d7de91b43ca4a04..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:971ae73173402bb48f92ef52c3668360b5c7523bac0afbe08ca06fcb779a6a59 -size 4184094576 diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00002-of-00004.safetensors deleted file mode 100644 index 166f510a6f3b3e56adc8b2805515ef7e908d8827..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ea1ffc97c0e366ebb6ae97e7bddcda1db54833d4324d488dc8e9e0cd2bd3236 -size 4991452152 diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00003-of-00004.safetensors deleted file mode 100644 index a84618b2845c462cc6c9abe9934a84b86659429e..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe05f227cb9e540765dd51c1d413a7e7ac414a7362fa79200fd29189b8307250 -size 4991370656 diff --git a/distill-qwen-7b_skywork/global_step_1000/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1000/model-00004-of-00004.safetensors deleted file mode 100644 index 6f38144be6e5ee35db9330a8c470584acb0b5c22..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6c6087c61af0964c573377081d9fc17514427bd60c6dc6ff6950550cef60c5c -size 1064354488 diff --git a/distill-qwen-7b_skywork/global_step_1000/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1000/model.safetensors.index.json deleted file mode 100644 index 96369923233b9a962ab1eb704b7460e040396214..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00001-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.norm.weight": "model-00003-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_1000/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1000/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_1000/tokenizer.json b/distill-qwen-7b_skywork/global_step_1000/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_1000/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1000/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1000/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_1100/.DS_Store b/distill-qwen-7b_skywork/global_step_1100/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_1100/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_1100/config.json b/distill-qwen-7b_skywork/global_step_1100/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_1100/generation_config.json b/distill-qwen-7b_skywork/global_step_1100/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00001-of-00004.safetensors deleted file mode 100644 index 818d31f9d39c8a6b335ea56712cd3fe01bac01b3..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc8e945b1a813496d7c7898fdd6830ea08acf55d77f9664609740a995232985c -size 4283087968 diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00002-of-00004.safetensors deleted file mode 100644 index ea06780fc72522e200ee88d875a46598bf0f5188..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:09dcad711b125f9919aba8a6a381b105c7718baa92d2b6933e5105ce485052d5 -size 4954685520 diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00003-of-00004.safetensors deleted file mode 100644 index cc2ddb85a26978b5b09a826e171f8ad1e70aa948..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:448e2a6eaf2bc8f0e6f7ccac6af29f6c7478798738d19db18b7259ca1ab1c2d3 -size 4914443120 diff --git a/distill-qwen-7b_skywork/global_step_1100/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1100/model-00004-of-00004.safetensors deleted file mode 100644 index fee21e8c2e1d8ac1f4078c760db8d0df1b06dcbe..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2fe962b869f59148380ce3c203fdcf18a51819efa5bc6b44f4704fc26f423d1c -size 1079055224 diff --git a/distill-qwen-7b_skywork/global_step_1100/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1100/model.safetensors.index.json deleted file mode 100644 index c180e5258fb9dc2b9864b49c9fab2eb29f7dac61..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.norm.weight": "model-00002-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_1100/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1100/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_1100/tokenizer.json b/distill-qwen-7b_skywork/global_step_1100/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_1100/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1100/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1100/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_1200/.DS_Store b/distill-qwen-7b_skywork/global_step_1200/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_1200/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_1200/config.json b/distill-qwen-7b_skywork/global_step_1200/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_1200/generation_config.json b/distill-qwen-7b_skywork/global_step_1200/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00001-of-00004.safetensors deleted file mode 100644 index c0a9fab1f448fdb8b133e500fa9b5796977939bc..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e7a89a83e519f329468f89ccd082fb4e63858cbfc4455cab98d7782752b06fb -size 4998801712 diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00002-of-00004.safetensors deleted file mode 100644 index 5ad495c464fead72065927ea0e21d90104d19ef5..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:934f2356e826d0a9990b2f2defdc572b67f17cf236977a635396fac5b2178cb6 -size 4995129568 diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00003-of-00004.safetensors deleted file mode 100644 index e7b9af08ddc567967fdee708029334ad232b6aac..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c786059b2f8659a0959012886ef44f5375c1bbc70f37ef47225bcddd74edfb13 -size 4936361296 diff --git a/distill-qwen-7b_skywork/global_step_1200/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1200/model-00004-of-00004.safetensors deleted file mode 100644 index ac9e8f1fe11798cc6b678688a8c226c72fa37d06..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3647c6edaff02019bb1e6068924653a7e8d9bf55189e429f0100c2ba308dbfb7 -size 300979328 diff --git a/distill-qwen-7b_skywork/global_step_1200/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1200/model.safetensors.index.json deleted file mode 100644 index 217610f418679677d301bfce864cfc0c25f40dcf..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00001-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.norm.weight": "model-00002-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_1200/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1200/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_1200/tokenizer.json b/distill-qwen-7b_skywork/global_step_1200/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_1200/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1200/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1200/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_1300/.DS_Store b/distill-qwen-7b_skywork/global_step_1300/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_1300/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_1300/config.json b/distill-qwen-7b_skywork/global_step_1300/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_1300/generation_config.json b/distill-qwen-7b_skywork/global_step_1300/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00001-of-00004.safetensors deleted file mode 100644 index a805947a1dd2d06d3c9d767e2115772d633aa387..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47f10fe9bae60b03dfd7885cce6923331f95332375d9acce18ca1a27055c3506 -size 4951095456 diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00002-of-00004.safetensors deleted file mode 100644 index 5b394bcede04b46b288298d6679ba7191c9b5e33..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6afd8e8077831a61084f68c88e58a80a393990d174a1e58c464fcd7697faa22 -size 4995138368 diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00003-of-00004.safetensors deleted file mode 100644 index a6d4fa7a5eda1a0234fc144a6bffd3127a1645d7..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7d99a8eb68d2b5aded9c4107febb970f6785de109844c2d2d6d90e2226c6f3d2 -size 4984092808 diff --git a/distill-qwen-7b_skywork/global_step_1300/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1300/model-00004-of-00004.safetensors deleted file mode 100644 index 9fb5f5e0477af377edf403ee186fd8b610c6a8fb..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:24090e6b8b8dc6b5f21464363354bb781b7b2e91b1448051d36be77b218a9602 -size 300945168 diff --git a/distill-qwen-7b_skywork/global_step_1300/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1300/model.safetensors.index.json deleted file mode 100644 index 937e1036823ae591dd95c10da2d76fa263a9a67b..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00001-of-00004.safetensors", - "model.embed_tokens.weight": "model-00001-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00001-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_1300/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1300/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_1300/tokenizer.json b/distill-qwen-7b_skywork/global_step_1300/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_1300/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1300/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1300/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_1400/.DS_Store b/distill-qwen-7b_skywork/global_step_1400/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_1400/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_1400/config.json b/distill-qwen-7b_skywork/global_step_1400/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_1400/generation_config.json b/distill-qwen-7b_skywork/global_step_1400/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00001-of-00004.safetensors deleted file mode 100644 index a1e519a23f6cbbd7d5c3d6d94630e26498729dbe..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82b09e3d2b014653c1c248738bd3297562b299f7cd97d6e8ed74b5519c140fa7 -size 4947397128 diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00002-of-00004.safetensors deleted file mode 100644 index 9ab3b84354fa77310c414ff2692ce0f5ffccbfef..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a360b4fd107a1a52b071bc547eceb1773658540a579467118a152d31a4dc72bb -size 4187708088 diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00003-of-00004.safetensors deleted file mode 100644 index 7a6132ff74f52f513f6803ecf4b1ffd570eadc43..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c912cb1d46f15111c2bc252d88705f2a639b2855b189dc8363967d530983f9d -size 4929029456 diff --git a/distill-qwen-7b_skywork/global_step_1400/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1400/model-00004-of-00004.safetensors deleted file mode 100644 index 96cc35943adbd885365c502ba0fcfc58363dfdfb..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb505fd01d5a806cc61cd0b5b81fae160aaf59295a8af902ac9a580f1eb31015 -size 1167137192 diff --git a/distill-qwen-7b_skywork/global_step_1400/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1400/model.safetensors.index.json deleted file mode 100644 index b9deaf3e9af95ff95cd9b577564dd2b05edcfef3..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00004-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_1400/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1400/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_1400/tokenizer.json b/distill-qwen-7b_skywork/global_step_1400/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_1400/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1400/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1400/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_1500/.DS_Store b/distill-qwen-7b_skywork/global_step_1500/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_1500/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_1500/config.json b/distill-qwen-7b_skywork/global_step_1500/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_1500/generation_config.json b/distill-qwen-7b_skywork/global_step_1500/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00001-of-00004.safetensors deleted file mode 100644 index 7c79a71cbd7b6fec1bc2b12bb349e26fe99568b4..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:336ac46e60479c19658f3c0065a39ef10a358111bf439f2ecdc15e51ca02b51c -size 4980433808 diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00002-of-00004.safetensors deleted file mode 100644 index 1e049570ef4f9215630c4f867637a8de07d4e292..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a5e0e3cb71b4694925705f2bc9cbe7f683ec770ac84ee82cc7e992526532533 -size 4914411904 diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00003-of-00004.safetensors deleted file mode 100644 index 22998e5c2e5dc1f7dea3e5fc989706827ebe5e13..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1594bcffe7bc376316c8dd4cc2f04efaacdaef6f0576de70b4f7a59d50d80731 -size 4895993248 diff --git a/distill-qwen-7b_skywork/global_step_1500/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_1500/model-00004-of-00004.safetensors deleted file mode 100644 index 322360a1378a959a3402cddc77996bd8eedf70b8..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:581112b9c0d111481a5c0bf06227b237329754eec6113e21af5e8ca4ca16332d -size 440432904 diff --git a/distill-qwen-7b_skywork/global_step_1500/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_1500/model.safetensors.index.json deleted file mode 100644 index af716bc34b88ddfa57a57f77cdf79bff42101966..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00001-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.norm.weight": "model-00001-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_1500/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_1500/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_1500/tokenizer.json b/distill-qwen-7b_skywork/global_step_1500/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_1500/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_1500/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_1500/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_200/.DS_Store b/distill-qwen-7b_skywork/global_step_200/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_200/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_200/config.json b/distill-qwen-7b_skywork/global_step_200/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_200/generation_config.json b/distill-qwen-7b_skywork/global_step_200/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_200/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00001-of-00004.safetensors deleted file mode 100644 index e40cb6e9501942411089b78c861f9a4fd5ab9ca3..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:81d8b4fca9b10c19cb360a15e14717176fa0498b493a37270fa872fc7b23ab8c -size 4947504304 diff --git a/distill-qwen-7b_skywork/global_step_200/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00002-of-00004.safetensors deleted file mode 100644 index c321265ea94e0a7716f7d901317de701108c2782..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64a974dbcb5aafc8caaf3c98a7465d656fb7ed8a599dc6881d6c332b23367c7f -size 4084900936 diff --git a/distill-qwen-7b_skywork/global_step_200/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00003-of-00004.safetensors deleted file mode 100644 index 5b257e730a334782f24268e8b6c410b303f08c42..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d5409a381b498f85d5d915b4c9b1c66303e91fed1270b4d0bcdc53656ae312a -size 4921678760 diff --git a/distill-qwen-7b_skywork/global_step_200/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_200/model-00004-of-00004.safetensors deleted file mode 100644 index 952197cf418a52e7fb41d0154cddb618b80387de..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bebf9030b36c5b7b3f963b91ee74dad7f822b6659ef248b072655bdce895d8eb -size 1277187904 diff --git a/distill-qwen-7b_skywork/global_step_200/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_200/model.safetensors.index.json deleted file mode 100644 index 9b382620d0653556962aee51816e93fbc0e33f17..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00004-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.norm.weight": "model-00001-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_200/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_200/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_200/tokenizer.json b/distill-qwen-7b_skywork/global_step_200/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_200/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_200/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_200/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_300/.DS_Store b/distill-qwen-7b_skywork/global_step_300/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_300/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_300/config.json b/distill-qwen-7b_skywork/global_step_300/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_300/generation_config.json b/distill-qwen-7b_skywork/global_step_300/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_300/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00001-of-00004.safetensors deleted file mode 100644 index dfd7cdb35bcdd56d31860772acace106b3cac04b..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:331f37ea0a304e50b4ad6e912e148ee273290d57b14b6b0d0ff6767dfcbe9927 -size 4954832976 diff --git a/distill-qwen-7b_skywork/global_step_300/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00002-of-00004.safetensors deleted file mode 100644 index 3c443e97fd5857702382da88a085e107d3a64621..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3849012076a7cc74415e145196c26cfad3bc9445fcb173cfd04f88f478cac752 -size 4106921944 diff --git a/distill-qwen-7b_skywork/global_step_300/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00003-of-00004.safetensors deleted file mode 100644 index 4a1eacd533eb78f6e7564bf7baf4d05ffc792574..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e4a912a681a1ddff32ae4e0e01e05811f22fd464d22b39008312adea3ca12b0 -size 4866528464 diff --git a/distill-qwen-7b_skywork/global_step_300/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_300/model-00004-of-00004.safetensors deleted file mode 100644 index 3b0930a8898fc66d7f6c487a48f04efe0915a5a5..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:744adb0189d5af47dae565fdefbe126034627d072c04f8f3df74f7d79b626822 -size 1302988440 diff --git a/distill-qwen-7b_skywork/global_step_300/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_300/model.safetensors.index.json deleted file mode 100644 index a4529830414761188d82c035f2b5b7cd2668c6c7..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.norm.weight": "model-00001-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_300/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_300/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_300/tokenizer.json b/distill-qwen-7b_skywork/global_step_300/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_300/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_300/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_300/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_400/.DS_Store b/distill-qwen-7b_skywork/global_step_400/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_400/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_400/config.json b/distill-qwen-7b_skywork/global_step_400/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_400/generation_config.json b/distill-qwen-7b_skywork/global_step_400/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_400/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00001-of-00004.safetensors deleted file mode 100644 index 279b0c4a58835ec992d4165f8db0f6b2467a0e6c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:814ef6a41e9e6d83ca1d2eeb768d0b3c986532f997fad886dc50998f09c5dd91 -size 4940067136 diff --git a/distill-qwen-7b_skywork/global_step_400/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00002-of-00004.safetensors deleted file mode 100644 index a4e90068298bc30e4fc480e1b9df3bd9fac7a5a9..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:845c9aba0c3356c131165f66eb3edbd87427c7a3ade0045623e19a0a4d24f38a -size 4914396672 diff --git a/distill-qwen-7b_skywork/global_step_400/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00003-of-00004.safetensors deleted file mode 100644 index 246b18594a282ac80ae156342e023b776efc306e..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e648078312503667c04931bbf71fb9a2d20ab1559f6e70348d5c5326c223a137 -size 4943731160 diff --git a/distill-qwen-7b_skywork/global_step_400/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_400/model-00004-of-00004.safetensors deleted file mode 100644 index dc81787c288f2b1e74b787506126743c4cb2ed7b..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae37ccb1b23e903e439a283912de61ff1d936fbe61e14b8dd25022a8911460eb -size 433076936 diff --git a/distill-qwen-7b_skywork/global_step_400/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_400/model.safetensors.index.json deleted file mode 100644 index 5a9bdebb2018665328da9c2817fb5161fc52ddd8..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00002-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_400/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_400/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_400/tokenizer.json b/distill-qwen-7b_skywork/global_step_400/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_400/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_400/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_400/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_500/.DS_Store b/distill-qwen-7b_skywork/global_step_500/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_500/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_500/config.json b/distill-qwen-7b_skywork/global_step_500/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_500/generation_config.json b/distill-qwen-7b_skywork/global_step_500/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_500/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00001-of-00004.safetensors deleted file mode 100644 index ce23684f810c68b9ee6c9bfaea3aeb305cb01501..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c2ac1371059d49b908334d99cc3b7484f782166af7a548de708a8736321274d4 -size 4268426296 diff --git a/distill-qwen-7b_skywork/global_step_500/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00002-of-00004.safetensors deleted file mode 100644 index ccc1261432f98958570f98a53d5a096738059f56..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:debd52b643de552189902ccb697b874f7015ecabb6de7b26126e3d1a2a7ff702 -size 4899713232 diff --git a/distill-qwen-7b_skywork/global_step_500/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00003-of-00004.safetensors deleted file mode 100644 index c0dfd3d1cd459c0ce13b72f54572e3640e9869c7..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d43b3ead2373211dbc9585ad968b5f96cc50b0a2ea4762d03ceebad87d337ca -size 4987750648 diff --git a/distill-qwen-7b_skywork/global_step_500/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_500/model-00004-of-00004.safetensors deleted file mode 100644 index b829c8bdf599be939028205c17a679f250c83b35..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18cc612bfdb9f285acff2b2904416cf38376bb6f784bba9fff61814f8460bf39 -size 1075381664 diff --git a/distill-qwen-7b_skywork/global_step_500/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_500/model.safetensors.index.json deleted file mode 100644 index bfa2b2b867acae3b6be5eb21dd9f5e66e7114318..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00004-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_500/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_500/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_500/tokenizer.json b/distill-qwen-7b_skywork/global_step_500/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_500/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_500/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_500/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_600/.DS_Store b/distill-qwen-7b_skywork/global_step_600/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_600/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_600/config.json b/distill-qwen-7b_skywork/global_step_600/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_600/generation_config.json b/distill-qwen-7b_skywork/global_step_600/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_600/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00001-of-00004.safetensors deleted file mode 100644 index 348dd83a8b672f35e0afae903f6aa073900409a2..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb7503bc994b9fedaf63eb2ba3deb62e1640a430dea57a5948e713766b0e5510 -size 4910668944 diff --git a/distill-qwen-7b_skywork/global_step_600/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00002-of-00004.safetensors deleted file mode 100644 index c4a0f43e1e0adb94e48c056b7ce0d72abacf2ec8..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a90641bedd3d3964e03128c17b8b0fc800129f83420338220211361a8a2ae8c3 -size 4995099544 diff --git a/distill-qwen-7b_skywork/global_step_600/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00003-of-00004.safetensors deleted file mode 100644 index c74686b8ff839e0ea5852a6c579ccdc2bd8f7c17..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c1c01e503bdcef9f310997b5f4f3926db1d7ac32759719a6c37803c61241e22 -size 4918114408 diff --git a/distill-qwen-7b_skywork/global_step_600/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_600/model-00004-of-00004.safetensors deleted file mode 100644 index 6b63a73269b18fb5c375468ed20afdf461229305..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c2d7f5d8449c0cfdd76ef1f900a7075f0dd0c48e8a7de0cd443f060bc707535 -size 407388968 diff --git a/distill-qwen-7b_skywork/global_step_600/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_600/model.safetensors.index.json deleted file mode 100644 index 244d7555f61409c89800b4747af8e1dfbd78c2d0..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00001-of-00004.safetensors", - "model.embed_tokens.weight": "model-00001-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00002-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_600/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_600/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_600/tokenizer.json b/distill-qwen-7b_skywork/global_step_600/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_600/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_600/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_600/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_700/.DS_Store b/distill-qwen-7b_skywork/global_step_700/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_700/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_700/config.json b/distill-qwen-7b_skywork/global_step_700/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_700/generation_config.json b/distill-qwen-7b_skywork/global_step_700/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_700/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00001-of-00004.safetensors deleted file mode 100644 index bc2ae04815f1473419dc77655add3f5010be53c9..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6aa4dc81ae9211b7c5776bd1663ae0e18a240a00779fb9c72bad3031fa6df9c -size 4910738848 diff --git a/distill-qwen-7b_skywork/global_step_700/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00002-of-00004.safetensors deleted file mode 100644 index 04f1018050b54840e7f96a26fff0b128236fe601..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccf91c0d59345fa2ce5dde9c1ef74e355af33453d01c8b0f351acccd7eaecd66 -size 4984125784 diff --git a/distill-qwen-7b_skywork/global_step_700/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00003-of-00004.safetensors deleted file mode 100644 index 064d2c911accb11a096b0c434830b80d78c7fd95..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2c4fb6a567b2a1d18783f07a9da1253e620d2fa804f69379987c069b4205f3c -size 4877637832 diff --git a/distill-qwen-7b_skywork/global_step_700/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_700/model-00004-of-00004.safetensors deleted file mode 100644 index a2a8318026b20080cce642090ae6f0125c764a7f..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:08df4f7e2e69350a262998c16d09fa8e11133e8db956c39a7868739601d9ccf6 -size 458769424 diff --git a/distill-qwen-7b_skywork/global_step_700/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_700/model.safetensors.index.json deleted file mode 100644 index b37ccd2ca5e4a0cde013e1ee19cdbf011a1860e4..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00004.safetensors", - "model.embed_tokens.weight": "model-00003-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00001-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_700/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_700/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_700/tokenizer.json b/distill-qwen-7b_skywork/global_step_700/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_700/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_700/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_700/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_800/.DS_Store b/distill-qwen-7b_skywork/global_step_800/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_800/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_800/config.json b/distill-qwen-7b_skywork/global_step_800/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_800/generation_config.json b/distill-qwen-7b_skywork/global_step_800/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_800/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00001-of-00004.safetensors deleted file mode 100644 index fa7865fb14d0ab46999482b50ae2cad74cd97cff..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b483847e4bf48d689a1a3c245723c271e1facc0e90817c9818e9be4e59705524 -size 4991593472 diff --git a/distill-qwen-7b_skywork/global_step_800/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00002-of-00004.safetensors deleted file mode 100644 index 9372ed927aa6cb0886b21a23a78a7cd9f95191b7..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa655b6230697c8b94e8149bef3475bc0c94d89516006b4489a39bf15b01e5df -size 4936351000 diff --git a/distill-qwen-7b_skywork/global_step_800/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00003-of-00004.safetensors deleted file mode 100644 index 438d479a1867d019eeab91c29b4e6b256041c62b..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:940533dfb5627308fa09465e1910b4ad483448a6a8e040d0bf0ec493c0388366 -size 4980358232 diff --git a/distill-qwen-7b_skywork/global_step_800/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_800/model-00004-of-00004.safetensors deleted file mode 100644 index 95ce0a292b1e702102475d1f56dc52d7c3065f00..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:872676364d1e3e15993c94e7f7a83ad990e0b6b6f82170340fc70696fa1b2302 -size 322969184 diff --git a/distill-qwen-7b_skywork/global_step_800/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_800/model.safetensors.index.json deleted file mode 100644 index 29103736d43f322d647cc4ea4b21ffd18c819461..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00003-of-00004.safetensors", - "model.embed_tokens.weight": "model-00002-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.norm.weight": "model-00001-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_800/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_800/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_800/tokenizer.json b/distill-qwen-7b_skywork/global_step_800/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_800/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_800/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_800/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-7b_skywork/global_step_900/.DS_Store b/distill-qwen-7b_skywork/global_step_900/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/distill-qwen-7b_skywork/global_step_900/.DS_Store and /dev/null differ diff --git a/distill-qwen-7b_skywork/global_step_900/config.json b/distill-qwen-7b_skywork/global_step_900/config.json deleted file mode 100644 index c916f6c19247e2671be9787cdad329139537c395..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/distill-qwen-7b_skywork/global_step_900/generation_config.json b/distill-qwen-7b_skywork/global_step_900/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-7b_skywork/global_step_900/model-00001-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00001-of-00004.safetensors deleted file mode 100644 index 18e4bd57b47d2fdc2173e908bf31cafbb7cfe86b..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/model-00001-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7bcbe9e7d4510eb46128efac85da5d09dbbb8a4bf4b4c079305ffba428644dc -size 4958409344 diff --git a/distill-qwen-7b_skywork/global_step_900/model-00002-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00002-of-00004.safetensors deleted file mode 100644 index fe2c913b9f22e83d50bc9affcdff4d5dc6af01da..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/model-00002-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1da9668b8a4bc1431cabb09655d953a24a7a93d4359768ee91cdfe8792a19d92 -size 4907092648 diff --git a/distill-qwen-7b_skywork/global_step_900/model-00003-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00003-of-00004.safetensors deleted file mode 100644 index b741932bc2d459b00eb31ec3e4ec67b09456b96d..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/model-00003-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23d90fdb1a37df14cdf1ba2f54517e887b7898b025a9f3e35ccecb8a1415c47c -size 4004163312 diff --git a/distill-qwen-7b_skywork/global_step_900/model-00004-of-00004.safetensors b/distill-qwen-7b_skywork/global_step_900/model-00004-of-00004.safetensors deleted file mode 100644 index bdb1920ddbad3561a71c77d6855a6d7af27942ac..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/model-00004-of-00004.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e4c4fbac8af0b12b2426f01a6a0d67aad1ee609507b22ce5460309622434c595 -size 1361606560 diff --git a/distill-qwen-7b_skywork/global_step_900/model.safetensors.index.json b/distill-qwen-7b_skywork/global_step_900/model.safetensors.index.json deleted file mode 100644 index da93e31fddf4fd18384f45d8c1b640084d1b4617..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 15231233024 - }, - "weight_map": { - "lm_head.weight": "model-00001-of-00004.safetensors", - "model.embed_tokens.weight": "model-00004-of-00004.safetensors", - "model.layers.0.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.input_layernorm.weight": "model-00004-of-00004.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00004-of-00004.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00004-of-00004.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", - "model.norm.weight": "model-00001-of-00004.safetensors" - } -} diff --git a/distill-qwen-7b_skywork/global_step_900/special_tokens_map.json b/distill-qwen-7b_skywork/global_step_900/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-7b_skywork/global_step_900/tokenizer.json b/distill-qwen-7b_skywork/global_step_900/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-7b_skywork/global_step_900/tokenizer_config.json b/distill-qwen-7b_skywork/global_step_900/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-7b_skywork/global_step_900/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -}