diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 22a3317c8c4482a6944d8b87262ab3d191f9a403..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:271c61d9d67897ac41908b4fe3ac455fb1512774f1f8ffce2bb4ed7021afe4e0 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index 6b1a5b6dd89f80ff76c09c5e4b441f7d711726b9..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae0ee3e5f6796d6698d2e567edd7afdfb51693a1deb01bb94aa39d8b64b82d1a -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_100/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index c9f4741d260ff246359f40ece67bae5f55637294..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e2a49ff0420335a48a1e651e35436a4e519e181ba522e810c9a1ca6ece9f1e0 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index c6f4a8188f112b7f05cc786f48420ce2ec46d80d..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f906d13758bd7c5df50f7c12a009f9b82d515c9b961186cc68402b0bc5ec966e -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1000/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index c2d3cdb8cd3d8a38b5667bfc64aa115f47434fee..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ec7e4908ca94b549c86603a156b7f2e30c8bd8114bc2904693925c325f57dfe -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index 1fea773282b0c3f68a8197cfb15b927d66301dcb..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1de182d371ce3581898995a92d963095195cd69dc16d2daabc4f10087b35d805 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1100/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 2baa25bd4442136da16e12a91d25e88d7d7557c5..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cf939a6b4a538b0441a6729d04453cc18b2746b3a8162e353a2f9bb9dd56524 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index 41a8af8f3bcefd6dddfd2c6a670088cb6c7dc9cd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01c1aa16def2900d7ba3e5e3f2b25ba1c660e249b78e616f49a63a3e2aa58fca -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1200/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 5776b005f801a228fe3ed352e527f4b54c0c519e..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f4400bcaac2fdc5bc1c7416ae0b0b51f39bec1be4f3b7263e3ed00e1bcff1c9 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index 296de0fa8bb5fc6c1887430a19c0e0f9261ed585..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4bb18a150bf4404cf129fc4998e6615ef9b67a94fe854e9ec9d5f46d86fc4f6b -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1300/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index a94c9c6579afc7de3c70bfcef79da0a073d35cd9..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d46621c4221f61c32863406e3fb5e987aae71e2e47fb12d82bfbfc889f632cfe -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index ed0ad742a81e9b22da94e2162b2e9bcc4d962169..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c3f9b3a33af39dcf59dedfbdb4e2227e5eea050bee8e7320d22139aa3fd2c974 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_1400/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 5171f2ac6d132d361bf70a28870a7d34c465b4a4..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d2e6dd5e965d2244c63e96dbb12182379ca4d73d61bd555abb905c5a04e81d8 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index ac43891cfcbb862a42c22629393c387817d91084..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de91311b0117c597ef2a0268f4f3e4df0850f239fbb2d3321282cb5a81216c3c -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_200/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index de30a597926d79d7effe424480e7369eb884ee74..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c498324ea22efac609f194fd243d46c3d30e2f60c6ed91b481662cafe40f3990 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index ae7509c3adec0cd2961cc1768061dc384b604636..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6ada05a264d966471641f7df62bab81c5dc399f496ee587bbdd1970721411c4 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_300/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 7910c5cbf13aaab09c2009c0e4a8161606316fb8..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d79581cfb4105212021d35e3a35e7fde5f60ff0d0e5973361036b5726bd0c19 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index b1194b3d0ab3aabc2b507231c3f654d4207fb014..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa289ad30f450617e40eae86d1ac733156ba9f5d383b805a749cb37202b8c3d2 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_400/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 0833a0ad53901372e0d2c0bfcad675a4d6e42e6f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db47578d4085aadd4e2f8a4c267205b33a744d186b08028c1c8abe85bd8c31f6 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index 8247d9bf13393e763ffd8af3ef38800491c4a96b..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:035d3d7c5f675d00e2fb589b56c0956aa52103fef4a40060fef1bc422f7b9273 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_500/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index c62a46f7cff1bc5fec7b826d64a93da68f90a2e8..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:adcc36285ad2ed88db22f6421034b4985a7b11d7bd2447a9679c572c8d95bf0e -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index bfbc8af6ad62265111f7c73aa3bd45d4f069f66b..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d2b7ca4825e56bebe3533b80c006605a61721c1716357fa9923ebfc7a39e338 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_600/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 916d3a040e6aa6ead5fae6fe541d7e657cfd83e1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d956a4cf85feb11ead6f24e148fb798be1bb05d7b79a44e13b4f7642ec82508 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index 2153c03b99fd4c5858e7eddd523cda87adb806ac..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:74bcdcee5616240d4fc34ad5fb18c8a030ace87d7e7081119112d24fc3c6bd89 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_700/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index ec12d1c1d58833d135ad3ccbec71415338c9f482..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2286330e4ea29661ded799384d336e8a66a76f4c3ab0bf20f0acbd2546b91e20 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index 4c1b42ba6175708d83f085e92c7c3c0ef4f24c1b..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:267555f60e1baf5e4bf40c357a9dedbd028885c2e746b0a133cf6b7986325f6d -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_800/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -} diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/config.json b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/config.json deleted file mode 100644 index 32f3f2919a6c81f43402807e3ff76ec75346825f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151646, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "pad_token_id": 151643, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/generation_config.json b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/generation_config.json deleted file mode 100644 index 92878bd36a6f22c0ad39d3eecd6839be7eeab4ab..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/generation_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 151646, - "do_sample": true, - "eos_token_id": 151643, - "temperature": 0.6, - "top_p": 0.95, - "transformers_version": "4.51.3" -} diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model-00001-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model-00001-of-00002.safetensors deleted file mode 100644 index 845b02e92355789e36fd8869e1098856053d8b53..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model-00001-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ed4dd0832a61ee063cb4d0a67b702e97b0503c29818c2b3877db67660e19b12 -size 4996670464 diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model-00002-of-00002.safetensors b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model-00002-of-00002.safetensors deleted file mode 100644 index caf6ebd6aaa0f9b484a1f7e9e95ba44ac21afa9f..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model-00002-of-00002.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c9e33fadf302429c64ce6ca26bbd17b63d3dbd23bdf8652002ed236d4496472 -size 2111719976 diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model.safetensors.index.json b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model.safetensors.index.json deleted file mode 100644 index 2277217d5e56a0aebb2b5b4bb5f330ada8c3ae1a..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/model.safetensors.index.json +++ /dev/null @@ -1,346 +0,0 @@ -{ - "metadata": { - "total_size": 7108352000 - }, - "weight_map": { - "lm_head.weight": "model-00002-of-00002.safetensors", - "model.embed_tokens.weight": "model-00001-of-00002.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", - "model.norm.weight": "model-00002-of-00002.safetensors" - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/special_tokens_map.json b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/special_tokens_map.json deleted file mode 100644 index 1d385d62cf08bca35254547902b792c243656ec1..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/special_tokens_map.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "bos_token": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/tokenizer.json b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/tokenizer.json deleted file mode 100644 index 1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 -size 11422778 diff --git a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/tokenizer_config.json b/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/tokenizer_config.json deleted file mode 100644 index ef6e98c3e0446cad00c5e6fb6bf2f5bbaf2eb0bd..0000000000000000000000000000000000000000 --- a/distill-qwen-1-5b_gspo/global_step_900/actor/huggingface/tokenizer_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "add_bos_token": true, - "add_eos_token": false, - "add_prefix_space": null, - "added_tokens_decoder": { - "151643": { - "content": "<|end▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151644": { - "content": "<|User|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151645": { - "content": "<|Assistant|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151646": { - "content": "<|begin▁of▁sentence|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151647": { - "content": "<|EOT|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151648": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151649": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151650": { - "content": "<|quad_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151651": { - "content": "<|quad_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151652": { - "content": "<|vision_start|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151653": { - "content": "<|vision_end|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151654": { - "content": "<|vision_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151655": { - "content": "<|image_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151656": { - "content": "<|video_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "151657": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151658": { - "content": "", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151659": { - "content": "<|fim_prefix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151660": { - "content": "<|fim_middle|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151661": { - "content": "<|fim_suffix|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151662": { - "content": "<|fim_pad|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151663": { - "content": "<|repo_name|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - }, - "151664": { - "content": "<|file_sep|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", - "clean_up_tokenization_spaces": false, - "eos_token": "<|end▁of▁sentence|>", - "extra_special_tokens": {}, - "legacy": true, - "model_max_length": 16384, - "pad_token": "<|end▁of▁sentence|>", - "sp_model_kwargs": {}, - "tokenizer_class": "LlamaTokenizerFast", - "unk_token": null, - "use_default_system_prompt": false -}