bwshen-mi committed on Dec 16, 2025

Commit

1cefa2f

verified ·

1 Parent(s): d78b727

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

config.json +213 -0
configuration_mimo_v2_flash.py +109 -0
generation_config.json +7 -0
merges.txt +0 -0
model.safetensors.index.json +0 -0
model_10_linear_fc2.safetensors +3 -0
model_11_linear_fc1-00001-of-00002.safetensors +3 -0
model_11_linear_fc1-00002-of-00002.safetensors +3 -0
model_12_linear_fc1-00001-of-00002.safetensors +3 -0
model_22_linear_fc1-00001-of-00002.safetensors +3 -0
model_26_linear_fc1-00002-of-00002.safetensors +3 -0
model_29_linear_fc2.safetensors +3 -0
model_2_linear_fc1-00001-of-00002.safetensors +3 -0
model_36_linear_fc1-00002-of-00002.safetensors +3 -0
model_38_linear_fc2.safetensors +3 -0
model_3_linear_fc1-00001-of-00002.safetensors +3 -0
model_3_linear_fc1-00002-of-00002.safetensors +3 -0
model_40_linear_fc1-00001-of-00002.safetensors +3 -0
model_40_linear_fc1-00002-of-00002.safetensors +3 -0
model_41.safetensors +3 -0
model_42_linear_fc1-00002-of-00002.safetensors +3 -0
model_42_linear_fc2.safetensors +3 -0
model_43_linear_fc1-00001-of-00002.safetensors +3 -0
model_44_linear_fc1-00001-of-00002.safetensors +3 -0
model_44_linear_fc1-00002-of-00002.safetensors +3 -0
model_45_linear_fc2.safetensors +3 -0
model_46.safetensors +3 -0
model_47.safetensors +3 -0
model_4_linear_fc1-00001-of-00002.safetensors +3 -0
model_4_linear_fc1-00002-of-00002.safetensors +3 -0
model_5.safetensors +3 -0
model_6.safetensors +3 -0
model_6_linear_fc1-00001-of-00002.safetensors +3 -0
model_6_linear_fc1-00002-of-00002.safetensors +3 -0
model_6_linear_fc2.safetensors +3 -0
model_7.safetensors +3 -0
model_7_linear_fc1-00002-of-00002.safetensors +3 -0
model_7_linear_fc2.safetensors +3 -0
model_8.safetensors +3 -0
model_8_linear_fc1-00002-of-00002.safetensors +3 -0
model_8_linear_fc2.safetensors +3 -0
model_9.safetensors +3 -0
model_9_linear_fc1-00002-of-00002.safetensors +3 -0
model_9_linear_fc2.safetensors +3 -0
model_embedding.safetensors +3 -0
model_final.safetensors +3 -0
modeling_mimo_v2_flash.py +664 -0
tokenizer.json +0 -0
tokenizer_config.json +207 -0
vocab.json +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,213 @@

+{
+  "architectures": [
+    "MiMoV2FlashForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_mimo_v2_flash.MiMoV2FlashConfig",
+    "AutoModel": "modeling_mimo_v2_flash.MiMoV2FlashModel",
+    "AutoModelForCausalLM": "modeling_mimo_v2_flash.MiMoV2FlashForCausalLM"
+  },
+  "quantization_config": {
+    "activation_scheme": "dynamic",
+    "fmt": "e4m3",
+    "packed_modules_mapping": {},
+    "quant_method": "fp8",
+    "ignored_layers": [
+      "model.layers.0.self_attn.o_proj",
+      "model.layers.1.self_attn.o_proj",
+      "model.layers.2.self_attn.o_proj",
+      "model.layers.3.self_attn.o_proj",
+      "model.layers.4.self_attn.o_proj",
+      "model.layers.5.self_attn.o_proj",
+      "model.layers.6.self_attn.o_proj",
+      "model.layers.7.self_attn.o_proj",
+      "model.layers.8.self_attn.o_proj",
+      "model.layers.9.self_attn.o_proj",
+      "model.layers.10.self_attn.o_proj",
+      "model.layers.11.self_attn.o_proj",
+      "model.layers.12.self_attn.o_proj",
+      "model.layers.13.self_attn.o_proj",
+      "model.layers.14.self_attn.o_proj",
+      "model.layers.15.self_attn.o_proj",
+      "model.layers.16.self_attn.o_proj",
+      "model.layers.17.self_attn.o_proj",
+      "model.layers.18.self_attn.o_proj",
+      "model.layers.19.self_attn.o_proj",
+      "model.layers.20.self_attn.o_proj",
+      "model.layers.21.self_attn.o_proj",
+      "model.layers.22.self_attn.o_proj",
+      "model.layers.23.self_attn.o_proj",
+      "model.layers.24.self_attn.o_proj",
+      "model.layers.25.self_attn.o_proj",
+      "model.layers.26.self_attn.o_proj",
+      "model.layers.27.self_attn.o_proj",
+      "model.layers.28.self_attn.o_proj",
+      "model.layers.29.self_attn.o_proj",
+      "model.layers.30.self_attn.o_proj",
+      "model.layers.31.self_attn.o_proj",
+      "model.layers.32.self_attn.o_proj",
+      "model.layers.33.self_attn.o_proj",
+      "model.layers.34.self_attn.o_proj",
+      "model.layers.35.self_attn.o_proj",
+      "model.layers.36.self_attn.o_proj",
+      "model.layers.37.self_attn.o_proj",
+      "model.layers.38.self_attn.o_proj",
+      "model.layers.39.self_attn.o_proj",
+      "model.layers.40.self_attn.o_proj",
+      "model.layers.41.self_attn.o_proj",
+      "model.layers.42.self_attn.o_proj",
+      "model.layers.43.self_attn.o_proj",
+      "model.layers.44.self_attn.o_proj",
+      "model.layers.45.self_attn.o_proj",
+      "model.layers.46.self_attn.o_proj",
+      "model.layers.47.self_attn.o_proj",
+      "model.decoder.self_attn.o_proj"
+    ],
+    "weight_block_size": [
+      128,
+      128
+    ]
+  },
+  "attention_dropout": 0.0,
+  "attention_value_scale": 0.707,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 16384,
+  "max_position_embeddings": 262144,
+  "model_type": "mimo_v2_flash",
+  "num_attention_heads": 64,
+  "head_dim": 192,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 4,
+  "layernorm_epsilon": 1e-05,
+  "rope_theta": 5000000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.1",
+  "use_cache": true,
+  "vocab_size": 152576,
+  "partial_rotary_factor": 0.334,
+  "sliding_window": 128,
+  "swa_rope_theta": 10000,
+  "attention_bias": false,
+  "v_head_dim": 128,
+  "hybrid_layer_pattern": [
+    0,
+    1,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    0
+  ],
+  "add_swa_attention_sink_bias": true,
+  "add_full_attention_sink_bias": false,
+  "sliding_window_size": 128,
+  "attention_chunk_size": 128,
+  "moe_layer_freq": [
+    0,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1
+  ],
+  "moe_intermediate_size": 2048,
+  "n_routed_experts": 256,
+  "n_shared_experts": null,
+  "num_experts_per_tok": 8,
+  "norm_topk_prob": true,
+  "scoring_func": "sigmoid",
+  "n_group": 1,
+  "topk_group": 1,
+  "topk_method": "noaux_tc",
+  "routed_scaling_factor": null,
+  "swa_num_attention_heads": 64,
+  "swa_num_key_value_heads": 8,
+  "swa_head_dim": 192,
+  "swa_v_head_dim": 128
+}

configuration_mimo_v2_flash.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# coding=utf-8
+#
+# Copyright 2025 Xiaomi Corporation.
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class MiMoV2FlashConfig(PretrainedConfig):
+    model_type = ""
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Hybrid`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    attribute_map = {
+        "num_local_experts": "n_routed_experts",
+    }
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        layernorm_epsilon=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_dropout=0.0,
+        hybrid_block_size=None,
+        hybrid_layer_pattern=None,
+        partial_rotary_factor=1.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layernorm_epsilon = layernorm_epsilon
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        if hybrid_block_size is not None and hybrid_layer_pattern is None:
+            hybrid_layer_pattern = [0 if ((i + 1) % hybrid_block_size == 0) else 1 for i in range(num_hidden_layers)]
+        self.hybrid_block_size = hybrid_block_size
+        self.hybrid_layer_pattern = hybrid_layer_pattern
+        self.partial_rotary_factor = partial_rotary_factor
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": false,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.37.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model_10_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c86c4c2b77ffbae6bef2fc99864c5e3b295eac488c4464424b73a203475d8d8e
+size 2148072376

model_11_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ee8ca7cb9f2a6f7ffdfba1ae49cf62ed1e6ba06301ce9bf9db2165e6184384e
+size 2500489784

model_11_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f94dbf87c03eba4e348313d8fb0111d306ecfe592e827b81a375d457a49a29e0
+size 1795653952

model_12_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5748a678e39f314baf1ab6dda46eaad76fb732f4fd30f95a947e5ca11b59e93b
+size 2500489784

model_22_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2098e5b2387711b3448a7cad0f0eab6de2a6a448f09e4cf78b3d06f0e02a9e36
+size 2500489784

model_26_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:10f39076f966fd82459df03508fd0dccb1e00b11174943f2920f12cb9a5ad968
+size 1795653952

model_29_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14c00e7dc128fa80c070c33ba04f54f38da51694f14ffe1debd55a9e6ea46813
+size 2148072376

model_2_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2bd21d87beedc0ffe07aa0d1e5c68c24cb8b157bc4e9da9431c054cae5c3ae0
+size 2500489184

model_36_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3eccb80354a473eccd455f02f6dc407c3c5945f185c96bea860a3b35ad322ba0
+size 1795653952

model_38_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6a48be8d516030e63a0bfb5d34f848e7c6bd3db9b559f81ae0f129e67da0c26
+size 2148072376

model_3_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3ac9175ba9cdc60bf7e1dd0784b5370ea4cf0f421f70ed6daed70a55f335998
+size 2500489184

model_3_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87763c74c1cad9e2a7bffbd73814b3410b32d0008910ff26626c545fffa46960
+size 1795653520

model_40_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a6d5c7a278b685239de081e35c45ffe74db6dc70533919a6b4b3e040f42b7a7
+size 2500489784

model_40_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:342f36afd9afdedcd5c1d52449d4dab6717ad4cf378d4611f87351d93f0a3110
+size 1795653952

model_41.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11f9f838453744e4dbbe94c587abf0cde483f987b4b5e886037ab5102e8c3be7
+size 126910184

model_42_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4bfa9e7d36f996cc86d16513d1402b334815a68a14950ae7b7d74635d93aced
+size 1795653952

model_42_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3eb2b03839fad606cab77af35204bcd92e3f346f4207dc44d04afad4ed6bf65b
+size 2148072376

model_43_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2fa36c60f050d296984b6696e0877d9054019c2062065115ce3845af0189f7f
+size 2500489784

model_44_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:931cfb406ae5959f38e4ac48fde9981f4807c64a888ab0432ec715b1bbe6faec
+size 2500489784

model_44_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5e84fb450991b3a80214a80f8d8178cb541bb945bbffc33cb4a2363d72a52c8
+size 1795653952

model_45_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1805e4735b2a3d6c784d3b8d84ba2dcfe9e8d69bf2a9f1e9457466a3fe49f69
+size 2148072376

model_46.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b21b85b45a46bc5794884b83e849e04641b65374d24005e99e329d92309e71a
+size 132154328

model_47.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c87154c960a93aa9671df4f43c5c29805c1675aed77b9b58624942a5ae7a5e70
+size 126910184

model_4_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47f55b72a29f9b6540073f7abe5f62924e587f9f28beb789abbac0ca994cb91a
+size 2500489184

model_4_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27dfa25ee9ea11459615e4b2e868e688fd8586bf860aff6608e21d0d4ad6ee91
+size 1795653520

model_5.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3daf97ed1ce75c0791133080f303be6eba57440ad05b912b3495dc6bd3f7941c
+size 126910168

model_6.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:656fda23fd3175c1837dec5dcc1439dcc04f5b26be5f90f470ebe90cd02b01d0
+size 132154312

model_6_linear_fc1-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e9473226ad20a7fc3ebac8c56115dc92feeb3bcc271550733bd88addb132621
+size 2500489184

model_6_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eecb9becd85b8996ccf5e9e683faea1a49b3d2337aabc47442a015ce038fd9cf
+size 1795653520

model_6_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1a47f1f6789763449e7aa906e385624494258e33ddcfd72d7e1f68ec3ea49c0
+size 2148071864

model_7.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:216079e8c45ece3d2470e08e3bb12eb8ef417d996f5cae6be9686ef6550ba15e
+size 132154312

model_7_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:621a7bc26963664a6c15adfff7c194e3a04855f3b9dd6baaffa92e7875a63c7b
+size 1795653520

model_7_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9adcd21128b0f5dd1cf520b3852e99782e1d369174b6a2df871f2ae8fdfde366
+size 2148071864

model_8.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6881f17d7ae28a576485d5e6ec3cf8aff63fe643aa05774a2c269f43ce32fc04
+size 132154312

model_8_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de18140dc3eafc13b4edb9db81e325630b89501c55928b365a64911ef0501dde
+size 1795653520

model_8_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a9ac0504e80a5a44cd470ce07a6a59a0f4483340ad047072fa26543d4e6e1ed
+size 2148071864

model_9.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:964cad799947fbb467a4b35f90f3a155c1aaac69be4f0f8ea1deebd80826ea07
+size 132154312

model_9_linear_fc1-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d757e75acea1ad74d120917ada05052b35239f26ce3323dac2347c0759f15ef
+size 1795653520

model_9_linear_fc2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e9cc7caf5643a98d68a44ddc35506d8526651a4ed866d97d65d0fd07430efbf
+size 2148071864

model_embedding.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02a1bdc6589dff3bbe5ef9e2a77af02739c105c1e4d94ccc850b3e115b6e9e54
+size 1249902704

model_final.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1801cb96017ff809b18b896b8b2625336d9bf0f5f2937f200b1db4fa0626a7cd
+size 1249910976

modeling_mimo_v2_flash.py ADDED Viewed

	@@ -0,0 +1,664 @@

+# coding=utf-8
+#
+# Copyright 2025 Xiaomi Corporation.
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.generation import GenerationMixin
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import (
+    logging,
+)
+from transformers.modeling_outputs import MoeModelOutputWithPast
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
+from .configuration_mimo_v2_flash import MiMoV2FlashConfig
+logger = logging.get_logger(__name__)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    sinks: Optional[torch.Tensor] = None,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    if sinks is not None:
+        sinks = module.attention_sink_bias.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
+        attn_weights = torch.cat([attn_weights, sinks], dim=-1)
+    attn_weights = attn_weights - attn_weights.max(dim=-1, keepdim=True).values
+    probs = F.softmax(attn_weights, dim=-1, dtype=attn_weights.dtype)
+    if sinks is not None:
+        probs = probs[..., :-1]  # we drop the sink here
+    attn_weights = nn.functional.dropout(probs, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+@use_kernel_forward_from_hub("RMSNorm")
+class MiMoV2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+            MiMoV2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+class MiMoV2MLP(nn.Module):
+    """MiMoV2MLP matching the gate, up, and down projection layers."""
+    def __init__(self, config: MiMoV2FlashConfig, intermediate_size=None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, hidden_states):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
+        return down_proj
+class MiMoV2MoEGate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = (
+            config.routed_scaling_factor
+            if config.routed_scaling_factor is not None
+            else 1.0
+        )
+        self.scoring_func = config.scoring_func
+        self.topk_method = config.topk_method
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+        # topk selection algorithm
+        self.norm_topk_prob = config.norm_topk_prob
+        self.gating_dim = config.hidden_size
+        self.weight = nn.Parameter(
+            torch.empty((self.n_routed_experts, self.gating_dim))
+        )
+        if self.topk_method == "noaux_tc":
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty((self.n_routed_experts))
+            )
+    def forward(self, hidden_states):
+        bsz, seq_len, h = hidden_states.shape
+        ### compute gating score
+        hidden_states = hidden_states.view(-1, h)
+        logits = F.linear(
+            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
+        )
+        if self.scoring_func == "sigmoid":
+            scores = logits.sigmoid()
+        else:
+            raise NotImplementedError(
+                f"insupportable scoring function for MoE gating: {self.scoring_func}"
+            )
+        ### select top-k experts
+        if self.topk_method == "noaux_tc":
+            assert not self.training
+            scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
+            group_scores = (
+                scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
+            )  # [n, n_group]
+            group_idx = torch.topk(
+                group_scores, k=self.topk_group, dim=-1, sorted=False
+            )[
+                1
+            ]  # [n, top_k_group]
+            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+            score_mask = (
+                group_mask.unsqueeze(-1)
+                .expand(
+                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
+                )
+                .reshape(bsz * seq_len, -1)
+            )  # [n, e]
+            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
+            _, topk_idx = torch.topk(
+                tmp_scores, k=self.top_k, dim=-1, sorted=False
+            )
+            topk_weight = scores.gather(1, topk_idx)
+        else:
+            raise NotImplementedError(
+                f"insupportable TopK function for MoE gating: {self.topk_method}"
+            )
+        ### norm gate to sum 1
+        if self.top_k > 1 and self.norm_topk_prob:
+            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weight = topk_weight / denominator
+        topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
+        return topk_idx, topk_weight
+class MiMoV2MoE(nn.Module):
+    """
+    A mixed expert module containing shared experts.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList(
+            [
+                MiMoV2MLP(config, intermediate_size=config.moe_intermediate_size)
+                for _ in range(config.n_routed_experts)
+            ]
+        )
+        self.gate = MiMoV2MoEGate(config)
+    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
+        r"""
+        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
+        to not have to do a loop here (deepseek has 256 experts soooo yeah).
+        """
+        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
+        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
+        expert_mask = expert_mask.permute(2, 0, 1)
+        for expert_idx in range(len(self.experts)):
+            expert = self.experts[expert_idx]
+            mask = expert_mask[expert_idx]
+            token_indices, weight_indices = torch.where(mask)
+            if token_indices.numel() > 0:
+                expert_weights = topk_weights[token_indices, weight_indices]
+                expert_input = hidden_states[token_indices]
+                expert_output = expert(expert_input)
+                weighted_output = expert_output * expert_weights.unsqueeze(-1)
+                final_hidden_states.index_add_(0, token_indices, weighted_output)
+        # in original deepseek, the output of the experts are gathered once we leave this module
+        # thus the moe module is itelsf an IsolatedParallel module
+        # and all expert are "local" meaning we shard but we don't gather
+        return final_hidden_states.type(hidden_states.dtype)
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        orig_shape = hidden_states.shape
+        topk_indices, topk_weights = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
+        return hidden_states
+class MiMoV2Attention(nn.Module):
+    """MiMoV2 Global Attention (pattern == 0) and Sliding Window Attention (pattern == 1)."""
+    def __init__(self, config: MiMoV2FlashConfig, is_swa: bool, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if is_swa:
+            self.head_dim = config.swa_head_dim
+            self.v_head_dim = config.swa_v_head_dim
+            self.num_attention_heads = config.swa_num_attention_heads
+            self.num_key_value_heads = config.swa_num_key_value_heads
+        else:
+            self.head_dim = config.head_dim
+            self.v_head_dim = config.v_head_dim
+            self.num_attention_heads = config.num_attention_heads
+            self.num_key_value_heads = config.num_key_value_heads
+        self.rope_dim = int(self.head_dim * config.partial_rotary_factor)
+        self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
+        self.attention_bias = config.attention_bias
+        self.attention_dropout: float = config.attention_dropout
+        self.scaling = self.head_dim ** -0.5
+        # These dimensions are for the attention layers
+        q_hidden_size = self.num_attention_heads * self.head_dim
+        k_hidden_size = self.num_key_value_heads * self.head_dim
+        v_hidden_size = self.num_key_value_heads * self.v_head_dim
+        o_hidden_size = self.num_attention_heads * self.v_head_dim
+        self.q_proj = nn.Linear(config.hidden_size, q_hidden_size, bias=self.attention_bias)
+        self.k_proj = nn.Linear(config.hidden_size, k_hidden_size, bias=self.attention_bias)
+        self.v_proj = nn.Linear(config.hidden_size, v_hidden_size, bias=self.attention_bias)
+        self.o_proj = nn.Linear(o_hidden_size, config.hidden_size, bias=False)
+        self.attention_sink_bias = (
+            torch.nn.Parameter(torch.empty(config.num_attention_heads), requires_grad=False)
+            if (config.add_full_attention_sink_bias and not is_swa) or (config.add_swa_attention_sink_bias and is_swa)
+            else None
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        qk_hidden_shape = (*input_shape, -1, self.head_dim)
+        v_hidden_shape = (*input_shape, -1, self.v_head_dim)
+        query_states = self.q_proj(hidden_states).view(qk_hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(qk_hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(v_hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_rope, query_nope = query_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
+        key_rope, key_nope = key_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
+        query_rope, key_rope = apply_rotary_pos_emb(query_rope, key_rope, cos, sin)
+        query_states = torch.cat([query_rope, query_nope], dim=-1)
+        key_states = torch.cat([key_rope, key_nope], dim=-1)
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            position_ids=position_ids,
+            sinks=self.attention_sink_bias,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class MiMoV2DecoderLayer(nn.Module):
+    """
+    MiMoV2 Decoder Layer. It dynamically chooses the correct attention
+    module based on the layer index and the `hybrid_layer_pattern`.
+    """
+    def __init__(self, config: MiMoV2FlashConfig, layer_idx: int):
+        super().__init__()
+        # This is the key logic: choose the module based on the pattern
+        is_swa_layer = config.hybrid_layer_pattern[layer_idx] == 1
+        if is_swa_layer:
+            self.attention_type = "sliding_window_attention"
+            self.self_attn = MiMoV2Attention(config, True, layer_idx)
+        else:
+            self.attention_type = "full_attention"
+            self.self_attn = MiMoV2Attention(config, False, layer_idx)
+        self.mlp = (
+            MiMoV2MoE(config)
+            if (
+                    getattr(config, 'n_routed_experts', None) is not None
+                    and config.moe_layer_freq[layer_idx]
+            )
+            else MiMoV2MLP(config)
+        )
+        self.input_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_attention_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.hidden_size = config.hidden_size
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+        # MLP or MOE
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+class MiMoV2FlashRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: MiMoV2FlashConfig, is_swa, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        if is_swa:
+            self.config.rope_theta = config.swa_rope_theta
+            self.config.head_dim = config.swa_head_dim
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+@auto_docstring
+class MiMoV2Model(PreTrainedModel):
+    """The main 'model' block, corresponding to `model.` in the weight map."""
+    config_class = MiMoV2FlashConfig
+    def __init__(self, config: MiMoV2FlashConfig):
+        super().__init__(config)
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [MiMoV2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.rotary_emb = MiMoV2FlashRotaryEmbedding(config=config, is_swa=False)
+        self.swa_rotary_emb = MiMoV2FlashRotaryEmbedding(config=config, is_swa=True)
+        self.has_sliding_layers = any(
+            pattern == 1 for pattern in config.hybrid_layer_pattern
+        )
+        # For Huggingface DynamicCache compatibility
+        self.config.layer_types = [
+            "sliding_attention" if config.hybrid_layer_pattern[i] == 1 else "full_attention"
+            for i in range(config.num_hidden_layers)
+        ]
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+            }
+            # The sliding window alternating layers are not always activated depending on the config
+            if self.has_sliding_layers:
+                causal_mask_mapping["sliding_window_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        swa_position_embeddings = self.swa_rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_embeddings=(
+                    position_embeddings
+                    if decoder_layer.attention_type == "full_attention"
+                    else swa_position_embeddings
+                ),
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+@auto_docstring
+class MiMoV2FlashForCausalLM(PreTrainedModel,GenerationMixin):
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config_class = MiMoV2FlashConfig
+    _keys_to_ignore_on_load_unexpected = [r"model.layers\.\d+\.self_attn\.rotary_emb\.inv_freq"]
+    def __init__(self, config: MiMoV2FlashConfig):
+        super().__init__(config)
+        self.model = MiMoV2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+__all__ = [
+    "MiMoV2FlashForCausalLM"
+]

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- 
'<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 262144,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff