Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

README.md +52 -0
config.json +38 -0
configuration_mimo.py +16 -0
generation_config.json +7 -0
merges.txt +0 -0
model.safetensors +3 -0
modeling_mimo.py +74 -0
tokenizer.json +0 -0
tokenizer_config.json +207 -0
vocab.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,52 @@

+---
+license: mit
+library_name: transformers
+---
+<div align="center">
+  <picture>
+    <source srcset="https://github.com/XiaomiMiMo/MiMo/raw/main/figures/Xiaomi_MiMo_darkmode.png?raw=true" media="(prefers-color-scheme: dark)">
+    <img src="https://github.com/XiaomiMiMo/MiMo/raw/main/figures/Xiaomi_MiMo.png?raw=true" width="60%" alt="Xiaomi-MiMo" />
+  </picture>
+</div>
+<h3 align="center">
+  <b>
+    <span>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━</span>
+    <br/>
+    Unlocking the Reasoning Potential of Language Model<br/>From Pretraining to Posttraining
+    <br/>
+    <span>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━</span>
+    <br/>
+  </b>
+</h3>
+<br/>
+<div align="center" style="line-height: 1;">
+  |
+  <a href="https://huggingface.co/XiaomiMiMo" target="_blank">🤗 HuggingFace</a>
+  &nbsp;|
+  <a href="https://www.modelscope.cn/organization/XiaomiMiMo" target="_blank">🤖️ ModelScope</a>
+  &nbsp;|
+  <a href="https://arxiv.org/abs/2505.07608" target="_blank">📔 Technical Report</a>
+  &nbsp;|
+  <br/>
+</div>
+<br/>
+> This model repository is licensed under the MIT License.
+## I. Pretrained MTPs of MiMo-7B
+This model repository contains the pretrained MTP weights of MiMo-7B (`model.mtp_layers.1` and `model.mtp_layers.2`).
+Currently, each MiMo-7B model has 1 MTP layer (`model.mtp_layers.0`). Users may load the weights of the pretrained MTPs for potential performance gains (please refer to *[Power Up Speculative Decoding In Reinforcement Learning](https://www.notion.so/jiajunli-guapisolo/Power-Up-Speculative-Decoding-In-Reinforcement-Learning-2a92d24a293b802d9c73dbae429e581e)*).
+> [!IMPORTANT]
+> We tuned 1 MTP layer in SFT and froze it in RL, and we **HAVE NOT** tested the performance of post-trained models with the 2 additional pretrained MTP layers.
+## II. Contact
+Please contact us at [mimo@xiaomi.com](mailto:mimo@xiaomi.com) or open an issue if you have any questions.

config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+    "architectures": [
+        "MiMoMTPModel"
+    ],
+    "tokenizer_class":[
+        "Qwen2Tokenizer"
+    ],
+    "auto_map": {
+        "AutoConfig": "configuration_mimo.MiMoConfig",
+        "AutoModel": "modeling_mimo.MiMoMTPModel"
+    },
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 11008,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 36,
+    "model_type": "mimo",
+    "num_attention_heads": 32,
+    "head_dim": 128,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-05,
+    "rope_theta": 640000,
+    "sliding_window": 32768,
+    "tie_word_embeddings": false,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.40.1",
+    "use_cache": true,
+    "use_mrope": false,
+    "use_sliding_window": false,
+    "vocab_size": 151680,
+    "attention_bias": true,
+    "num_nextn_predict_layers": 3
+}

configuration_mimo.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+class MiMoConfig(Qwen2Config):
+    model_type = "mimo"
+    def __init__(
+        self,
+        *args,
+        num_nextn_predict_layers=0,
+        **kwargs
+    ):
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        super().__init__(
+            *args,
+            **kwargs,
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": false,
+  "eos_token_id": 151645,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.37.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac1f4a0260d41f60ddd273118268c89270304de2283848ae831cc4738d34c91f
+size 843165272

modeling_mimo.py ADDED Viewed

	@@ -0,0 +1,74 @@

+from typing import Optional, Tuple
+import torch
+from torch import nn
+from transformers.cache_utils import Cache
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.qwen2.modeling_qwen2 import (Qwen2Attention,
+                                                      Qwen2MLP,
+                                                      Qwen2RMSNorm)
+from .configuration_mimo import MiMoConfig
+class MiMoMTPLayers(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.token_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hidden_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
+        self.final_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.self_attn = Qwen2Attention(config, layer_idx=0)
+        self.mlp = Qwen2MLP(config)
+    def forward(self, input_embeds,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values: Optional[Cache]=None,
+                    output_attentions: Optional[bool]=False,
+                    use_cache: Optional[bool]=False,
+                    position_embedding: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+                    cache_position=None,
+                    **kwargs):
+        input_embeds = self.token_layernorm(input_embeds)
+        previous_hidden_states = self.hidden_layernorm(hidden_states)
+        hidden_states = self.input_proj(torch.cat([previous_hidden_states, input_embeds], dim=-1))
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, _ = self.self_attn(hidden_states,
+                                       attention_mask=attention_mask,
+                                       position_ids=position_ids,
+                                       past_key_values=past_key_values,
+                                       output_attentions=output_attentions,
+                                       use_cache=use_cache,
+                                       cache_position=cache_position,
+                                       position_embedding=position_embedding,
+                                       **kwargs)
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+class MiMoMTPBlock(PreTrainedModel):
+    config_class = MiMoConfig
+    def __init__(self, config: MiMoConfig):
+        super().__init__(config)
+        self.mtp_layers = nn.ModuleList(
+            [nn.Identity()] + \
+            [MiMoMTPLayers(config) for _ in range(config.num_nextn_predict_layers - 1)]
+        )
+class MiMoMTPModel(PreTrainedModel):
+    config_class = MiMoConfig
+    def __init__(self, config: MiMoConfig):
+        super().__init__(config)
+        self.model = MiMoMTPBlock(config)

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- 
'<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff