minor

Files changed (4) hide show

README.md +4 -4
config.json +4 -4
configuration_supersparsemixtral.py → configuration_turbosparsemixtral.py +2 -2
modeling_supersparsemixtral.py → modeling_turbosparsemixtral.py +33 -33

README.md CHANGED Viewed

@@ -4,8 +4,8 @@ language:
 - en
 ---
-# Model Card for SuperSparse-Mixtral
-The SuperSparse-Mixtral Large Language Model (LLM) is an sparsified version of the Mixtral.
 <img src="takeaway.png" alt="avatar" width="300" height="200"/>
@@ -13,7 +13,7 @@ The average performance is evaluated using benchmarks from the OpenLLM Leaderboa
 ## Inference
-Our code for accelerating SuperSparse-Mixtral is currently being refined. Stay tuned! Now you can run this model like dense model.
 ## Chat-Template
@@ -25,7 +25,7 @@ We take ChatML as our chat template:
 ## Allow Finetuning
-As we merged the predictors for FFN neurons in models, you can finetune SuperSparse-Mixtral with any framework and algorithm.
 ## License

 - en
 ---
+# Model Card for TurboSparse-Mixtral
+The TurboSparse-Mixtral Large Language Model (LLM) is a sparsified version of Mixtral.
 <img src="takeaway.png" alt="avatar" width="300" height="200"/>
 ## Inference
+Our code for accelerating TurboSparse-Mixtral is currently being refined. Stay tuned! For now, you can run this model like a dense model.
 ## Chat-Template
 ## Allow Finetuning
+Because the predictors for FFN neurons are merged into the model, you can finetune TurboSparse-Mixtral with any framework and algorithm.
 ## License

config.json CHANGED Viewed

@@ -3,9 +3,9 @@
     "TurboSparseMixtralForCausalLM"
   ],
   "auto_map": {
-      "AutoConfig": "configuration_supersparsemixtral.SuperSparseMixtralConfig",
-      "AutoModel": "modeling_supersparsemixtral.SuperSparseMixtralForCausalLM",
-      "AutoModelForCausalLM": "modeling_supersparsemixtral.SuperSparseMixtralForCausalLM"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 1,
@@ -15,7 +15,7 @@
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
-  "model_type": "trubosparsemixtral",
   "num_attention_heads": 32,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 32,

     "TurboSparseMixtralForCausalLM"
   ],
   "auto_map": {
+      "AutoConfig": "configuration_turbosparsemixtral.TurboSparseMixtralConfig",
+      "AutoModel": "modeling_turbosparsemixtral.TurboSparseMixtralForCausalLM",
+      "AutoModelForCausalLM": "modeling_turbosparsemixtral.TurboSparseMixtralForCausalLM"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 1,
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
+  "model_type": "turbosparsemixtral",
   "num_attention_heads": 32,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 32,

configuration_supersparsemixtral.py → configuration_turbosparsemixtral.py RENAMED Viewed

@@ -22,7 +22,7 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
-class SuperSparseMixtralConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MixtralModel`]. It is used to instantiate an
     Mixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -106,7 +106,7 @@ class SuperSparseMixtralConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
-    model_type = "mixtral"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(

 logger = logging.get_logger(__name__)
+class TurboSparseMixtralConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`MixtralModel`]. It is used to instantiate an
     Mixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
     >>> configuration = model.config
     ```"""
+    model_type = "turbosparsemixtral"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(

modeling_supersparsemixtral.py → modeling_turbosparsemixtral.py RENAMED Viewed

@@ -54,7 +54,7 @@ from transformers.utils import (
     replace_return_docstrings,
     is_torch_fx_available,
 )
-from .configuration_supersparsemixtral import SuperSparseMixtralConfig
 @dataclass
 class AttentionMaskConverter:
     """
@@ -634,7 +634,7 @@ def _get_unpad_data(attention_mask):
 # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mixtral
-class SuperSparseMixtralRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
         MixtralRMSNorm is equivalent to T5LayerNorm
@@ -653,7 +653,7 @@ class SuperSparseMixtralRMSNorm(nn.Module):
 # copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
-class SuperSparseMixtralRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
@@ -742,13 +742,13 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 # copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
-class SuperSparseMixtralAttention(nn.Module):
     """
     Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
     and "Generating Long Sequences with Sparse Transformers".
     """
-    def __init__(self, config: SuperSparseMixtralConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -779,7 +779,7 @@ class SuperSparseMixtralAttention(nn.Module):
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-        self.rotary_emb = SuperSparseMixtralRotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
@@ -867,7 +867,7 @@ class SuperSparseMixtralAttention(nn.Module):
 # copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
-class SuperSparseMixtralFlashAttention2(SuperSparseMixtralAttention):
     """
     Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
@@ -1154,7 +1154,7 @@ class SuperSparseMixtralFlashAttention2(SuperSparseMixtralAttention):
 # copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
-class SuperSparseMixtralSdpaAttention(SuperSparseMixtralAttention):
     """
     Mixtral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
     `MixtralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
@@ -1246,9 +1246,9 @@ class SuperSparseMixtralSdpaAttention(SuperSparseMixtralAttention):
 MIXTRAL_ATTENTION_CLASSES = {
-    "eager": SuperSparseMixtralAttention,
-    "flash_attention_2": SuperSparseMixtralFlashAttention2,
-    "sdpa": SuperSparseMixtralSdpaAttention,
 }
 class MLP(nn.Module):
@@ -1264,8 +1264,8 @@ class MLP(nn.Module):
         x = self.fc2(x)
         x = x.sigmoid()
         return x
-class SuperSparseMixtralBlockSparseTop2MLP(nn.Module):
-    def __init__(self, config: SuperSparseMixtralConfig, layer_id):
         super().__init__()
         self.ffn_dim = config.intermediate_size
         self.hidden_dim = config.hidden_size
@@ -1288,7 +1288,7 @@ class SuperSparseMixtralBlockSparseTop2MLP(nn.Module):
         return current_hidden_states
-class SuperSparseMixtralSparseMoeBlock(nn.Module):
     """
     This implementation is
     strictly equivalent to standard MoE with full capacity (no
@@ -1310,7 +1310,7 @@ class SuperSparseMixtralSparseMoeBlock(nn.Module):
         # gating
         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
-        self.experts = nn.ModuleList([SuperSparseMixtralBlockSparseTop2MLP(config, layer_id) for _ in range(self.num_experts)])
         # Jitter parameters
         self.jitter_noise = config.router_jitter_noise
@@ -1356,16 +1356,16 @@ class SuperSparseMixtralSparseMoeBlock(nn.Module):
         return final_hidden_states, router_logits
-class SuperSparseMixtralDecoderLayer(nn.Module):
-    def __init__(self, config: SuperSparseMixtralConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.self_attn = MIXTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-        self.block_sparse_moe = SuperSparseMixtralSparseMoeBlock(config, layer_idx)
-        self.input_layernorm = SuperSparseMixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = SuperSparseMixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
         self,
@@ -1451,11 +1451,11 @@ MIXTRAL_START_DOCSTRING = r"""
     MIXTRAL_START_DOCSTRING,
 )
 # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel with Qwen2->Mixtral
-class SuperSparseMixtralPreTrainedModel(PreTrainedModel):
-    config_class = SuperSparseMixtralConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["SuperSparseMixtralDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -1546,7 +1546,7 @@ MIXTRAL_INPUTS_DOCSTRING = r"""
 )
 # copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
-class SuperSparseMixtralModel(SuperSparseMixtralPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]
@@ -1554,17 +1554,17 @@ class SuperSparseMixtralModel(SuperSparseMixtralPreTrainedModel):
         config: MixtralConfig
     """
-    def __init__(self, config: SuperSparseMixtralConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [SuperSparseMixtralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
-        self.norm = SuperSparseMixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1741,12 +1741,12 @@ class SuperSparseMixtralModel(SuperSparseMixtralPreTrainedModel):
         )
-class SuperSparseMixtralForCausalLM(SuperSparseMixtralPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
     def __init__(self, config):
         super().__init__(config)
-        self.model = SuperSparseMixtralModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.router_aux_loss_coef = config.router_aux_loss_coef
@@ -1974,11 +1974,11 @@ class SuperSparseMixtralForCausalLM(SuperSparseMixtralPreTrainedModel):
     MIXTRAL_START_DOCSTRING,
 )
 # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mixtral, LLAMA->MIXTRAL
-class SuperSparseMixtralForSequenceClassification(SuperSparseMixtralPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = SuperSparseMixtralModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
         # Initialize weights and apply final processing
@@ -2090,11 +2090,11 @@ class SuperSparseMixtralForSequenceClassification(SuperSparseMixtralPreTrainedMo
     MIXTRAL_START_DOCSTRING,
 )
 # Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Mixtral, LLAMA->MIXTRAL
-class SuperSparseMixtralForTokenClassification(SuperSparseMixtralPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = SuperSparseMixtralModel(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None:

     replace_return_docstrings,
     is_torch_fx_available,
 )
+from .configuration_turbosparsemixtral import TurboSparseMixtralConfig
 @dataclass
 class AttentionMaskConverter:
     """
 # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mixtral
+class TurboSparseMixtralRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
         MixtralRMSNorm is equivalent to T5LayerNorm
 # copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
+class TurboSparseMixtralRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 # copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
+class TurboSparseMixtralAttention(nn.Module):
     """
     Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
     and "Generating Long Sequences with Sparse Transformers".
     """
+    def __init__(self, config: TurboSparseMixtralConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = TurboSparseMixtralRotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
 # copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
+class TurboSparseMixtralFlashAttention2(TurboSparseMixtralAttention):
     """
     Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
 # copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
+class TurboSparseMixtralSdpaAttention(TurboSparseMixtralAttention):
     """
     Mixtral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
     `MixtralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
 MIXTRAL_ATTENTION_CLASSES = {
+    "eager": TurboSparseMixtralAttention,
+    "flash_attention_2": TurboSparseMixtralFlashAttention2,
+    "sdpa": TurboSparseMixtralSdpaAttention,
 }
 class MLP(nn.Module):
         x = self.fc2(x)
         x = x.sigmoid()
         return x
+class TurboSparseMixtralBlockSparseTop2MLP(nn.Module):
+    def __init__(self, config: TurboSparseMixtralConfig, layer_id):
         super().__init__()
         self.ffn_dim = config.intermediate_size
         self.hidden_dim = config.hidden_size
         return current_hidden_states
+class TurboSparseMixtralSparseMoeBlock(nn.Module):
     """
     This implementation is
     strictly equivalent to standard MoE with full capacity (no
         # gating
         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+        self.experts = nn.ModuleList([TurboSparseMixtralBlockSparseTop2MLP(config, layer_id) for _ in range(self.num_experts)])
         # Jitter parameters
         self.jitter_noise = config.router_jitter_noise
         return final_hidden_states, router_logits
+class TurboSparseMixtralDecoderLayer(nn.Module):
+    def __init__(self, config: TurboSparseMixtralConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.self_attn = MIXTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.block_sparse_moe = TurboSparseMixtralSparseMoeBlock(config, layer_idx)
+        self.input_layernorm = TurboSparseMixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = TurboSparseMixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
         self,
     MIXTRAL_START_DOCSTRING,
 )
 # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel with Qwen2->Mixtral
+class TurboSparseMixtralPreTrainedModel(PreTrainedModel):
+    config_class = TurboSparseMixtralConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["TurboSparseMixtralDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
 )
 # copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral
 # TODO @longjie no longer copied from Mistral after static cache
+class TurboSparseMixtralModel(TurboSparseMixtralPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]
         config: MixtralConfig
     """
+    def __init__(self, config: TurboSparseMixtralConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
+            [TurboSparseMixtralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
+        self.norm = TurboSparseMixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
         )
+class TurboSparseMixtralForCausalLM(TurboSparseMixtralPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
     def __init__(self, config):
         super().__init__(config)
+        self.model = TurboSparseMixtralModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.router_aux_loss_coef = config.router_aux_loss_coef
     MIXTRAL_START_DOCSTRING,
 )
 # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mixtral, LLAMA->MIXTRAL
+class TurboSparseMixtralForSequenceClassification(TurboSparseMixtralPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.model = TurboSparseMixtralModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
         # Initialize weights and apply final processing
     MIXTRAL_START_DOCSTRING,
 )
 # Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Mixtral, LLAMA->MIXTRAL
+class TurboSparseMixtralForTokenClassification(TurboSparseMixtralPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.model = TurboSparseMixtralModel(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None: