Commit 1fb9eb6 by Jackmin108 (parent: ad420a7)

use tt moe

Files changed (3)
  1. config.json +5 -0
  2. configuration_glm4_moe.py +2 -2
  3. modeling_glm4_moe.py +29 -15
config.json CHANGED
@@ -2,6 +2,11 @@
   "architectures": [
     "Glm4MoeForCausalLM"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_glm4_moe.Glm4MoeConfig",
+    "AutoModelForCausalLM": "modeling_glm4_moe.Glm4MoeForCausalLM",
+    "AutoModel": "modeling_glm4_moe.Glm4MoeModel"
+  },
   "attention_bias": true,
   "attention_dropout": 0.0,
   "pad_token_id": 151329,
configuration_glm4_moe.py CHANGED
@@ -19,8 +19,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...configuration_utils import PretrainedConfig
-from ...modeling_rope_utils import rope_config_validation
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
 
 
 class Glm4MoeConfig(PretrainedConfig):
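
The relative `from ...` imports only resolve while the file lives inside the `transformers` source tree; shipped as remote code next to the checkpoint, the module has to import from the installed package instead. A quick sanity check that the absolute imports used above resolve (both names are taken directly from the patched import lines; nothing here is specific to this repo):

```python
# Verifies only that the installed transformers package exposes the imported names.
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation

print(PretrainedConfig.__module__)       # transformers.configuration_utils
print(callable(rope_config_validation))  # True
```
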
modeling_glm4_moe.py CHANGED
@@ -25,22 +25,24 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
-from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache
-from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
-from ...masking_utils import create_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
-from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.deprecation import deprecate_kwarg
-from ...utils.generic import check_model_inputs
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
+from transformers.utils.deprecation import deprecate_kwarg
+from transformers.utils.generic import check_model_inputs
 from .configuration_glm4_moe import Glm4MoeConfig
 
+from torchtitan.models.moe import MoE, MoEArgs
+
 
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
@@ -354,8 +356,20 @@ class Glm4MoeDecoderLayer(GradientCheckpointingLayer):
 
         self.self_attn = Glm4MoeAttention(config=config, layer_idx=layer_idx)
 
+        moe_args = MoEArgs(
+            num_experts=config.n_routed_experts,
+            num_shared_experts=config.n_shared_experts,
+            score_func="sigmoid",
+            route_norm=config.norm_topk_prob,
+            route_scale=config.routed_scaling_factor,
+            score_before_experts=False,
+            top_k=config.num_experts_per_tok,
+            use_grouped_mm=True,
+            load_balance_coeff=1e-3,
+        )
+
         if layer_idx >= config.first_k_dense_replace:
-            self.mlp = Glm4MoeMoE(config)
+            self.mlp = MoE(moe_args, dim=config.hidden_size, hidden_dim=config.moe_intermediate_size)
         else:
             self.mlp = Glm4MoeMLP(config)
 
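
With this patch, only the decoder layers at or beyond `first_k_dense_replace` swap their MLP for the torchtitan `MoE`; earlier layers keep the dense `Glm4MoeMLP`. An illustrative sketch of that selection rule with hypothetical config values (the real counts come from `config.json`):

```python
# Hypothetical values for illustration only; the checkpoint's config defines the real ones.
first_k_dense_replace = 1
num_hidden_layers = 4

for layer_idx in range(num_hidden_layers):
    mlp_kind = "torchtitan MoE" if layer_idx >= first_k_dense_replace else "dense Glm4MoeMLP"
    print(f"layer {layer_idx}: {mlp_kind}")
# layer 0: dense Glm4MoeMLP
# layer 1: torchtitan MoE
# layer 2: torchtitan MoE
# layer 3: torchtitan MoE
```
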