Update modeling_motif.py
modeling_motif.py CHANGED (+3 -5)
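
The commit drops the optional Moreh RMSNorm binding (moreh_ops.T5LayerNorm, plus its None fallback) from the moreh-ops block and points the remaining norm call sites (the module-level ALL_LAYERNORM_LAYERS registration, the per-layer norms in MotifDecoderLayer, and the final norm in MotifModel) at the MotifRMSNorm class defined in this file. The moreh scaled-dot-product-attention and flash-attention ops are left in place.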
--- a/modeling_motif.py
+++ b/modeling_motif.py
@@ -63,12 +63,10 @@ if is_flash_attn_2_available():
 
 try:
     moreh_ops = torch.ops.moreh
-    MorehRMSNorm = moreh_ops.T5LayerNorm
     ScaledDotProductAttention = moreh_ops.scaled_dot_product_attention
     MorehFlashAttention = moreh_ops.flash_attention
     logger.warning_once("Using moreh ops")
 except AttributeError:
-    MorehRMSNorm = None
     ScaledDotProductAttention = None
     MorehFlashAttention = None
     logger.warning_once("Failed to import moreh ops")
@@ -100,7 +98,7 @@ class MotifRMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
 
 
-ALL_LAYERNORM_LAYERS.append(MorehRMSNorm)
+ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
 
 
 class MotifRotaryEmbeddingWithCache(nn.Module):
@@ -813,7 +811,7 @@ class MotifDecoderLayer(nn.Module):
         self.self_attn = MOTIF_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
         self.mlp = MotifMLP(config)
 
-        RMSNorm = MorehRMSNorm
+        RMSNorm = MotifRMSNorm
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
@@ -1051,7 +1049,7 @@ class MotifModel(MotifPreTrainedModel):
         num_hidden_layers = config.num_hidden_layers if self.multi_token_heads is None else config.num_hidden_layers - 1
         self.layers = nn.ModuleList([MotifDecoderLayer(config = config, layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)])
         self._attn_implementation = config._attn_implementation
-        RMSNorm = MorehRMSNorm
+        RMSNorm = MotifRMSNorm
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
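
Context on the first hunk: custom operators registered under torch.ops only become visible once their extension library is loaded, and torch.ops.<namespace> itself always resolves to a namespace proxy, so it is the attribute lookup of an individual op that raises AttributeError. A minimal sketch of the same probe-and-fallback pattern, using a hypothetical my_ext namespace and fused_rms_norm op (placeholders, not names from this repo):

import torch

# Sketch of the probe-and-fallback pattern used above. "my_ext" and
# "fused_rms_norm" are hypothetical placeholders for an optional extension:
# torch.ops.my_ext always resolves to a namespace proxy, but reading an op
# that no loaded library has registered raises AttributeError.
try:
    ext_ops = torch.ops.my_ext
    fused_rms_norm = ext_ops.fused_rms_norm  # AttributeError if not registered
except AttributeError:
    fused_rms_norm = None  # sentinel for "extension unavailable"


def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Use the fused op when present, otherwise a plain PyTorch fallback."""
    if fused_rms_norm is not None:
        return fused_rms_norm(x, weight, eps)
    # Fallback: compute statistics in float32, then cast back.
    variance = x.float().pow(2).mean(-1, keepdim=True)
    return weight * (x.float() * torch.rsqrt(variance + eps)).to(x.dtype)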
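
With the Moreh binding gone, every norm in the model is the file's own MotifRMSNorm, whose forward pass is not part of this diff. For reference, the T5LayerNorm-style RMSNorm that moreh_ops.T5LayerNorm provided is conventionally implemented as follows; this is a sketch of the standard formulation, assumed rather than copied from MotifRMSNorm:

import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    """Sketch of a T5LayerNorm-style RMSNorm: scale by the root mean square
    of the last dimension, with a learned weight and no mean subtraction or
    bias. Standard formulation, not the actual MotifRMSNorm source."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        # Compute statistics in float32 so half-precision inputs do not overflow.
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self) -> str:
        # Mirrors the extra_repr visible in the second hunk above.
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

Registering the class in ALL_LAYERNORM_LAYERS, as the second hunk does, keeps its weight out of weight decay in the Transformers Trainer, the same treatment the stock LayerNorm layers get.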