Update modeling_motif.py
modeling_motif.py CHANGED (+5, -6)
@@ -608,7 +608,7 @@ class MotifFlashAttention2(MotifAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in float16 just to be sure everything works as expected.
         input_dtype = query_states.dtype
-        if input_dtype == torch.float32
+        if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
                 target_dtype = torch.get_autocast_gpu_dtype()
             # Handle the case where the model is quantized
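This hunk restores the colon dropped from the `if input_dtype == torch.float32` guard, a syntax error in the previous revision. For context, here is a minimal, self-contained sketch of the cast-back pattern the guard implements; the helper name and the `torch.float16` fallback are illustrative, not taken from the file:

import torch

def cast_for_flash_attention(x: torch.Tensor, fallback: torch.dtype = torch.float16) -> torch.Tensor:
    # FlashAttention kernels expect fp16/bf16 inputs, so fp32 activations
    # (e.g. silently upcast by a preceding layernorm) are cast back down.
    if x.dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()  # autocast's half-precision dtype
        else:
            target_dtype = fallback  # illustrative default; the real code also handles quantized models
        return x.to(target_dtype)
    return x

query_states = torch.randn(1, 8, 128, 64)            # fp32 by default
print(cast_for_flash_attention(query_states).dtype)  # torch.float16 outside autocast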
@@ -635,7 +635,7 @@ class MotifFlashAttention2(MotifAttention):
         value_states = value_states.transpose(1, 2)
 
         if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
-                and self.layer_idx >= self.config.max_window_layers
+                and self.layer_idx >= self.config.max_window_layers):
             sliding_window = self.config.sliding_window
         else:
             sliding_window = None
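The second hunk closes the multi-line condition (the `):` was missing), which decides per layer whether FlashAttention receives a bounded attention window. A self-contained sketch of the rule it encodes, with plain function arguments standing in for the config and attribute lookups:

from typing import Optional

def resolve_sliding_window(use_sliding_window: bool,
                           sliding_window: Optional[int],
                           max_window_layers: int,
                           layer_idx: int) -> Optional[int]:
    # Layers at index >= max_window_layers attend within a bounded window;
    # earlier layers keep full (global) attention.
    if (use_sliding_window and sliding_window is not None
            and layer_idx >= max_window_layers):
        return sliding_window
    return None

for layer_idx in range(4):
    print(layer_idx, resolve_sliding_window(True, 4096, 2, layer_idx))
# 0 None
# 1 None
# 2 4096
# 3 4096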
@@ -789,8 +789,7 @@ class MotifDecoderLayer(nn.Module):
     def __init__(self, config: MotifConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-
-        config._attn_implementation = "flash_attention_2"
+
         if config.sliding_window and config._attn_implementation != "flash_attention_2":
             logger.warning_once(
                 f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
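Dropping the hard-coded `config._attn_implementation = "flash_attention_2"` assignment means the layer no longer overrides the backend chosen at load time, so the sliding-window warning just below it can actually fire for non-FlashAttention backends. A hedged usage sketch; the checkpoint id is illustrative, and any checkpoint shipping this `modeling_motif.py` would behave the same way:

from transformers import AutoModelForCausalLM

# With the override removed, the backend requested here is honored instead
# of being silently forced to FlashAttention 2.
model = AutoModelForCausalLM.from_pretrained(
    "Motif-Technologies/Motif-2.6B",   # illustrative checkpoint id
    attn_implementation="sdpa",        # or "eager" / "flash_attention_2"
    trust_remote_code=True,            # the Motif modeling code lives in the repo
)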
@@ -801,7 +800,7 @@ class MotifDecoderLayer(nn.Module):
         self.self_attn = MOTIF_ATTENTION_CLASSES["eager"](config, layer_idx)
         self.mlp = MotifMLP(config)
 
-        RMSNorm =
+        RMSNorm = MotifRMSNorm
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
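The previous revision left `RMSNorm =` dangling, another syntax error; this hunk and the matching one in `MotifModel` below alias it to `MotifRMSNorm`, which is defined earlier in the file. Assuming it follows the standard Llama/T5-style formulation, a minimal reference implementation looks like this (the class name here is illustrative):

import torch
from torch import nn

class RMSNormSketch(nn.Module):
    # Standard RMSNorm: scale features by the reciprocal root mean square;
    # the variance is computed in fp32 for numerical stability.
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)

norm = RMSNormSketch(hidden_size=16)
print(norm(torch.randn(2, 16)).shape)  # torch.Size([2, 16])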
@@ -1055,7 +1054,7 @@ class MotifModel(MotifPreTrainedModel):
             MotifDecoderLayer(config = config, layer_idx=layer_idx) for layer_idx in range(num_hidden_layers)
         ])
         self._attn_implementation = config._attn_implementation
-        RMSNorm =
+        RMSNorm = MotifRMSNorm
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads