Eunhwan Park committed on
Update modeling_motif.py
Remove MorehFlashAttention

modeling_motif.py CHANGED (+2 -6)
@@ -63,10 +63,8 @@ if is_flash_attn_2_available():
 
 try:
     moreh_ops = torch.ops.moreh
-    MorehFlashAttention = moreh_ops.flash_attention
     logger.warning_once("Using moreh ops")
 except AttributeError:
-    MorehFlashAttention = None
     logger.warning_once("Failed to import moreh ops")
 
 #_CHECKPOINT_FOR_DOC = "moreh/Motif-102B"
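For context, the removed lines implemented op feature-detection against PyTorch's lazy `torch.ops` namespaces. A minimal runnable sketch of that pattern (the `custom_flash_attention` name is hypothetical; only the `moreh` namespace comes from the diff):

```python
import torch

# torch.ops.<namespace> is created lazily, so accessing torch.ops.moreh
# succeeds even when no extension is loaded; AttributeError is raised only
# when looking up an op (here flash_attention) that was never registered.
try:
    custom_flash_attention = torch.ops.moreh.flash_attention
except AttributeError:
    custom_flash_attention = None  # fall back to the stock flash-attn path

print("moreh kernel available:", custom_flash_attention is not None)
```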
@@ -597,7 +595,7 @@ class MotifFlashAttention2(MotifAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in float16 just to be sure everything works as expected.
         input_dtype = query_states.dtype
-        if input_dtype == torch.float32 and MorehFlashAttention is None:
+        if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
                 target_dtype = torch.get_autocast_gpu_dtype()
             # Handle the case where the model is quantized
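The surrounding code follows the usual Transformers flash-attention cast-back step: fp32 hidden states (e.g. silently upcast by fp32 layer norms) must be brought back to a half dtype before the kernel runs. A self-contained sketch under that assumption (`cast_for_flash_attn` is a hypothetical helper, not part of the model file):

```python
import torch

def cast_for_flash_attn(query_states, key_states, value_states,
                        default_dtype=torch.float16):
    # Flash-attention kernels only support fp16/bf16; pick the autocast
    # dtype when autocast is active, otherwise fall back to a default.
    if query_states.dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        else:
            target_dtype = default_dtype
        query_states = query_states.to(target_dtype)
        key_states = key_states.to(target_dtype)
        value_states = value_states.to(target_dtype)
    return query_states, key_states, value_states
```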
@@ -624,7 +622,7 @@ class MotifFlashAttention2(MotifAttention):
         value_states = value_states.transpose(1, 2)
 
         if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
-                and self.layer_idx >= self.config.max_window_layers and MorehFlashAttention is None):
+                and self.layer_idx >= self.config.max_window_layers):
             sliding_window = self.config.sliding_window
         else:
             sliding_window = None
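This condition is the Qwen2-style per-layer sliding-window gate: shallow layers keep full attention, and only layers at or above `max_window_layers` attend within a window. A hypothetical standalone version with a usage check:

```python
from typing import Optional

def resolve_sliding_window(use_sliding_window: bool,
                           sliding_window: Optional[int],
                           layer_idx: int,
                           max_window_layers: int) -> Optional[int]:
    # Returns the window size for this layer, or None for full attention.
    if (use_sliding_window and sliding_window is not None
            and layer_idx >= max_window_layers):
        return sliding_window
    return None

# e.g. with a 4096-token window starting at layer 28:
assert resolve_sliding_window(True, 4096, 30, 28) == 4096
assert resolve_sliding_window(True, 4096, 10, 28) is None
```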
@@ -1177,8 +1175,6 @@ class MotifModel(MotifPreTrainedModel):
         output_attentions: bool,
     ):
         if self.config._attn_implementation == "flash_attention_2":
-            if MorehFlashAttention is not None:
-                return attention_mask
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
             return None
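With the Moreh short-circuit gone, mask preparation for `flash_attention_2` reduces to the padding check: the 2-D mask is forwarded only if it actually contains padded (0.0) positions, and `None` is returned otherwise so the kernel can skip unpadding. A sketch of that surviving branch as a hypothetical free function:

```python
import torch
from typing import Optional

def prepare_fa2_mask(attention_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
    # `0.0 in tensor` is an element-wise membership test, i.e.
    # (attention_mask == 0.0).any() -- true only when padding exists.
    if attention_mask is not None and 0.0 in attention_mask:
        return attention_mask
    return None

print(prepare_fa2_mask(torch.tensor([[1.0, 1.0, 0.0]])))  # returns the mask
print(prepare_fa2_mask(torch.tensor([[1.0, 1.0, 1.0]])))  # None
```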