Update modeling_motif.py
modeling_motif.py  CHANGED  +2 -6
@@ -38,7 +38,6 @@ if is_flash_attn_2_available():

import einops

-MorehFlashAttention = None
try:
    kernelRMSNorm = activation.layers.RMSNorm
    PolyNormKernel = activation.layers.PolyNorm
@@ -46,7 +45,7 @@ try:
except AttributeError:
    kernelRMSNorm = None
    PolyNormKernel = None
-    logger.warning_once("Failed to import
+    logger.warning_once("Failed to import kernel ops")

_CONFIG_FOR_DOC = "MotifConfig"
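The two hunks above drop the module-level MorehFlashAttention flag and fix the warning in the optional-kernel fallback. For context, a minimal sketch of that import-fallback pattern, assuming a hypothetical `activation` package and plain `logging` in place of transformers' `logger.warning_once`:

```python
import logging

logger = logging.getLogger(__name__)

kernelRMSNorm = None
PolyNormKernel = None
try:
    # `activation` is a hypothetical fused-kernel package standing in
    # for whatever the real file imports above this block.
    import activation
    kernelRMSNorm = activation.layers.RMSNorm
    PolyNormKernel = activation.layers.PolyNorm
except (ImportError, AttributeError):
    # Fall back to the pure-PyTorch implementations defined elsewhere.
    logger.warning("Failed to import kernel ops")
```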
@@ -617,7 +616,7 @@ class MotifFlashAttention2(MotifAttention):
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
-        if input_dtype == torch.float32
+        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
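This hunk restores the float32 guard around the cast-back logic. A standalone sketch of that dtype selection, where the helper name and the `config` fallback are assumptions rather than the file's actual structure:

```python
import torch

def resolve_fa2_dtype(input_dtype: torch.dtype, config=None) -> torch.dtype:
    # Flash-attention kernels only support fp16/bf16, so float32 inputs
    # must be cast back to a half-precision dtype before the kernel call.
    if input_dtype != torch.float32:
        return input_dtype
    if torch.is_autocast_enabled():
        # Respect the active autocast dtype (e.g. torch.bfloat16).
        return torch.get_autocast_gpu_dtype()
    # Handle the case where the model is quantized or a dtype was set
    # explicitly on the config (attribute name is an assumption).
    if config is not None and getattr(config, "torch_dtype", None) is not None:
        return config.torch_dtype
    return torch.float16
```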
@@ -648,7 +647,6 @@ class MotifFlashAttention2(MotifAttention):
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
-            and MorehFlashAttention is None
        ):
            sliding_window = self.config.sliding_window
        else:
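With the `MorehFlashAttention is None` guard removed, the sliding-window choice depends only on the config and the layer index. An illustrative standalone version of the post-change condition (the function name is an assumption):

```python
from types import SimpleNamespace

def pick_sliding_window(config, layer_idx):
    # Only layers at or beyond max_window_layers use the sliding window;
    # earlier layers attend over the full sequence.
    if (
        config.use_sliding_window
        and getattr(config, "sliding_window", None) is not None
        and layer_idx >= config.max_window_layers
    ):
        return config.sliding_window
    return None

cfg = SimpleNamespace(use_sliding_window=True, sliding_window=4096, max_window_layers=16)
assert pick_sliding_window(cfg, layer_idx=20) == 4096
assert pick_sliding_window(cfg, layer_idx=3) is None
```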
@@ -1241,8 +1239,6 @@ class MotifModel(MotifPreTrainedModel):
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
-            if MorehFlashAttention is not None:
-                return attention_mask
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None
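The removed branch passed the mask through unconditionally whenever MorehFlashAttention was available; after this change, flash_attention_2 always takes the padding check. A sketch of the remaining logic (the function name is assumed, not taken from the file):

```python
import torch

def fa2_attention_mask(attention_mask, attn_implementation):
    # flash_attention_2 only needs an explicit mask when real padding
    # (a 0.0 entry) is present; returning None selects the fast causal path.
    if attn_implementation == "flash_attention_2":
        if attention_mask is not None and 0.0 in attention_mask:
            return attention_mask
        return None
    return attention_mask

mask = torch.tensor([[1.0, 1.0, 0.0]])
assert fa2_attention_mask(mask, "flash_attention_2") is mask
assert fa2_attention_mask(torch.ones(1, 3), "flash_attention_2") is None
```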
|