Support running without flash-attn
modeling_moonshot.py  CHANGED  (+46 -2)
@@ -27,13 +27,18 @@ else:
 27       from transformers.utils import is_flash_attn_available
 28       from .configuration_moonshot import MoonshotConfig
 29       import math
 30  +    import logging
 31  +
 32  +    logger = logging.getLogger(__name__)
 33
 34
 35       if is_flash_attn_available():
 36           from flash_attn import flash_attn_func, flash_attn_varlen_func
 37           from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 38  +        _flash_attn_2_available = True
 39       else:
 40  +        _flash_attn_2_available = False
 41  +        logger.warning("Flash Attention 2 is not available. Falling back to standard attention.")
 42
 43
 44       logger = logging.get_logger(__name__)
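Note on the guard above: the availability check runs once at import time, so whichever branch is taken fixes _flash_attn_2_available for the life of the process. If it is unclear which branch a given machine will take, a quick probe like the following (a hypothetical snippet, not part of this change) shows whether the flash_attn package is importable at all; the real is_flash_attn_available() helper also checks the torch/CUDA setup, so this only roughly mirrors it:

import importlib.util

# Probe for the flash_attn package without importing it.
has_flash_attn = importlib.util.find_spec("flash_attn") is not None
print("flash_attn importable:", has_flash_attn)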
@@ -380,6 +385,13 @@ class Attention(nn.Module):
385           softmax_scale (`float`, *optional*):
386               The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
387           """
388  +        if not _flash_attn_2_available:
389  +            return self._standard_attention(
390  +                query_states, key_states, value_states,
391  +                attention_mask=padding_mask, query_length=query_length,
392  +                dropout=dropout, softmax_scale=softmax_scale
393  +            )
394  +
395           # Contains at least one padding token in the sequence
396           if padding_mask is not None:
397               batch_size = query_states.shape[0]
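One caveat with this early return: the flash-attn path consumes padding_mask as a key-padding mask, while the _standard_attention method added below applies it additively (attn_weights + attention_mask), and nothing in the fallback applies a causal mask on its own, so causality has to arrive through that mask as well. A sketch of converting a 0/1 padding mask into the additive form the matmul path expects (a hypothetical helper, not part of the diff):

import torch

def to_additive_mask(padding_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # padding_mask: (batch, kv_len), 1 = keep, 0 = padding.
    # Returns (batch, 1, 1, kv_len): 0.0 where kept, a large negative value where
    # padded, broadcastable against (batch, heads, q_len, kv_len) attention scores.
    inverted = 1.0 - padding_mask.to(dtype)
    return inverted[:, None, None, :] * torch.finfo(dtype).min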
@@ -411,6 +423,38 @@ class Attention(nn.Module):
423
424           return attn_output
425
426  +    def _standard_attention(
427  +        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
428  +    ):
429  +        # Standard scaled dot-product attention
430  +        batch_size, q_length, num_heads, head_dim = query_states.shape
431  +
432  +        # Prepare the query, key, value for attention computation
433  +        # (batch_size, num_heads, seq_length, head_dim)
434  +        query_states = query_states.transpose(1, 2)
435  +        key_states = key_states.transpose(1, 2)
436  +        value_states = value_states.transpose(1, 2)
437  +
438  +        # (batch_size, num_heads, query_length, key_length)
439  +        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
440  +
441  +        if softmax_scale is None:
442  +            softmax_scale = 1.0 / math.sqrt(head_dim)
443  +        attn_weights = attn_weights * softmax_scale
444  +
445  +        if attention_mask is not None:
446  +            attn_weights = attn_weights + attention_mask
447  +
448  +        # Apply softmax and dropout
449  +        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
450  +        attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=self.training)
451  +
452  +        # Context vectors
453  +        attn_output = torch.matmul(attn_weights, value_states)
454  +        attn_output = attn_output.transpose(1, 2).contiguous()
455  +
456  +        return attn_output
457  +
458
459   class DecoderLayer(nn.Module):
460       def __init__(self, config: MoonshotConfig):
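The method above is plain scaled dot-product attention, so its output can be sanity-checked against PyTorch's built-in kernel. A minimal sketch, assuming inputs in (batch, seq_len, num_heads, head_dim) layout as the new method expects, with no mask and no dropout:

import math
import torch
import torch.nn.functional as F

batch, seq, heads, dim = 2, 16, 4, 64
q = torch.randn(batch, seq, heads, dim)
k = torch.randn(batch, seq, heads, dim)
v = torch.randn(batch, seq, heads, dim)

# Fallback path as in the diff: transpose to (b, h, s, d), score, softmax, weight.
qt, kt, vt = (t.transpose(1, 2) for t in (q, k, v))
scores = torch.matmul(qt, kt.transpose(2, 3)) / math.sqrt(dim)
out_manual = torch.matmul(F.softmax(scores, dim=-1), vt).transpose(1, 2)

# Reference: PyTorch's fused scaled dot-product attention.
out_ref = F.scaled_dot_product_attention(qt, kt, vt).transpose(1, 2)

print(torch.allclose(out_manual, out_ref, atol=1e-5))  # expected: True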
@@ -854,4 +898,4 @@ class MoonshotForCausalLM(MoonshotPreTrainedModel):
898               reordered_past += (
899                   tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
900               )
857  -        return reordered_past
901  +        return reordered_past