Workaround for issue: get_imports failing to respect conditionals on imports
https://github.com/huggingface/transformers/issues/28459
This should allow the code to work without the flash_attn (flash2) module installed, and allow it to run on a CPU.
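
For context on why the try/except wrapper matters: when a model is loaded with trust_remote_code, transformers scans the modelling file with get_imports(), which collects import statements by regex rather than executing the file, so it ignores the "if is_flash_attn_2_available():" guard but drops imports that sit inside a try/except block. Below is a minimal sketch of that behavior, assuming get_imports() still works as described in the linked issue (later transformers releases may differ); the imports_of() helper exists only for this illustration.

```python
import os
import tempfile

# get_imports() is the scanner transformers runs on remote-code modelling files.
from transformers.dynamic_module_utils import get_imports


def imports_of(source: str) -> list:
    # get_imports() takes a filename, so write the snippet to a temporary file first.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(source)
        path = f.name
    try:
        return get_imports(path)
    finally:
        os.remove(path)


guarded = (
    "if is_flash_attn_2_available():\n"
    "    from flash_attn import flash_attn_func\n"
)
wrapped = (
    "try:\n"
    "    from flash_attn import flash_attn_func\n"
    "except ImportError:\n"
    "    flash_attn_func = None\n"
)

# The conditional import is still reported, so flash_attn looks like a hard
# requirement; the try/except version is expected to be skipped.
print(imports_of(guarded))  # expected: ['flash_attn']
print(imports_of(wrapped))  # expected: []
```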
- modelling_walsh.py +14 -7
modelling_walsh.py (CHANGED):

```diff
@@ -27,6 +27,13 @@ from transformers.utils import (
     is_flash_attn_greater_or_equal_2_10,
 )
 
+# Workaround for https://github.com/huggingface/transformers/issues/28459
+if is_flash_attn_2_available():
+    try:
+        from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
+    except:
+        print("Could not import flash2")
+
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
 
@@ -825,7 +832,7 @@ class CausalSelfAttention(nn.Module):
         init.constant_(self.output_linear.bias, 0.)
 
     # Project QKV input through input matrices, reshape to (batch_size, n_heads, seq_len, d_model), and apply cache.
-    def
+    def _project_input(self, qkv, past_key_values):
         batch_size, seq_len, d_embed = qkv.shape
         proj = self.in_proj(qkv)
         query, key, value = proj.chunk(chunks=3, dim=-1)
@@ -857,15 +864,15 @@ class CausalSelfAttention(nn.Module):
 
         if attn_type == "flash2":
             if use_cache is None or use_cache == False:
-                return self.
+                return self._flash2_forward(qkv)
             else:
-                return self.
+                return self._flash2_forward_cached(qkv, past_key_values)
 
         # qkv: (batch_size, seq_len, d_embed)
         batch_size, seq_len, d_embed = qkv.shape
 
         # Feed the inputs through the K, Q, V matrices.
-        query, key, value = self.
+        query, key, value = self._project_input(qkv, past_key_values)
         kv_seq_len = key.shape[-2]
 
         # Default to returning empty attention weights.
@@ -922,7 +929,7 @@ class CausalSelfAttention(nn.Module):
         )
 
     # No cache support, but faster
-    def
+    def _flash2_forward(
         self,
         qkv,
     ):
@@ -961,7 +968,7 @@ class CausalSelfAttention(nn.Module):
 
     # See https://github.com/huggingface/transformers/blob/main/src/transformers/cache_utils.py
     #https://huggingface.co/docs/transformers/internal/generation_utils
-    def
+    def _flash2_forward_cached(
         self,
         qkv,
         past_key_values,
@@ -969,7 +976,7 @@ class CausalSelfAttention(nn.Module):
         batch_size, seq_len, d_embed = qkv.shape
 
         # Feed the inputs through the K, Q, V matrices.
-        query, key, value = self.
+        query, key, value = self._project_input(qkv, past_key_values)
         query, key, value = self._downcast_to_float16(query, key, value)
 
         # Expected inputs to flash2:
```