Jackmin801 committed
Commit 6f3de15 · Parent(s): 4f24e0f
set flash attn as option in config
Browse files:
- configuration_bert.py +4 -0
- flash_attn_triton.py +9 -32
- modeling_bert.py +17 -6
configuration_bert.py
CHANGED

@@ -127,6 +127,8 @@ class JinaBertConfig(PretrainedConfig):
         emb_pooler (`str`, *optional*, defaults to `None`):
             The function to use for pooling the last layer embeddings to get the sentence embeddings.
             Should be one of `None`, `"mean"`.
+        with_flash (`bool`, *optional*, defaults to `False`):
+            Whether to use flash attention. Only works for `triton==2.0.0.dev20230208`
 
     Examples:
 
@@ -164,6 +166,7 @@ class JinaBertConfig(PretrainedConfig):
         classifier_dropout=None,
         feed_forward_type="original",
         emb_pooler=None,
+        with_flash=False,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -185,6 +188,7 @@ class JinaBertConfig(PretrainedConfig):
         self.classifier_dropout = classifier_dropout
         self.feed_forward_type = feed_forward_type
         self.emb_pooler = emb_pooler
+        self.with_flash = with_flash
 
 
 class JinaBertOnnxConfig(OnnxConfig):
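A minimal usage sketch for the new option (an assumption for illustration: it presumes configuration_bert.py is importable from the working directory; the only new API surface is the constructor argument added above):

    from configuration_bert import JinaBertConfig

    # with_flash defaults to False, which keeps the stock attention path;
    # True routes attention through the Triton kernel (modeling_bert.py,
    # below, raises a ValueError if that kernel could not be imported).
    config = JinaBertConfig(with_flash=True)
    print(config.with_flash)  # True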
flash_attn_triton.py
CHANGED

@@ -81,21 +81,11 @@ def _fwd_kernel(
     Lse,
     TMP,  # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
     softmax_scale,
-    stride_qb,
-    stride_qh,
-    stride_qm,
-    stride_kb,
-    stride_kh,
-    stride_kn,
-    stride_vb,
-    stride_vh,
-    stride_vn,
-    stride_bb,
-    stride_bh,
-    stride_bm,
-    stride_ob,
-    stride_oh,
-    stride_om,
+    stride_qb, stride_qh, stride_qm,
+    stride_kb, stride_kh, stride_kn,
+    stride_vb, stride_vh, stride_vn,
+    stride_bb, stride_bh, stride_bm,
+    stride_ob, stride_oh, stride_om,
     nheads,
     seqlen_q,
     seqlen_k,
@@ -316,11 +306,6 @@ def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
     elif bias.shape[2:] == (seqlen_q, seqlen_k):
         bias_type = 'matrix'
     else:
-        print(q.shape)
-        print(k.shape)
-        print(seqlen_q)
-        print(seqlen_k)
-        print(bias.shape)
         raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k)'
                            ' or (seqlen_q, seqlen_k)')
     if bias.shape[:2] == (1, nheads):
@@ -359,19 +344,11 @@ def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
         lse,
         tmp,
         softmax_scale,
-        q.stride(0),
-        q.stride(2),
-        q.stride(1),
-        k.stride(0),
-        k.stride(2),
-        k.stride(1),
-        v.stride(0),
-        v.stride(2),
-        v.stride(1),
+        q.stride(0), q.stride(2), q.stride(1),
+        k.stride(0), k.stride(2), k.stride(1),
+        v.stride(0), v.stride(2), v.stride(1),
         *bias_strides,
-        o.stride(0),
-        o.stride(2),
-        o.stride(1),
+        o.stride(0), o.stride(2), o.stride(1),
         nheads,
         seqlen_q,
         seqlen_k,
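The call site passes each tensor's strides in (batch, head, seqlen) order, while q, k, v are laid out as (batch, seqlen, nheads, headdim); that is why the arguments read q.stride(0), q.stride(2), q.stride(1). A quick sketch to confirm the ordering (the shape is made up for illustration):

    import torch

    q = torch.empty(2, 1024, 12, 64)  # (batch, seqlen, nheads, headdim)
    # contiguous strides: (1024*12*64, 12*64, 64, 1) = (786432, 768, 64, 1)
    print(q.stride(0), q.stride(2), q.stride(1))  # 786432 64 768 -> batch, head, seq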
modeling_bert.py
CHANGED

@@ -55,7 +55,10 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from .configuration_bert import JinaBertConfig
-
+try:
+    from .flash_attn_triton import flash_attn_func
+except Exception:
+    flash_attn_func = None
 
 try:
     from tqdm.autonotebook import trange
@@ -282,7 +285,7 @@ class JinaBertEmbeddings(nn.Module):
 
 
 class JinaBertSelfAttention(nn.Module):
-    def __init__(self, config, position_embedding_type=None):
+    def __init__(self, config: JinaBertConfig, position_embedding_type=None):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
             config, "embedding_size"
@@ -291,6 +294,13 @@ class JinaBertSelfAttention(nn.Module):
             f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
             f"heads ({config.num_attention_heads})"
         )
+
+        self.with_flash = config.with_flash
+        if self.with_flash:
+            if flash_attn_func is None:
+                raise ValueError(
+                    f"flash_attn_func is None, please install flash_attn_triton"
+                )
 
         self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
@@ -334,14 +344,15 @@ class JinaBertSelfAttention(nn.Module):
         output_attentions: Optional[bool] = False,
         bias: Optional[torch.FloatTensor] = None,
     ) -> Tuple[torch.Tensor]:
-        if
+        if self.with_flash:
             b, s, h = hidden_states.shape
             q = self.query(hidden_states)
             k = self.key(hidden_states)
             v = self.value(hidden_states)
-
-
-
+            # B x S x hidden_dim -> B x S x num_heads x head_dim
+            q = q.view(b, s, self.num_attention_heads, self.attention_head_size)
+            k = k.view(b, s, self.num_attention_heads, self.attention_head_size)
+            v = v.view(b, s, self.num_attention_heads, self.attention_head_size)
             attn = flash_attn_func(q, k, v, bias)
             return (attn.view(b, s, h),)
         mixed_query_layer = self.query(hidden_states)
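For reference, the new flash branch in JinaBertSelfAttention.forward boils down to the shape handling below. This is a standalone sketch, not the module itself: flash_attn_func is stubbed with PyTorch's scaled_dot_product_attention so the snippet runs without Triton, and the sizes are invented.

    import torch
    import torch.nn.functional as F

    def flash_attn_func(q, k, v, bias=None):
        # Stand-in for the Triton kernel: (B, S, nheads, d) in, same shape out.
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # SDPA wants (B, nheads, S, d)
        o = F.scaled_dot_product_attention(q, k, v, attn_mask=bias)
        return o.transpose(1, 2).contiguous()  # the real kernel writes a contiguous output

    b, s, h = 2, 128, 768                      # batch, seqlen, hidden_dim
    num_heads, head_size = 12, 64              # h == num_heads * head_size
    x = torch.randn(b, s, h)
    # B x S x hidden_dim -> B x S x num_heads x head_dim, as in the new forward path
    q = k = v = x.view(b, s, num_heads, head_size)
    attn = flash_attn_func(q, k, v, bias=None)
    print(attn.view(b, s, h).shape)            # torch.Size([2, 128, 768])

Note the return path relies on the attention output being contiguous so that attn.view(b, s, h) can fold the heads back into the hidden dimension; the stub's .contiguous() mimics that property of the kernel.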