fix flash attention
Browse files- modeling_gptbert.py +8 -6
modeling_gptbert.py
CHANGED
|
@@ -270,6 +270,7 @@ def flash_attention_forward(
|
|
| 270 |
rotary_emb: UnpaddedRotaryEmbedding,
|
| 271 |
cu_seqlens: torch.Tensor,
|
| 272 |
max_seqlen: int,
|
|
|
|
| 273 |
local_attention: Tuple[int, int],
|
| 274 |
dropout_p: float,
|
| 275 |
deterministic: bool,
|
|
@@ -289,9 +290,9 @@ def flash_attention_forward(
|
|
| 289 |
qkv,
|
| 290 |
cu_seqlens=cu_seqlens,
|
| 291 |
max_seqlen=max_seqlen,
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
causal=False
|
| 296 |
)
|
| 297 |
attn = attn.to(orig_dtype) # type: ignore
|
|
@@ -300,9 +301,9 @@ def flash_attention_forward(
|
|
| 300 |
qkv,
|
| 301 |
cu_seqlens=cu_seqlens,
|
| 302 |
max_seqlen=max_seqlen,
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
causal=False
|
| 307 |
)
|
| 308 |
return attn
|
|
@@ -460,6 +461,7 @@ class SelfAttention(nn.Module):
|
|
| 460 |
self.rope_embedding,
|
| 461 |
cu_seqlens,
|
| 462 |
max_seqlen,
|
|
|
|
| 463 |
local_attention,
|
| 464 |
self.attention_dropout if self.training else 0.0,
|
| 465 |
self.deterministic_flash_attn
|
|
|
|
| 270 |
rotary_emb: UnpaddedRotaryEmbedding,
|
| 271 |
cu_seqlens: torch.Tensor,
|
| 272 |
max_seqlen: int,
|
| 273 |
+
causal: bool,
|
| 274 |
local_attention: Tuple[int, int],
|
| 275 |
dropout_p: float,
|
| 276 |
deterministic: bool,
|
|
|
|
| 290 |
qkv,
|
| 291 |
cu_seqlens=cu_seqlens,
|
| 292 |
max_seqlen=max_seqlen,
|
| 293 |
+
dropout_p=dropout_p,
|
| 294 |
+
deterministic=deterministic,
|
| 295 |
+
window_size=local_attention,
|
| 296 |
causal=False
|
| 297 |
)
|
| 298 |
attn = attn.to(orig_dtype) # type: ignore
|
|
|
|
| 301 |
qkv,
|
| 302 |
cu_seqlens=cu_seqlens,
|
| 303 |
max_seqlen=max_seqlen,
|
| 304 |
+
dropout_p=dropout_p,
|
| 305 |
+
deterministic=deterministic,
|
| 306 |
+
window_size=local_attention,
|
| 307 |
causal=False
|
| 308 |
)
|
| 309 |
return attn
|
|
|
|
| 461 |
self.rope_embedding,
|
| 462 |
cu_seqlens,
|
| 463 |
max_seqlen,
|
| 464 |
+
self.is_causal,
|
| 465 |
local_attention,
|
| 466 |
self.attention_dropout if self.training else 0.0,
|
| 467 |
self.deterministic_flash_attn
|