Add flash_attn_func + harden MPS dispatch for transformers compatibility

by ArthurZ HF Staff - opened 11 days ago

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+54

-7

Files changed (1) hide show

torch-ext/metal_flash_sdpa/_custom_ops.py +54 -7

torch-ext/metal_flash_sdpa/_custom_ops.py CHANGED Viewed

@@ -20,7 +20,7 @@ def flash_attention_varlen(
 ) -> None:
     """
     Flash Attention with variable-length sequences.
     Args:
         out: Output tensor of shape [total_q_tokens, num_heads, head_dim]
         query: Query tensor of shape [total_q_tokens, num_heads, head_dim]
@@ -33,7 +33,7 @@ def flash_attention_varlen(
         do_causal: Whether to apply causal masking
         scale: Attention scale factor (default: 1/sqrt(head_dim))
         softcapping: Softcapping value (default: 1.0, must be 1.0 for this implementation)
     Note:
         - cu_seqlens_q and cu_seqlens_k must have dtype torch.int32 for Metal compatibility
         - Supported head dimensions: 32, 64, 72, 80, 96, 128
@@ -41,7 +41,7 @@ def flash_attention_varlen(
     """
     if scale is None:
         scale = query.shape[-1] ** -0.5
     ops.flash_attention_varlen(
         out,
         query,
@@ -56,6 +56,45 @@ def flash_attention_varlen(
         softcapping,
     )
 def flash_attn_varlen_func(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -74,7 +113,7 @@ def flash_attn_varlen_func(
 ) -> torch.Tensor:
     """
     Flash Attention function with API compatible with the original Flash Attention.
     Note: This implementation does not support:
     - dropout
     - window attention
@@ -89,10 +128,18 @@ def flash_attn_varlen_func(
         raise NotImplementedError("ALiBi is not supported")
     if return_attn_probs:
         raise NotImplementedError("Returning attention probabilities is not supported")
     # Create output tensor
     out = torch.empty_like(q)
     # Call the kernel
     flash_attention_varlen(
         out=out,
@@ -107,7 +154,7 @@ def flash_attn_varlen_func(
         scale=softmax_scale,
         softcapping=1.0,
     )
     return out

 ) -> None:
     """
     Flash Attention with variable-length sequences.
     Args:
         out: Output tensor of shape [total_q_tokens, num_heads, head_dim]
         query: Query tensor of shape [total_q_tokens, num_heads, head_dim]
         do_causal: Whether to apply causal masking
         scale: Attention scale factor (default: 1/sqrt(head_dim))
         softcapping: Softcapping value (default: 1.0, must be 1.0 for this implementation)
     Note:
         - cu_seqlens_q and cu_seqlens_k must have dtype torch.int32 for Metal compatibility
         - Supported head dimensions: 32, 64, 72, 80, 96, 128
     """
     if scale is None:
         scale = query.shape[-1] ** -0.5
     ops.flash_attention_varlen(
         out,
         query,
         softcapping,
     )
+def _prepare_varlen_inputs(q, k, v, cu_seqlens_q, cu_seqlens_k):
+    """Normalize Q/K/V and cumulative-length tensors before dispatching to the Metal kernel.
+    The kernel reads its inputs as flat strided buffers and expects:
+      - contiguous Q/K/V (decode-time cached K/V from PyTorch is typically a transposed view)
+      - int32 cumulative sequence lengths (Metal pointer type)
+      - distinct buffers for `cu_seqlens_q` and `cu_seqlens_k` (passing the same tensor in both
+        slots aliases the same Metal argument and yields incorrect attention scores)
+    The values of caller-provided ``cu_seqlens_q``/``cu_seqlens_k`` are preserved; this function
+    only performs the dtype / contiguity / aliasing fixups the kernel needs, each of which may
+    replace an argument with a copy. ``cu_seqlens`` are never synthesized here — that
+    responsibility stays with the caller (continuous batching, padding-free training, etc.).
+    Args:
+        q: Query tensor; replaced by ``q.contiguous()`` when it is not already contiguous.
+        k: Key tensor; same contiguity handling as ``q``.
+        v: Value tensor; same contiguity handling as ``q``.
+        cu_seqlens_q: Cumulative query lengths; cast to ``torch.int32`` when the dtype differs.
+        cu_seqlens_k: Cumulative key lengths; cast to ``torch.int32`` when the dtype differs and
+            cloned when it starts at the same address as ``cu_seqlens_q``.
+    Returns:
+        The tuple ``(q, k, v, cu_seqlens_q, cu_seqlens_k)``. Every element is either the object
+        that was passed in (no fixup needed) or the copy produced by the fixup.
+    Side effects:
+        Calls ``torch.mps.synchronize()`` at most once, and only when at least one fixup
+        produced a tensor on an MPS device.
+    """
+    # Set to True as soon as any fixup below yields an MPS tensor; drives the single
+    # synchronize call at the end.
+    needs_sync = False
+    # `is_mps` is read from the freshly produced tensor in each branch below.
+    if not q.is_contiguous():
+        q = q.contiguous()
+        needs_sync = needs_sync or q.is_mps
+    if not k.is_contiguous():
+        k = k.contiguous()
+        needs_sync = needs_sync or k.is_mps
+    if not v.is_contiguous():
+        v = v.contiguous()
+        needs_sync = needs_sync or v.is_mps
+    # dtype fixups: anything other than int32 is cast (the docstring above gives the reason).
+    if cu_seqlens_q.dtype != torch.int32:
+        cu_seqlens_q = cu_seqlens_q.to(torch.int32)
+        needs_sync = needs_sync or cu_seqlens_q.is_mps
+    if cu_seqlens_k.dtype != torch.int32:
+        cu_seqlens_k = cu_seqlens_k.to(torch.int32)
+        needs_sync = needs_sync or cu_seqlens_k.is_mps
+    # Aliasing fixup: runs after the casts so it compares the tensors that will actually be
+    # handed to the kernel. Only an identical start address is detected here.
+    if cu_seqlens_k.data_ptr() == cu_seqlens_q.data_ptr():
+        cu_seqlens_k = cu_seqlens_k.clone()
+        needs_sync = needs_sync or cu_seqlens_k.is_mps
+    # NOTE(review): presumably this makes sure the copies/casts queued above have finished
+    # before the custom kernel is launched — confirm against the kernel's dispatch code.
+    if needs_sync and torch.backends.mps.is_available():
+        torch.mps.synchronize()
+    return q, k, v, cu_seqlens_q, cu_seqlens_k
 def flash_attn_varlen_func(
     q: torch.Tensor,
     k: torch.Tensor,
 ) -> torch.Tensor:
     """
     Flash Attention function with API compatible with the original Flash Attention.
     Note: This implementation does not support:
     - dropout
     - window attention
         raise NotImplementedError("ALiBi is not supported")
     if return_attn_probs:
         raise NotImplementedError("Returning attention probabilities is not supported")
+    q, k, v, cu_seqlens_q, cu_seqlens_k = _prepare_varlen_inputs(q, k, v, cu_seqlens_q, cu_seqlens_k)
     # Create output tensor
     out = torch.empty_like(q)
+    # Flush any pending Metal encoder before launching the custom kernel; without this, a
+    # preceding op (e.g. `.contiguous()` on a transposed cache view) leaves an encoder open
+    # and the kernel trips ``"A command encoder is already encoding to this command buffer"``.
+    if q.is_mps and torch.backends.mps.is_available():
+        torch.mps.synchronize()
     # Call the kernel
     flash_attention_varlen(
         out=out,
         scale=softmax_scale,
         softcapping=1.0,
     )
     return out