Kernels
Committed by danieldk (HF Staff)
Commit 98af189 · verified
1 parent: dd401a6

Build uploaded using `kernels`.
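Builds uploaded this way are meant to be fetched at runtime with the `kernels` library rather than installed as a wheel. A minimal sketch, assuming the Hub repository id is `kernels-community/flash-attn2` (the id is not stated in this commit) and that a build matching the local torch/CUDA combination exists:

import torch
from kernels import get_kernel  # the Hugging Face `kernels` package

# get_kernel downloads and imports the build variant matching the local
# torch version, C++ ABI and accelerator.
flash_attn2 = get_kernel("kernels-community/flash-attn2")  # hypothetical repo id

q = torch.randn(1, 128, 8, 64, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)
out = flash_attn2.flash_attn_func(q, k, v, causal=True)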

This view is limited to 50 files because it contains too many changes.
Files changed (50):
1. build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_5e9f49f.abi3.so → _flash_attn2_588b404.abi3.so} +1 -1
2. build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
3. build/torch210-cxx11-cpu-x86_64-linux/metadata.json +4 -1
4. build/torch210-cxx11-cu126-x86_64-linux/__init__.py +393 -0
5. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2/_flash_attn_9e27194.abi3.so → torch210-cxx11-cu126-x86_64-linux/_flash_attn2_588b404.abi3.so} +2 -2
6. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/_ops.py +3 -3
7. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/bert_padding.py +0 -0
8. build/torch210-cxx11-cu126-x86_64-linux/flash_attn2/__init__.py +26 -0
9. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/flash_attn_interface.py +29 -18
10. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/layers/__init__.py +0 -0
11. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/layers/patch_embed.py +0 -0
12. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/layers/rotary.py +0 -0
13. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +4 -0
14. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/__init__.py +0 -0
15. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/activations.py +0 -0
16. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/fused_dense.py +0 -0
17. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/layer_norm.py +0 -0
18. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/rms_norm.py +0 -0
19. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/__init__.py +0 -0
20. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/cross_entropy.py +0 -0
21. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/k_activations.py +0 -0
22. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/layer_norm.py +0 -0
23. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/linear.py +0 -0
24. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/mlp.py +0 -0
25. build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/rotary.py +2 -1
26. build/torch210-cxx11-cu128-x86_64-linux/__init__.py +393 -0
27. build/torch210-cxx11-cu128-x86_64-linux/_flash_attn2_588b404.abi3.so +3 -0
28. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/_ops.py +3 -3
29. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/bert_padding.py +0 -0
30. build/torch210-cxx11-cu128-x86_64-linux/flash_attn2/__init__.py +26 -0
31. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/flash_attn_interface.py +29 -18
32. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/layers/__init__.py +0 -0
33. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/layers/patch_embed.py +0 -0
34. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/layers/rotary.py +0 -0
35. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +4 -0
36. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/__init__.py +0 -0
37. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/activations.py +0 -0
38. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/fused_dense.py +0 -0
39. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/layer_norm.py +0 -0
40. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/rms_norm.py +0 -0
41. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/triton/__init__.py +0 -0
42. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/triton/cross_entropy.py +0 -0
43. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/triton/k_activations.py +0 -0
44. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/triton/layer_norm.py +0 -0
45. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/triton/linear.py +0 -0
46. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/triton/mlp.py +0 -0
47. build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/ops/triton/rotary.py +2 -1
48. build/torch210-cxx11-cu130-x86_64-linux/__init__.py +393 -0
49. build/torch210-cxx11-cu130-x86_64-linux/_flash_attn2_588b404.abi3.so +3 -0
50. build/{torch28-cxx11-cu129-x86_64-linux/flash_attn2 → torch210-cxx11-cu130-x86_64-linux}/_ops.py +3 -3
build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_5e9f49f.abi3.so → _flash_attn2_588b404.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f51f42997f5a6c02f137bfba9add39400e48486aaed79a78ec5081215c487e3
+oid sha256:1d90d30dbcf574c7a50f2c9774884370e71e1e177062c6a233fcc7e1940fffcb
 size 249504
build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_5e9f49f
-ops = torch.ops._flash_attn2_5e9f49f
+from . import _flash_attn2_588b404
+ops = torch.ops._flash_attn2_588b404
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_5e9f49f::{op_name}"
+    return f"_flash_attn2_588b404::{op_name}"
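The only substantive change in `_ops.py` is the build hash embedded in the extension module name. A small sketch of how the namespace helper is typically used from inside the built package (the op name `fwd` is illustrative, not taken from this diff):

import torch
from ._ops import ops, add_op_namespace_prefix  # only importable from within the package

# The helper yields the fully qualified name under which the C++ extension
# registered its ops, e.g. "_flash_attn2_588b404::fwd".
qualified_name = add_op_namespace_prefix("fwd")

# `ops` is torch.ops._flash_attn2_588b404, so ops.fwd resolves to the compiled kernel.
compiled_fwd = ops.fwd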
build/torch210-cxx11-cpu-x86_64-linux/metadata.json CHANGED
@@ -1 +1,4 @@
-{"python-depends":[]}
+{
+  "version": 1,
+  "python-depends": []
+}
build/torch210-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,393 @@
from typing import Optional, List
import torch
from ._ops import ops as flash_attn_ops
from .flash_attn_interface import (
    flash_attn_func,
    flash_attn_kvpacked_func,
    flash_attn_qkvpacked_func,
    flash_attn_varlen_func,
    flash_attn_varlen_kvpacked_func,
    flash_attn_varlen_qkvpacked_func,
    flash_attn_with_kvcache,
)


def fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    return_softmax: bool = False,
    gen: Optional[torch.Generator] = None,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Optional output tensor, same shape as q
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        return_softmax: Whether to return softmax weights
        gen: Optional random number generator

    Returns:
        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.fwd(
        q,
        k,
        v,
        out,
        alibi_slopes,
        p_dropout,
        softmax_scale,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        return_softmax,
        gen,
    )


def varlen_fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    seqused_k: Optional[torch.Tensor] = None,
    leftpad_k: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    max_seqlen_q: int = 0,
    max_seqlen_k: int = 0,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    zero_tensors: bool = False,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    return_softmax: bool = False,
    gen: Optional[torch.Generator] = None,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention with variable sequence lengths.

    Args:
        q: Query tensor of shape [total_q, num_heads, head_size]
        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
        out: Optional output tensor of shape [total_q, num_heads, head_size]
        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
        leftpad_k: Optional left padding for keys of shape [batch_size]
        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        max_seqlen_q: Maximum sequence length for queries
        max_seqlen_k: Maximum sequence length for keys
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        zero_tensors: Whether to zero tensors before computation
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        return_softmax: Whether to return softmax weights
        gen: Optional random number generator

    Returns:
        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.varlen_fwd(
        q,
        k,
        v,
        out,
        cu_seqlens_q,
        cu_seqlens_k,
        seqused_k,
        leftpad_k,
        block_table,
        alibi_slopes,
        max_seqlen_q,
        max_seqlen_k,
        p_dropout,
        softmax_scale,
        zero_tensors,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        return_softmax,
        gen,
    )


def bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    dq: Optional[torch.Tensor] = None,
    dk: Optional[torch.Tensor] = None,
    dv: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    deterministic: bool = False,
    gen: Optional[torch.Generator] = None,
    rng_state: Optional[torch.Tensor] = None,
) -> List[torch.Tensor]:
    """
    Backward pass for multi-head attention.

    Args:
        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
        dq: Optional gradient tensor for queries, same shape as q
        dk: Optional gradient tensor for keys, same shape as k
        dv: Optional gradient tensor for values, same shape as v
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        deterministic: Whether to use deterministic algorithms
        gen: Optional random number generator
        rng_state: Optional RNG state from forward pass

    Returns:
        List of tensors: [dq, dk, dv]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.bwd(
        dout,
        q,
        k,
        v,
        out,
        softmax_lse,
        dq,
        dk,
        dv,
        alibi_slopes,
        p_dropout,
        softmax_scale,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        deterministic,
        gen,
        rng_state,
    )


def varlen_bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    dq: Optional[torch.Tensor] = None,
    dk: Optional[torch.Tensor] = None,
    dv: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    max_seqlen_q: int = 0,
    max_seqlen_k: int = 0,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    zero_tensors: bool = False,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    deterministic: bool = False,
    gen: Optional[torch.Generator] = None,
    rng_state: Optional[torch.Tensor] = None,
) -> List[torch.Tensor]:
    """
    Backward pass for multi-head attention with variable sequence lengths.

    Args:
        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
        dq: Optional gradient tensor for queries, same shape as q
        dk: Optional gradient tensor for keys, same shape as k
        dv: Optional gradient tensor for values, same shape as v
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        max_seqlen_q: Maximum sequence length for queries
        max_seqlen_k: Maximum sequence length for keys
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        zero_tensors: Whether to zero tensors before computation
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        deterministic: Whether to use deterministic algorithms
        gen: Optional random number generator
        rng_state: Optional RNG state from forward pass

    Returns:
        List of tensors: [dq, dk, dv]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.varlen_bwd(
        dout,
        q,
        k,
        v,
        out,
        softmax_lse,
        dq,
        dk,
        dv,
        cu_seqlens_q,
        cu_seqlens_k,
        alibi_slopes,
        max_seqlen_q,
        max_seqlen_k,
        p_dropout,
        softmax_scale,
        zero_tensors,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        deterministic,
        gen,
        rng_state,
    )


def fwd_kvcache(
    q: torch.Tensor,
    kcache: torch.Tensor,
    vcache: torch.Tensor,
    k: Optional[torch.Tensor] = None,
    v: Optional[torch.Tensor] = None,
    seqlens_k: Optional[torch.Tensor] = None,
    rotary_cos: Optional[torch.Tensor] = None,
    rotary_sin: Optional[torch.Tensor] = None,
    cache_batch_idx: Optional[torch.Tensor] = None,
    leftpad_k: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    out: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    is_rotary_interleaved: bool = False,
    num_splits: int = 1,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention with KV cache.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
        cache_batch_idx: Optional indices to index into the KV cache
        leftpad_k: Optional left padding for keys of shape [batch_size]
        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        out: Optional output tensor, same shape as q
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        is_rotary_interleaved: Whether rotary embeddings are interleaved
        num_splits: Number of splits for computation

    Returns:
        List of tensors: [output, softmax_lse]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.fwd_kvcache(
        q,
        kcache,
        vcache,
        k,
        v,
        seqlens_k,
        rotary_cos,
        rotary_sin,
        cache_batch_idx,
        leftpad_k,
        block_table,
        alibi_slopes,
        out,
        softmax_scale,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        is_rotary_interleaved,
        num_splits,
    )
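The wrappers above only supply the default softmax scale (1/sqrt(head_dim)) before forwarding every argument to the compiled ops. A short usage sketch, assuming the package is importable as `flash_attn2` (for example after loading it through the `kernels` library) and a CUDA device is present:

import torch
import flash_attn2  # assumed import name for this build

batch, seqlen, heads, head_dim = 2, 256, 8, 64
q = torch.randn(batch, seqlen, heads, head_dim, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# softmax_scale is omitted, so fwd() uses 1.0 / head_dim ** 0.5 (0.125 for head_dim 64).
out, softmax_lse, *rest = flash_attn2.fwd(q, k, v, is_causal=True)
print(out.shape)  # (2, 256, 8, 64)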
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2/_flash_attn_9e27194.abi3.so → torch210-cxx11-cu126-x86_64-linux/_flash_attn2_588b404.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b567f9d044f6ca11f5a5fa2ba6d0fdb7573b7abcfe8d6ef875df44703ed020e1
-size 448643576
+oid sha256:247ade2063814573447dcb697fd39e738bcf5f0f5d40ac87eaf6cf6dba29298f
+size 448708992
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/_ops.py RENAMED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_9e27194
-ops = torch.ops._flash_attn_9e27194
+from . import _flash_attn2_588b404
+ops = torch.ops._flash_attn2_588b404
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_9e27194::{op_name}"
+    return f"_flash_attn2_588b404::{op_name}"
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/bert_padding.py RENAMED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/flash_attn2/__init__.py ADDED
@@ -0,0 +1,26 @@
import ctypes
import sys

import importlib
from pathlib import Path
from types import ModuleType

def _import_from_path(file_path: Path) -> ModuleType:
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module


globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/flash_attn_interface.py RENAMED
@@ -10,12 +10,12 @@ import os
 # # We need to import the CUDA kernels after importing torch
 # USE_TRITON_ROCM = os.getenv("FLASH_ATTENTION_TRITON_AMD_ENABLE", "FALSE") == "TRUE"
 # if USE_TRITON_ROCM:
-#     from .flash_attn_triton_amd import interface_fa as flash_attn_gpu
+#     from .flash_attn_triton_amd import interface_fa as flash_attn
 # else:
-#     import flash_attn_2_cuda as flash_attn_gpu
+#     import flash_attn_2_cuda as flash_attn
 
 
-from ._ops import ops as flash_attn_gpu
+from ._ops import ops as flash_attn
 
 # # isort: on
 
@@ -23,6 +23,17 @@ def maybe_contiguous(x):
     return x.contiguous() if x is not None and x.stride(-1) != 1 else x
 
 
+def _get_device():
+    if torch.xpu.is_available():
+        return "xpu"
+    elif torch.cuda.is_available():
+        return "cuda"
+    else:
+        return "cpu"
+
+_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
+
+
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
     assert head_dim <= 256
@@ -76,7 +87,7 @@ else:
     _torch_register_fake_wrapper = noop_register_fake_wrapper
 
 
-@_torch_custom_op_wrapper("flash_attn::_flash_attn_forward", mutates_args=(), device_types="cuda")
+@_torch_custom_op_wrapper("flash_attn::_flash_attn_forward", mutates_args=(), device_types=_get_device())
 def _flash_attn_forward(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -91,7 +102,7 @@ def _flash_attn_forward(
     return_softmax: bool
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
-    out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.fwd(
+    out, softmax_lse, S_dmask, rng_state = flash_attn.fwd(
         q,
         k,
         v,
@@ -142,7 +153,7 @@ else:
     _wrapped_flash_attn_forward = _flash_attn_forward
 
 
-@_torch_custom_op_wrapper("flash_attn::_flash_attn_varlen_forward", mutates_args=(), device_types="cuda")
+@_torch_custom_op_wrapper("flash_attn::_flash_attn_varlen_forward", mutates_args=(), device_types=_get_device())
 def _flash_attn_varlen_forward(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -165,7 +176,7 @@ def _flash_attn_varlen_forward(
     zero_tensors: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
-    out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.varlen_fwd(
+    out, softmax_lse, S_dmask, rng_state = flash_attn.varlen_fwd(
        q,
        k,
        v,
@@ -237,7 +248,7 @@ else:
     _wrapped_flash_attn_varlen_forward = _flash_attn_varlen_forward
 
 
-@_torch_custom_op_wrapper("flash_attn::_flash_attn_backward", mutates_args=("dq", "dk", "dv"), device_types="cuda")
+@_torch_custom_op_wrapper("flash_attn::_flash_attn_backward", mutates_args=("dq", "dk", "dv"), device_types=_get_device())
 def _flash_attn_backward(
     dout: torch.Tensor,
     q: torch.Tensor,
@@ -265,7 +276,7 @@ def _flash_attn_backward(
         dk,
         dv,
         softmax_d,
-    ) = flash_attn_gpu.bwd(
+    ) = flash_attn.bwd(
         dout,
         q,
         k,
@@ -329,7 +340,7 @@ else:
     _wrapped_flash_attn_backward = _flash_attn_backward
 
 
-@_torch_custom_op_wrapper("flash_attn::_flash_attn_varlen_backward", mutates_args=("dq", "dk", "dv"), device_types="cuda")
+@_torch_custom_op_wrapper("flash_attn::_flash_attn_varlen_backward", mutates_args=("dq", "dk", "dv"), device_types=_get_device())
 def _flash_attn_varlen_backward(
     dout: torch.Tensor,
     q: torch.Tensor,
@@ -362,7 +373,7 @@ def _flash_attn_varlen_backward(
         dk,
         dv,
         softmax_d,
-    ) = flash_attn_gpu.varlen_bwd(
+    ) = flash_attn.varlen_bwd(
         dout,
         q,
         k,
@@ -1053,7 +1064,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        torch.is_grad_enabled(),
+        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
     )
 
 
@@ -1131,7 +1142,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        torch.is_grad_enabled(),
+        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
     )
 
 
@@ -1208,7 +1219,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        torch.is_grad_enabled(),
+        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
     )
 
 
@@ -1274,7 +1285,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        torch.is_grad_enabled(),
+        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
     )
 
 
@@ -1366,7 +1377,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        torch.is_grad_enabled(),
+        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
    )
 
 
@@ -1460,7 +1471,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        torch.is_grad_enabled(),
+        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
     )
 
 
@@ -1584,7 +1595,7 @@ def flash_attn_with_kvcache(
     cache_seqlens = maybe_contiguous(cache_seqlens)
     cache_batch_idx = maybe_contiguous(cache_batch_idx)
     block_table = maybe_contiguous(block_table)
-    out, softmax_lse = flash_attn_gpu.fwd_kvcache(
+    out, softmax_lse = flash_attn.fwd_kvcache(
         q,
         k_cache,
         v_cache,
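The recurring edit in this file swaps the hard-coded device_types="cuda" for a value computed at import time, so a single registration covers CUDA, XPU and CPU builds, and it forces the final is_grad_enabled argument to False on XPU, where the backward pass is not yet supported. A minimal sketch of the device_types pattern in isolation, assuming PyTorch 2.4+ where torch.library.custom_op is available (the op below is hypothetical and not part of this repository):

import torch

def _pick_device_type() -> str:
    # Same preference order as the _get_device() helper added in this diff.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

@torch.library.custom_op("demo::scaled_copy", mutates_args=(), device_types=_pick_device_type())
def scaled_copy(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Registered only for the device type detected at import time.
    return (x * scale).contiguous()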
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/layers/__init__.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/layers/patch_embed.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/layers/rotary.py RENAMED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,4 @@
{
  "version": 1,
  "python-depends": []
}
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/__init__.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/activations.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/fused_dense.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/layer_norm.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/rms_norm.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/__init__.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/cross_entropy.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/k_activations.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/layer_norm.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/linear.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/mlp.py RENAMED
File without changes
build/{torch28-cxx11-cu126-x86_64-linux/flash_attn2 → torch210-cxx11-cu126-x86_64-linux}/ops/triton/rotary.py RENAMED
@@ -155,7 +155,8 @@ def apply_rotary(
 
     # Need this, otherwise Triton tries to launch from cuda:0 and we get
     # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
-    with torch.cuda.device(x.device.index):
+    device_ctx = torch.cuda.device(x.device.index) if x.device.type == 'cuda' else torch.xpu.device(x.device.index)
+    with device_ctx:
         torch.library.wrap_triton(rotary_kernel)[grid](
             output,  # data ptrs
             x,
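The rotary change generalizes the device guard around the Triton launch so it also works on XPU. A defensive variant of the same idea, shown only as an illustration (the nullcontext fallback is not part of the diff):

import contextlib
import torch

def launch_ctx(t: torch.Tensor):
    # Triton must launch with the tensor's own device active, otherwise it
    # rejects pointers that live on a different device.
    if t.device.type == "cuda":
        return torch.cuda.device(t.device.index)
    if t.device.type == "xpu" and hasattr(torch, "xpu"):
        return torch.xpu.device(t.device.index)
    return contextlib.nullcontext()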
build/torch210-cxx11-cu128-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,393 @@
(Contents identical to build/torch210-cxx11-cu126-x86_64-linux/__init__.py shown above.)
build/torch210-cxx11-cu128-x86_64-linux/_flash_attn2_588b404.abi3.so ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09cfe096dc8f0010e99225d44263e4d9172d4b542d48d656b3b9fd718ca55b7d
size 1037803376
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/_ops.py RENAMED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn_9e27194
-ops = torch.ops._flash_attn_9e27194
+from . import _flash_attn2_588b404
+ops = torch.ops._flash_attn2_588b404
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn_9e27194::{op_name}"
+    return f"_flash_attn2_588b404::{op_name}"
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/bert_padding.py RENAMED
File without changes
build/torch210-cxx11-cu128-x86_64-linux/flash_attn2/__init__.py ADDED
@@ -0,0 +1,26 @@
(Contents identical to build/torch210-cxx11-cu126-x86_64-linux/flash_attn2/__init__.py shown above.)
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 → torch210-cxx11-cu128-x86_64-linux}/flash_attn_interface.py RENAMED
(Same changes as the torch210-cxx11-cu126-x86_64-linux copy of flash_attn_interface.py shown above.)
  )
1224
 
1225
 
 
1285
  alibi_slopes,
1286
  deterministic,
1287
  return_attn_probs,
1288
+ False if _XPU_AVAILABLE else torch.is_grad_enabled(),
1289
  )
1290
 
1291
 
 
1377
  alibi_slopes,
1378
  deterministic,
1379
  return_attn_probs,
1380
+ False if _XPU_AVAILABLE else torch.is_grad_enabled(),
1381
  )
1382
 
1383
 
 
1471
  deterministic,
1472
  return_attn_probs,
1473
  block_table,
1474
+ False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
1475
  )
1476
 
1477
 
 
1595
  cache_seqlens = maybe_contiguous(cache_seqlens)
1596
  cache_batch_idx = maybe_contiguous(cache_batch_idx)
1597
  block_table = maybe_contiguous(block_table)
1598
+ out, softmax_lse = flash_attn.fwd_kvcache(
1599
  q,
1600
  k_cache,
1601
  v_cache,
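
The flash_attn_interface.py hunks above swap the hard-coded device_types="cuda" for a runtime _get_device() lookup and gate torch.is_grad_enabled() behind _XPU_AVAILABLE, since the backward pass is not yet wired up on XPU. A minimal stand-alone sketch of that dispatch pattern (pick_device_type and wants_grad are illustrative names, not part of this build):

import torch

def pick_device_type() -> str:
    # Prefer XPU when the torch build exposes it, then CUDA, else CPU.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

def wants_grad() -> bool:
    # XPU has no backward kernel yet, so always take the inference-only path there.
    if pick_device_type() == "xpu":
        return False
    return torch.is_grad_enabled()
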
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/layers/__init__.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/layers/patch_embed.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/layers/rotary.py RENAMED
File without changes
build/torch210-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "version": 1,
3
+ "python-depends": []
4
+ }
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/__init__.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/activations.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/fused_dense.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/layer_norm.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/rms_norm.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/triton/__init__.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/triton/cross_entropy.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/triton/k_activations.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/triton/layer_norm.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/triton/linear.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/triton/mlp.py RENAMED
File without changes
build/{torch28-cxx11-cu128-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu128-x86_64-linux}/ops/triton/rotary.py RENAMED
@@ -155,7 +155,8 @@ def apply_rotary(
155
 
156
  # Need this, otherwise Triton tries to launch from cuda:0 and we get
157
  # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
158
- with torch.cuda.device(x.device.index):
 
159
  torch.library.wrap_triton(rotary_kernel)[grid](
160
  output, # data ptrs
161
  x,
 
155
 
156
  # Need this, otherwise Triton tries to launch from cuda:0 and we get
157
  # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
158
+ device_ctx = torch.cuda.device(x.device.index) if x.device.type == 'cuda' else torch.xpu.device(x.device.index)
159
+ with device_ctx:
160
  torch.library.wrap_triton(rotary_kernel)[grid](
161
  output, # data ptrs
162
  x,
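
The rotary.py hunk above derives the device context manager from the input tensor instead of assuming CUDA, so the Triton launch does not implicitly target cuda:0 (and does not fail outright on XPU). A small sketch of the same selection with the kernel launch elided (device_context is an illustrative helper, not code shipped in the build):

import torch

def device_context(x: torch.Tensor):
    # Triton launches on the current device; derive it from the tensor's device.
    if x.device.type == "cuda":
        return torch.cuda.device(x.device.index)
    if x.device.type == "xpu":
        return torch.xpu.device(x.device.index)
    raise ValueError(f"unsupported device type for Triton launch: {x.device.type}")

# with device_context(x):
#     ... launch the Triton rotary kernel here ...
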
build/torch210-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,393 @@
1
+ from typing import Optional, List
2
+ import torch
3
+ from ._ops import ops as flash_attn_ops
4
+ from .flash_attn_interface import (
5
+ flash_attn_func,
6
+ flash_attn_kvpacked_func,
7
+ flash_attn_qkvpacked_func,
8
+ flash_attn_varlen_func,
9
+ flash_attn_varlen_kvpacked_func,
10
+ flash_attn_varlen_qkvpacked_func,
11
+ flash_attn_with_kvcache,
12
+ )
13
+
14
+
15
+ def fwd(
16
+ q: torch.Tensor,
17
+ k: torch.Tensor,
18
+ v: torch.Tensor,
19
+ out: Optional[torch.Tensor] = None,
20
+ alibi_slopes: Optional[torch.Tensor] = None,
21
+ p_dropout: float = 0.0,
22
+ softmax_scale: Optional[float] = None,
23
+ is_causal: bool = False,
24
+ window_size_left: int = -1,
25
+ window_size_right: int = -1,
26
+ softcap: float = 0.0,
27
+ return_softmax: bool = False,
28
+ gen: Optional[torch.Generator] = None,
29
+ ) -> List[torch.Tensor]:
30
+ """
31
+ Forward pass for multi-head attention.
32
+
33
+ Args:
34
+ q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
35
+ k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
36
+ v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
37
+ out: Optional output tensor, same shape as q
38
+ alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
39
+ p_dropout: Dropout probability
40
+ softmax_scale: Scale factor for softmax
41
+ is_causal: Whether to use causal attention
42
+ window_size_left: Window size for left context (-1 for unlimited)
43
+ window_size_right: Window size for right context (-1 for unlimited)
44
+ softcap: Soft cap for attention weights
45
+ return_softmax: Whether to return softmax weights
46
+ gen: Optional random number generator
47
+
48
+ Returns:
49
+ List of tensors: [output, softmax_lse, (softmax if return_softmax)]
50
+ """
51
+ if softmax_scale is None:
52
+ attention_head_dim = q.shape[-1]
53
+ softmax_scale = 1.0 / (attention_head_dim**0.5)
54
+
55
+ return flash_attn_ops.fwd(
56
+ q,
57
+ k,
58
+ v,
59
+ out,
60
+ alibi_slopes,
61
+ p_dropout,
62
+ softmax_scale,
63
+ is_causal,
64
+ window_size_left,
65
+ window_size_right,
66
+ softcap,
67
+ return_softmax,
68
+ gen,
69
+ )
70
+
71
+
72
+ def varlen_fwd(
73
+ q: torch.Tensor,
74
+ k: torch.Tensor,
75
+ v: torch.Tensor,
76
+ cu_seqlens_q: torch.Tensor,
77
+ cu_seqlens_k: torch.Tensor,
78
+ out: Optional[torch.Tensor] = None,
79
+ seqused_k: Optional[torch.Tensor] = None,
80
+ leftpad_k: Optional[torch.Tensor] = None,
81
+ block_table: Optional[torch.Tensor] = None,
82
+ alibi_slopes: Optional[torch.Tensor] = None,
83
+ max_seqlen_q: int = 0,
84
+ max_seqlen_k: int = 0,
85
+ p_dropout: float = 0.0,
86
+ softmax_scale: Optional[float] = None,
87
+ zero_tensors: bool = False,
88
+ is_causal: bool = False,
89
+ window_size_left: int = -1,
90
+ window_size_right: int = -1,
91
+ softcap: float = 0.0,
92
+ return_softmax: bool = False,
93
+ gen: Optional[torch.Generator] = None,
94
+ ) -> List[torch.Tensor]:
95
+ """
96
+ Forward pass for multi-head attention with variable sequence lengths.
97
+
98
+ Args:
99
+ q: Query tensor of shape [total_q, num_heads, head_size]
100
+ k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
101
+ v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
102
+ cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
103
+ cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
104
+ out: Optional output tensor of shape [total_q, num_heads, head_size]
105
+ seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
106
+ leftpad_k: Optional left padding for keys of shape [batch_size]
107
+ block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
108
+ alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
109
+ max_seqlen_q: Maximum sequence length for queries
110
+ max_seqlen_k: Maximum sequence length for keys
111
+ p_dropout: Dropout probability
112
+ softmax_scale: Scale factor for softmax
113
+ zero_tensors: Whether to zero tensors before computation
114
+ is_causal: Whether to use causal attention
115
+ window_size_left: Window size for left context (-1 for unlimited)
116
+ window_size_right: Window size for right context (-1 for unlimited)
117
+ softcap: Soft cap for attention weights
118
+ return_softmax: Whether to return softmax weights
119
+ gen: Optional random number generator
120
+
121
+ Returns:
122
+ List of tensors: [output, softmax_lse, (softmax if return_softmax)]
123
+ """
124
+ if softmax_scale is None:
125
+ attention_head_dim = q.shape[-1]
126
+ softmax_scale = 1.0 / (attention_head_dim**0.5)
127
+
128
+ return flash_attn_ops.varlen_fwd(
129
+ q,
130
+ k,
131
+ v,
132
+ out,
133
+ cu_seqlens_q,
134
+ cu_seqlens_k,
135
+ seqused_k,
136
+ leftpad_k,
137
+ block_table,
138
+ alibi_slopes,
139
+ max_seqlen_q,
140
+ max_seqlen_k,
141
+ p_dropout,
142
+ softmax_scale,
143
+ zero_tensors,
144
+ is_causal,
145
+ window_size_left,
146
+ window_size_right,
147
+ softcap,
148
+ return_softmax,
149
+ gen,
150
+ )
151
+
152
+
153
+ def bwd(
154
+ dout: torch.Tensor,
155
+ q: torch.Tensor,
156
+ k: torch.Tensor,
157
+ v: torch.Tensor,
158
+ out: torch.Tensor,
159
+ softmax_lse: torch.Tensor,
160
+ dq: Optional[torch.Tensor] = None,
161
+ dk: Optional[torch.Tensor] = None,
162
+ dv: Optional[torch.Tensor] = None,
163
+ alibi_slopes: Optional[torch.Tensor] = None,
164
+ p_dropout: float = 0.0,
165
+ softmax_scale: Optional[float] = None,
166
+ is_causal: bool = False,
167
+ window_size_left: int = -1,
168
+ window_size_right: int = -1,
169
+ softcap: float = 0.0,
170
+ deterministic: bool = False,
171
+ gen: Optional[torch.Generator] = None,
172
+ rng_state: Optional[torch.Tensor] = None,
173
+ ) -> List[torch.Tensor]:
174
+ """
175
+ Backward pass for multi-head attention.
176
+
177
+ Args:
178
+ dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
179
+ q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
180
+ k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
181
+ v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
182
+ out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
183
+ softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
184
+ dq: Optional gradient tensor for queries, same shape as q
185
+ dk: Optional gradient tensor for keys, same shape as k
186
+ dv: Optional gradient tensor for values, same shape as v
187
+ alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
188
+ p_dropout: Dropout probability
189
+ softmax_scale: Scale factor for softmax
190
+ is_causal: Whether to use causal attention
191
+ window_size_left: Window size for left context (-1 for unlimited)
192
+ window_size_right: Window size for right context (-1 for unlimited)
193
+ softcap: Soft cap for attention weights
194
+ deterministic: Whether to use deterministic algorithms
195
+ gen: Optional random number generator
196
+ rng_state: Optional RNG state from forward pass
197
+
198
+ Returns:
199
+ List of tensors: [dq, dk, dv]
200
+ """
201
+ if softmax_scale is None:
202
+ attention_head_dim = q.shape[-1]
203
+ softmax_scale = 1.0 / (attention_head_dim**0.5)
204
+
205
+ return flash_attn_ops.bwd(
206
+ dout,
207
+ q,
208
+ k,
209
+ v,
210
+ out,
211
+ softmax_lse,
212
+ dq,
213
+ dk,
214
+ dv,
215
+ alibi_slopes,
216
+ p_dropout,
217
+ softmax_scale,
218
+ is_causal,
219
+ window_size_left,
220
+ window_size_right,
221
+ softcap,
222
+ deterministic,
223
+ gen,
224
+ rng_state,
225
+ )
226
+
227
+
228
+ def varlen_bwd(
229
+ dout: torch.Tensor,
230
+ q: torch.Tensor,
231
+ k: torch.Tensor,
232
+ v: torch.Tensor,
233
+ out: torch.Tensor,
234
+ softmax_lse: torch.Tensor,
235
+ cu_seqlens_q: torch.Tensor,
236
+ cu_seqlens_k: torch.Tensor,
237
+ dq: Optional[torch.Tensor] = None,
238
+ dk: Optional[torch.Tensor] = None,
239
+ dv: Optional[torch.Tensor] = None,
240
+ alibi_slopes: Optional[torch.Tensor] = None,
241
+ max_seqlen_q: int = 0,
242
+ max_seqlen_k: int = 0,
243
+ p_dropout: float = 0.0,
244
+ softmax_scale: Optional[float] = None,
245
+ zero_tensors: bool = False,
246
+ is_causal: bool = False,
247
+ window_size_left: int = -1,
248
+ window_size_right: int = -1,
249
+ softcap: float = 0.0,
250
+ deterministic: bool = False,
251
+ gen: Optional[torch.Generator] = None,
252
+ rng_state: Optional[torch.Tensor] = None,
253
+ ) -> List[torch.Tensor]:
254
+ """
255
+ Backward pass for multi-head attention with variable sequence lengths.
256
+
257
+ Args:
258
+ dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
259
+ q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
260
+ k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
261
+ v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
262
+ out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
263
+ softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
264
+ cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
265
+ cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
266
+ dq: Optional gradient tensor for queries, same shape as q
267
+ dk: Optional gradient tensor for keys, same shape as k
268
+ dv: Optional gradient tensor for values, same shape as v
269
+ alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
270
+ max_seqlen_q: Maximum sequence length for queries
271
+ max_seqlen_k: Maximum sequence length for keys
272
+ p_dropout: Dropout probability
273
+ softmax_scale: Scale factor for softmax
274
+ zero_tensors: Whether to zero tensors before computation
275
+ is_causal: Whether to use causal attention
276
+ window_size_left: Window size for left context (-1 for unlimited)
277
+ window_size_right: Window size for right context (-1 for unlimited)
278
+ softcap: Soft cap for attention weights
279
+ deterministic: Whether to use deterministic algorithms
280
+ gen: Optional random number generator
281
+ rng_state: Optional RNG state from forward pass
282
+
283
+ Returns:
284
+ List of tensors: [dq, dk, dv]
285
+ """
286
+ if softmax_scale is None:
287
+ attention_head_dim = q.shape[-1]
288
+ softmax_scale = 1.0 / (attention_head_dim**0.5)
289
+
290
+ return flash_attn_ops.varlen_bwd(
291
+ dout,
292
+ q,
293
+ k,
294
+ v,
295
+ out,
296
+ softmax_lse,
297
+ dq,
298
+ dk,
299
+ dv,
300
+ cu_seqlens_q,
301
+ cu_seqlens_k,
302
+ alibi_slopes,
303
+ max_seqlen_q,
304
+ max_seqlen_k,
305
+ p_dropout,
306
+ softmax_scale,
307
+ zero_tensors,
308
+ is_causal,
309
+ window_size_left,
310
+ window_size_right,
311
+ softcap,
312
+ deterministic,
313
+ gen,
314
+ rng_state,
315
+ )
316
+
317
+
318
+ def fwd_kvcache(
319
+ q: torch.Tensor,
320
+ kcache: torch.Tensor,
321
+ vcache: torch.Tensor,
322
+ k: Optional[torch.Tensor] = None,
323
+ v: Optional[torch.Tensor] = None,
324
+ seqlens_k: Optional[torch.Tensor] = None,
325
+ rotary_cos: Optional[torch.Tensor] = None,
326
+ rotary_sin: Optional[torch.Tensor] = None,
327
+ cache_batch_idx: Optional[torch.Tensor] = None,
328
+ leftpad_k: Optional[torch.Tensor] = None,
329
+ block_table: Optional[torch.Tensor] = None,
330
+ alibi_slopes: Optional[torch.Tensor] = None,
331
+ out: Optional[torch.Tensor] = None,
332
+ softmax_scale: Optional[float] = None,
333
+ is_causal: bool = False,
334
+ window_size_left: int = -1,
335
+ window_size_right: int = -1,
336
+ softcap: float = 0.0,
337
+ is_rotary_interleaved: bool = False,
338
+ num_splits: int = 1,
339
+ ) -> List[torch.Tensor]:
340
+ """
341
+ Forward pass for multi-head attention with KV cache.
342
+
343
+ Args:
344
+ q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
345
+ kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
346
+ vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
347
+ k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
348
+ v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
349
+ seqlens_k: Optional sequence lengths for keys of shape [batch_size]
350
+ rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
351
+ rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
352
+ cache_batch_idx: Optional indices to index into the KV cache
353
+ leftpad_k: Optional left padding for keys of shape [batch_size]
354
+ block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
355
+ alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
356
+ out: Optional output tensor, same shape as q
357
+ softmax_scale: Scale factor for softmax
358
+ is_causal: Whether to use causal attention
359
+ window_size_left: Window size for left context (-1 for unlimited)
360
+ window_size_right: Window size for right context (-1 for unlimited)
361
+ softcap: Soft cap for attention weights
362
+ is_rotary_interleaved: Whether rotary embeddings are interleaved
363
+ num_splits: Number of splits for computation
364
+
365
+ Returns:
366
+ List of tensors: [output, softmax_lse]
367
+ """
368
+ if softmax_scale is None:
369
+ attention_head_dim = q.shape[-1]
370
+ softmax_scale = 1.0 / (attention_head_dim**0.5)
371
+
372
+ return flash_attn_ops.fwd_kvcache(
373
+ q,
374
+ kcache,
375
+ vcache,
376
+ k,
377
+ v,
378
+ seqlens_k,
379
+ rotary_cos,
380
+ rotary_sin,
381
+ cache_batch_idx,
382
+ leftpad_k,
383
+ block_table,
384
+ alibi_slopes,
385
+ out,
386
+ softmax_scale,
387
+ is_causal,
388
+ window_size_left,
389
+ window_size_right,
390
+ softcap,
391
+ is_rotary_interleaved,
392
+ num_splits,
393
+ )
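
The added __init__.py exposes keyword-friendly wrappers (fwd, varlen_fwd, bwd, varlen_bwd, fwd_kvcache) around the compiled ops and fills in softmax_scale as 1/sqrt(head_size) when it is omitted. A rough usage sketch of the fwd wrapper, assuming this build is importable as flash_attn2 and a CUDA device is available; shapes and the import name are illustrative only:

import torch
import flash_attn2  # assumed import name for this kernel build

batch, seqlen, heads, head_dim = 2, 128, 8, 64
q = torch.randn(batch, seqlen, heads, head_dim, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# softmax_scale defaults to 1 / sqrt(head_dim) because it is omitted here.
out, softmax_lse, *rest = flash_attn2.fwd(q, k, v, is_causal=True)
print(out.shape)  # torch.Size([2, 128, 8, 64])
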
build/torch210-cxx11-cu130-x86_64-linux/_flash_attn2_588b404.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:196d3756a7d099f5e23ddd53ebc47aadf558a96e1d7873f5a14faec09bb7b707
3
+ size 1009055064
build/{torch28-cxx11-cu129-x86_64-linux/flash_attn2 β†’ torch210-cxx11-cu130-x86_64-linux}/_ops.py RENAMED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn_9e27194
3
- ops = torch.ops._flash_attn_9e27194
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn_9e27194::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_588b404
3
+ ops = torch.ops._flash_attn2_588b404
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_588b404::{op_name}"
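
The renamed _ops.py re-exports the compiled extension's torch.ops namespace under the new build hash, and add_op_namespace_prefix builds fully qualified op names against that namespace. A hedged sketch of how it would typically be used from inside the package (the op name "fwd" comes from the interface above; this snippet is not part of the build):

from ._ops import ops, add_op_namespace_prefix  # only valid inside the package

# Fully qualified name, e.g. "_flash_attn2_588b404::fwd".
qualified_name = add_op_namespace_prefix("fwd")

# The same op is reachable directly through the re-exported namespace object.
fwd_op = ops.fwd
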