Build uploaded using `kernels`.
This view is limited to 50 files because it contains too many changes.
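For context, builds uploaded this way are normally consumed through the `kernels` Python package, which selects the build variant matching the local torch/XPU toolchain (e.g. torch28-cxx11-xpu20251-x86_64-linux) at load time. A minimal sketch, assuming the package's `get_kernel` entry point and using a placeholder repository id:

# Hypothetical usage sketch; the repository id below is a placeholder, not taken from this page.
from kernels import get_kernel

# get_kernel downloads the repository and picks the build directory matching
# the installed torch / XPU toolchain.
flash_attn = get_kernel("<org>/<flash-attn-kernel-repo>")

# The loaded module exposes the wrappers from the __init__.py files shown below,
# e.g. flash_attn.fwd or the re-exported flash_attn_func.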
- build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py +393 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2/_flash_attn2_870e782_dirty.abi3.so → _flash_attn2_5dab8ba_dirty.abi3.so} +2 -2
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2/_ops.py → _ops.py} +3 -3
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2/bert_padding.py → bert_padding.py} +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/flash_attn/__init__.py +393 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2/_flash_attn2_870e782_dirty.abi3.so → torch28-cxx11-xpu20251-x86_64-linux/flash_attn/_flash_attn_c984dd4_dirty.abi3.so} +2 -2
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux/flash_attn}/_ops.py +3 -3
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux/flash_attn}/bert_padding.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/flash_attn_interface.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/layers/__init__.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/layers/patch_embed.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/layers/rotary.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/__init__.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/activations.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/fused_dense.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/layer_norm.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/rms_norm.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/__init__.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/cross_entropy.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/k_activations.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/layer_norm.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/linear.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/mlp.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/rotary.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/flash_attn2/__init__.py +26 -393
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/flash_attn_interface.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/layers/__init__.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/layers/patch_embed.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/layers/rotary.py +0 -0
- build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json +1 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/__init__.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/activations.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/fused_dense.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/layer_norm.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/rms_norm.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/__init__.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/cross_entropy.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/k_activations.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/layer_norm.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/linear.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/mlp.py +0 -0
- build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/rotary.py +0 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py +393 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_5dab8ba_dirty.abi3.so +3 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +9 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/bert_padding.py +218 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/__init__.py +393 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/_flash_attn_c984dd4_dirty.abi3.so +3 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/_ops.py +9 -0
- build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/bert_padding.py +218 -0
build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py
ADDED
@@ -0,0 +1,393 @@
from typing import Optional, List
import torch
from ._ops import ops as flash_attn_ops
from .flash_attn_interface import (
    flash_attn_func,
    flash_attn_kvpacked_func,
    flash_attn_qkvpacked_func,
    flash_attn_varlen_func,
    flash_attn_varlen_kvpacked_func,
    flash_attn_varlen_qkvpacked_func,
    flash_attn_with_kvcache,
)


def fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    return_softmax: bool = False,
    gen: Optional[torch.Generator] = None,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Optional output tensor, same shape as q
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        return_softmax: Whether to return softmax weights
        gen: Optional random number generator

    Returns:
        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.fwd(
        q,
        k,
        v,
        out,
        alibi_slopes,
        p_dropout,
        softmax_scale,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        return_softmax,
        gen,
    )


def varlen_fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    seqused_k: Optional[torch.Tensor] = None,
    leftpad_k: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    max_seqlen_q: int = 0,
    max_seqlen_k: int = 0,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    zero_tensors: bool = False,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    return_softmax: bool = False,
    gen: Optional[torch.Generator] = None,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention with variable sequence lengths.

    Args:
        q: Query tensor of shape [total_q, num_heads, head_size]
        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
        out: Optional output tensor of shape [total_q, num_heads, head_size]
        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
        leftpad_k: Optional left padding for keys of shape [batch_size]
        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        max_seqlen_q: Maximum sequence length for queries
        max_seqlen_k: Maximum sequence length for keys
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        zero_tensors: Whether to zero tensors before computation
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        return_softmax: Whether to return softmax weights
        gen: Optional random number generator

    Returns:
        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.varlen_fwd(
        q,
        k,
        v,
        out,
        cu_seqlens_q,
        cu_seqlens_k,
        seqused_k,
        leftpad_k,
        block_table,
        alibi_slopes,
        max_seqlen_q,
        max_seqlen_k,
        p_dropout,
        softmax_scale,
        zero_tensors,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        return_softmax,
        gen,
    )


def bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    dq: Optional[torch.Tensor] = None,
    dk: Optional[torch.Tensor] = None,
    dv: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    deterministic: bool = False,
    gen: Optional[torch.Generator] = None,
    rng_state: Optional[torch.Tensor] = None,
) -> List[torch.Tensor]:
    """
    Backward pass for multi-head attention.

    Args:
        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
        dq: Optional gradient tensor for queries, same shape as q
        dk: Optional gradient tensor for keys, same shape as k
        dv: Optional gradient tensor for values, same shape as v
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        deterministic: Whether to use deterministic algorithms
        gen: Optional random number generator
        rng_state: Optional RNG state from forward pass

    Returns:
        List of tensors: [dq, dk, dv]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.bwd(
        dout,
        q,
        k,
        v,
        out,
        softmax_lse,
        dq,
        dk,
        dv,
        alibi_slopes,
        p_dropout,
        softmax_scale,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        deterministic,
        gen,
        rng_state,
    )


def varlen_bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    dq: Optional[torch.Tensor] = None,
    dk: Optional[torch.Tensor] = None,
    dv: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    max_seqlen_q: int = 0,
    max_seqlen_k: int = 0,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    zero_tensors: bool = False,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    deterministic: bool = False,
    gen: Optional[torch.Generator] = None,
    rng_state: Optional[torch.Tensor] = None,
) -> List[torch.Tensor]:
    """
    Backward pass for multi-head attention with variable sequence lengths.

    Args:
        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
        dq: Optional gradient tensor for queries, same shape as q
        dk: Optional gradient tensor for keys, same shape as k
        dv: Optional gradient tensor for values, same shape as v
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        max_seqlen_q: Maximum sequence length for queries
        max_seqlen_k: Maximum sequence length for keys
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        zero_tensors: Whether to zero tensors before computation
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        deterministic: Whether to use deterministic algorithms
        gen: Optional random number generator
        rng_state: Optional RNG state from forward pass

    Returns:
        List of tensors: [dq, dk, dv]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.varlen_bwd(
        dout,
        q,
        k,
        v,
        out,
        softmax_lse,
        dq,
        dk,
        dv,
        cu_seqlens_q,
        cu_seqlens_k,
        alibi_slopes,
        max_seqlen_q,
        max_seqlen_k,
        p_dropout,
        softmax_scale,
        zero_tensors,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        deterministic,
        gen,
        rng_state,
    )


def fwd_kvcache(
    q: torch.Tensor,
    kcache: torch.Tensor,
    vcache: torch.Tensor,
    k: Optional[torch.Tensor] = None,
    v: Optional[torch.Tensor] = None,
    seqlens_k: Optional[torch.Tensor] = None,
    rotary_cos: Optional[torch.Tensor] = None,
    rotary_sin: Optional[torch.Tensor] = None,
    cache_batch_idx: Optional[torch.Tensor] = None,
    leftpad_k: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    out: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    is_rotary_interleaved: bool = False,
    num_splits: int = 1,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention with KV cache.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
        cache_batch_idx: Optional indices to index into the KV cache
        leftpad_k: Optional left padding for keys of shape [batch_size]
        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        out: Optional output tensor, same shape as q
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        is_rotary_interleaved: Whether rotary embeddings are interleaved
        num_splits: Number of splits for computation

    Returns:
        List of tensors: [output, softmax_lse]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.fwd_kvcache(
        q,
        kcache,
        vcache,
        k,
        v,
        seqlens_k,
        rotary_cos,
        rotary_sin,
        cache_batch_idx,
        leftpad_k,
        block_table,
        alibi_slopes,
        out,
        softmax_scale,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        is_rotary_interleaved,
        num_splits,
    )
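For illustration, a minimal sketch of calling the fwd wrapper above on an XPU device; the shapes, dtype, and device placement are assumptions for the example, not taken from this diff, and the module is assumed to have been loaded so that fwd is in scope:

import torch

# Illustrative sizes only (assumptions, not from the diff).
batch, seqlen, heads, head_dim = 2, 128, 8, 64
q = torch.randn(batch, seqlen, heads, head_dim, device="xpu", dtype=torch.float16)
k = torch.randn(batch, seqlen, heads, head_dim, device="xpu", dtype=torch.float16)
v = torch.randn(batch, seqlen, heads, head_dim, device="xpu", dtype=torch.float16)

# softmax_scale defaults to 1/sqrt(head_dim) inside the wrapper; the first
# returned tensor is the attention output, the second the softmax log-sum-exp.
out, softmax_lse, *rest = fwd(q, k, v, is_causal=True)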
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2/_flash_attn2_870e782_dirty.abi3.so → _flash_attn2_5dab8ba_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f14e01c60f4a293eab27d1b34e072c8b6e37ca3a7e9cbd5b6a2bb83c195579bb
+size 8973288
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2/_ops.py → _ops.py}
RENAMED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _flash_attn2_5dab8ba_dirty
+ops = torch.ops._flash_attn2_5dab8ba_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_flash_attn2_5dab8ba_dirty::{op_name}"
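The renamed _ops.py simply re-points the Python side at the freshly built extension: importing the shared object registers its operators under torch.ops._flash_attn2_5dab8ba_dirty, and add_op_namespace_prefix builds fully qualified op names. A small sketch of how the helper behaves; the op name "fwd" and the torch.library call mentioned in the comment are illustrative assumptions, not taken from this repository:

# "fwd" is an assumed op name used only for illustration.
qualified = add_op_namespace_prefix("fwd")
# qualified == "_flash_attn2_5dab8ba_dirty::fwd"

# A fully qualified name of this form is what torch.library-style APIs expect,
# e.g. torch.library.register_fake(qualified, fake_impl)  # assumption, not from this diff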
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2/bert_padding.py → bert_padding.py}
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/flash_attn/__init__.py
ADDED
@@ -0,0 +1,393 @@
(Content identical to build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py shown above.)
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2/_flash_attn2_870e782_dirty.abi3.so → torch28-cxx11-xpu20251-x86_64-linux/flash_attn/_flash_attn_c984dd4_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3e6ca073b589dbefd15e0160369a130677854636cae9de41f29ab6cb8d4c2123
+size 3730720
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux/flash_attn}/_ops.py
RENAMED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _flash_attn_c984dd4_dirty
+ops = torch.ops._flash_attn_c984dd4_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_flash_attn_c984dd4_dirty::{op_name}"
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux/flash_attn}/bert_padding.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/flash_attn_interface.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/layers/__init__.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/layers/patch_embed.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/layers/rotary.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/__init__.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/activations.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/fused_dense.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/layer_norm.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/rms_norm.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/__init__.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/cross_entropy.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/k_activations.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/layer_norm.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/linear.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/mlp.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/{flash_attn2 → flash_attn}/ops/triton/rotary.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/flash_attn2/__init__.py
CHANGED
@@ -1,393 +1,26 @@
(The 393-line wrapper module previously at this path is reduced to 26 lines. The removed lines visible in this view duplicate the __init__.py content shown at the top of this diff, which was added at the new locations; the view is truncated before the replacement 26-line version is shown.)
| 328 |
-
leftpad_k: Optional[torch.Tensor] = None,
|
| 329 |
-
block_table: Optional[torch.Tensor] = None,
|
| 330 |
-
alibi_slopes: Optional[torch.Tensor] = None,
|
| 331 |
-
out: Optional[torch.Tensor] = None,
|
| 332 |
-
softmax_scale: Optional[float] = None,
|
| 333 |
-
is_causal: bool = False,
|
| 334 |
-
window_size_left: int = -1,
|
| 335 |
-
window_size_right: int = -1,
|
| 336 |
-
softcap: float = 0.0,
|
| 337 |
-
is_rotary_interleaved: bool = False,
|
| 338 |
-
num_splits: int = 1,
|
| 339 |
-
) -> List[torch.Tensor]:
|
| 340 |
-
"""
|
| 341 |
-
Forward pass for multi-head attention with KV cache.
|
| 342 |
-
|
| 343 |
-
Args:
|
| 344 |
-
q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
|
| 345 |
-
kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
|
| 346 |
-
vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
|
| 347 |
-
k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
|
| 348 |
-
v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
|
| 349 |
-
seqlens_k: Optional sequence lengths for keys of shape [batch_size]
|
| 350 |
-
rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
|
| 351 |
-
rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
|
| 352 |
-
cache_batch_idx: Optional indices to index into the KV cache
|
| 353 |
-
leftpad_k: Optional left padding for keys of shape [batch_size]
|
| 354 |
-
block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
|
| 355 |
-
alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
|
| 356 |
-
out: Optional output tensor, same shape as q
|
| 357 |
-
softmax_scale: Scale factor for softmax
|
| 358 |
-
is_causal: Whether to use causal attention
|
| 359 |
-
window_size_left: Window size for left context (-1 for unlimited)
|
| 360 |
-
window_size_right: Window size for right context (-1 for unlimited)
|
| 361 |
-
softcap: Soft cap for attention weights
|
| 362 |
-
is_rotary_interleaved: Whether rotary embeddings are interleaved
|
| 363 |
-
num_splits: Number of splits for computation
|
| 364 |
-
|
| 365 |
-
Returns:
|
| 366 |
-
List of tensors: [output, softmax_lse]
|
| 367 |
-
"""
|
| 368 |
-
if softmax_scale is None:
|
| 369 |
-
attention_head_dim = q.shape[-1]
|
| 370 |
-
softmax_scale = 1.0 / (attention_head_dim**0.5)
|
| 371 |
-
|
| 372 |
-
return flash_attn_ops.fwd_kvcache(
|
| 373 |
-
q,
|
| 374 |
-
kcache,
|
| 375 |
-
vcache,
|
| 376 |
-
k,
|
| 377 |
-
v,
|
| 378 |
-
seqlens_k,
|
| 379 |
-
rotary_cos,
|
| 380 |
-
rotary_sin,
|
| 381 |
-
cache_batch_idx,
|
| 382 |
-
leftpad_k,
|
| 383 |
-
block_table,
|
| 384 |
-
alibi_slopes,
|
| 385 |
-
out,
|
| 386 |
-
softmax_scale,
|
| 387 |
-
is_causal,
|
| 388 |
-
window_size_left,
|
| 389 |
-
window_size_right,
|
| 390 |
-
softcap,
|
| 391 |
-
is_rotary_interleaved,
|
| 392 |
-
num_splits,
|
| 393 |
-
)
|
|
|
|
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/flash_attn_interface.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/layers/__init__.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/layers/patch_embed.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/layers/rotary.py
RENAMED
File without changes
build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json
ADDED
@@ -0,0 +1 @@
{"python-depends":[]}
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/__init__.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/activations.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/fused_dense.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/layer_norm.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/rms_norm.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/__init__.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/cross_entropy.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/k_activations.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/layer_norm.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/linear.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/mlp.py
RENAMED
File without changes
build/{torch29-cxx11-xpu20252-x86_64-linux/flash_attn2 → torch28-cxx11-xpu20251-x86_64-linux}/ops/triton/rotary.py
RENAMED
File without changes
build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py
ADDED
@@ -0,0 +1,393 @@
from typing import Optional, List
import torch
from ._ops import ops as flash_attn_ops
from .flash_attn_interface import (
    flash_attn_func,
    flash_attn_kvpacked_func,
    flash_attn_qkvpacked_func,
    flash_attn_varlen_func,
    flash_attn_varlen_kvpacked_func,
    flash_attn_varlen_qkvpacked_func,
    flash_attn_with_kvcache,
)


def fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    return_softmax: bool = False,
    gen: Optional[torch.Generator] = None,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Optional output tensor, same shape as q
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        return_softmax: Whether to return softmax weights
        gen: Optional random number generator

    Returns:
        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.fwd(
        q, k, v, out, alibi_slopes, p_dropout, softmax_scale, is_causal,
        window_size_left, window_size_right, softcap, return_softmax, gen,
    )


def varlen_fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    seqused_k: Optional[torch.Tensor] = None,
    leftpad_k: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    max_seqlen_q: int = 0,
    max_seqlen_k: int = 0,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    zero_tensors: bool = False,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    return_softmax: bool = False,
    gen: Optional[torch.Generator] = None,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention with variable sequence lengths.

    Args:
        q: Query tensor of shape [total_q, num_heads, head_size]
        k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
        out: Optional output tensor of shape [total_q, num_heads, head_size]
        seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
        leftpad_k: Optional left padding for keys of shape [batch_size]
        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        max_seqlen_q: Maximum sequence length for queries
        max_seqlen_k: Maximum sequence length for keys
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        zero_tensors: Whether to zero tensors before computation
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        return_softmax: Whether to return softmax weights
        gen: Optional random number generator

    Returns:
        List of tensors: [output, softmax_lse, (softmax if return_softmax)]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.varlen_fwd(
        q, k, v, out, cu_seqlens_q, cu_seqlens_k, seqused_k, leftpad_k, block_table,
        alibi_slopes, max_seqlen_q, max_seqlen_k, p_dropout, softmax_scale, zero_tensors,
        is_causal, window_size_left, window_size_right, softcap, return_softmax, gen,
    )


def bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    dq: Optional[torch.Tensor] = None,
    dk: Optional[torch.Tensor] = None,
    dv: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    deterministic: bool = False,
    gen: Optional[torch.Generator] = None,
    rng_state: Optional[torch.Tensor] = None,
) -> List[torch.Tensor]:
    """
    Backward pass for multi-head attention.

    Args:
        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
        dq: Optional gradient tensor for queries, same shape as q
        dk: Optional gradient tensor for keys, same shape as k
        dv: Optional gradient tensor for values, same shape as v
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        deterministic: Whether to use deterministic algorithms
        gen: Optional random number generator
        rng_state: Optional RNG state from forward pass

    Returns:
        List of tensors: [dq, dk, dv]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.bwd(
        dout, q, k, v, out, softmax_lse, dq, dk, dv, alibi_slopes, p_dropout,
        softmax_scale, is_causal, window_size_left, window_size_right, softcap,
        deterministic, gen, rng_state,
    )


def varlen_bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    dq: Optional[torch.Tensor] = None,
    dk: Optional[torch.Tensor] = None,
    dv: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    max_seqlen_q: int = 0,
    max_seqlen_k: int = 0,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    zero_tensors: bool = False,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    deterministic: bool = False,
    gen: Optional[torch.Generator] = None,
    rng_state: Optional[torch.Tensor] = None,
) -> List[torch.Tensor]:
    """
    Backward pass for multi-head attention with variable sequence lengths.

    Args:
        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
        dq: Optional gradient tensor for queries, same shape as q
        dk: Optional gradient tensor for keys, same shape as k
        dv: Optional gradient tensor for values, same shape as v
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        max_seqlen_q: Maximum sequence length for queries
        max_seqlen_k: Maximum sequence length for keys
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        zero_tensors: Whether to zero tensors before computation
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        deterministic: Whether to use deterministic algorithms
        gen: Optional random number generator
        rng_state: Optional RNG state from forward pass

    Returns:
        List of tensors: [dq, dk, dv]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.varlen_bwd(
        dout, q, k, v, out, softmax_lse, dq, dk, dv, cu_seqlens_q, cu_seqlens_k,
        alibi_slopes, max_seqlen_q, max_seqlen_k, p_dropout, softmax_scale, zero_tensors,
        is_causal, window_size_left, window_size_right, softcap, deterministic, gen, rng_state,
    )


def fwd_kvcache(
    q: torch.Tensor,
    kcache: torch.Tensor,
    vcache: torch.Tensor,
    k: Optional[torch.Tensor] = None,
    v: Optional[torch.Tensor] = None,
    seqlens_k: Optional[torch.Tensor] = None,
    rotary_cos: Optional[torch.Tensor] = None,
    rotary_sin: Optional[torch.Tensor] = None,
    cache_batch_idx: Optional[torch.Tensor] = None,
    leftpad_k: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    out: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    is_rotary_interleaved: bool = False,
    num_splits: int = 1,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention with KV cache.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
        cache_batch_idx: Optional indices to index into the KV cache
        leftpad_k: Optional left padding for keys of shape [batch_size]
        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        out: Optional output tensor, same shape as q
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        is_rotary_interleaved: Whether rotary embeddings are interleaved
        num_splits: Number of splits for computation

    Returns:
        List of tensors: [output, softmax_lse]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.fwd_kvcache(
        q, kcache, vcache, k, v, seqlens_k, rotary_cos, rotary_sin, cache_batch_idx,
        leftpad_k, block_table, alibi_slopes, out, softmax_scale, is_causal,
        window_size_left, window_size_right, softcap, is_rotary_interleaved, num_splits,
    )
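As a rough usage sketch for the `fwd` wrapper above (shapes, dtype, and device are illustrative; actually running it requires the compiled `_flash_attn2_*` extension from this build to be importable):

```python
import torch
# fwd is the wrapper defined in this build's __init__.py above.

batch, seqlen, heads, head_dim = 2, 128, 8, 64
q = torch.randn(batch, seqlen, heads, head_dim, device="xpu", dtype=torch.float16)
k = torch.randn(batch, seqlen, heads, head_dim, device="xpu", dtype=torch.float16)
v = torch.randn(batch, seqlen, heads, head_dim, device="xpu", dtype=torch.float16)

# softmax_scale is omitted, so the wrapper falls back to 1 / sqrt(head_dim) = 0.125
# before dispatching to flash_attn_ops.fwd.
result = fwd(q, k, v, is_causal=True)
out, softmax_lse = result[0], result[1]
```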
build/torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_5dab8ba_dirty.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e7e91cf691aa55f859b6f983b4e3aecbf08e04f24ea4fc322e3c8123d060c9d
size 7279344
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
ADDED
@@ -0,0 +1,9 @@
import torch
from . import _flash_attn2_5dab8ba_dirty
ops = torch.ops._flash_attn2_5dab8ba_dirty

def add_op_namespace_prefix(op_name: str):
    """
    Prefix op by namespace.
    """
    return f"_flash_attn2_5dab8ba_dirty::{op_name}"
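For illustration, the helper simply namespaces an op name so it can be resolved through `torch.ops`; a tiny sketch (assuming the compiled extension above has been loaded):

```python
# add_op_namespace_prefix qualifies a bare op name with the extension's namespace.
qualified = add_op_namespace_prefix("fwd")
assert qualified == "_flash_attn2_5dab8ba_dirty::fwd"

# The same op is reachable as an attribute of the `ops` namespace object,
# i.e. ops.fwd(...) is torch.ops._flash_attn2_5dab8ba_dirty.fwd(...).
```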
build/torch29-cxx11-xpu20252-x86_64-linux/bert_padding.py
ADDED
@@ -0,0 +1,218 @@
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py

import torch
import torch.nn.functional as F
from einops import rearrange, repeat


class IndexFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        # return input[indices]
        return torch.gather(
            rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
        ).reshape(-1, *other_shape)

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        grad_output = rearrange(grad_output, "b ... -> b (...)")
        grad_input = torch.zeros(
            [ctx.first_axis_dim, grad_output.shape[1]],
            device=grad_output.device,
            dtype=grad_output.dtype,
        )
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        # grad_input[indices] = grad_output
        grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis = IndexFirstAxis.apply


class IndexPutFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, values, indices, first_axis_dim):
        ctx.save_for_backward(indices)
        assert indices.ndim == 1
        assert values.ndim >= 2
        output = torch.zeros(
            first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype
        )
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        output[indices] = values
        # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        grad_values = grad_output[indices]
        # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
        return grad_values, None, None


index_put_first_axis = IndexPutFirstAxis.apply


class IndexFirstAxisResidual(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        output = input[indices]
        # We don't want to reshape input (b ... -> b (...)) since it could change the channel_last
        # memory format to channel_first. In other words, input might not be contiguous.
        # If we don't detach, Pytorch complains about output being a view and is being modified inplace
        return output, input.detach()

    @staticmethod
    def backward(ctx, grad_output, grad_residual):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        assert grad_residual.shape[1:] == other_shape
        grad_input = grad_residual
        # grad_input[indices] += grad_output
        indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
        indices = indices.expand_as(grad_output)
        grad_input.scatter_add_(0, indices, grad_output)
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis_residual = IndexFirstAxisResidual.apply


def unpad_input(hidden_states, attention_mask, unused_mask=None):
    """
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    """
    all_masks = (attention_mask + unused_mask) if unused_mask is not None else attention_mask
    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
    # so we write custom forward and backward to make it a bit faster.
    return (
        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
        used_seqlens_in_batch,
    )


def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
    """
    Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model).
    The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).

    For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
        ```
        [
          [2, 3, 0, 0, 0, 0],
          [3, 2, 0, 0, 0, 0],
          [6, 0, 0, 0, 0, 0]
        ]
        ```
    , which refers to the 3D-attention mask:
        ```
        [
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0],
            [0, 0, 1, 1, 0, 0],
            [0, 0, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
          ],
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
          ],
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [1, 1, 1, 1, 0, 0],
            [1, 1, 1, 1, 1, 0],
            [1, 1, 1, 1, 1, 1]
          ]
        ]
        ```.

    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
    """
    length = attention_mask_in_length.sum(dim=-1)
    seqlen = attention_mask_in_length.size(-1)
    attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1)
    real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
    seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
    indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
    # so we write custom forward and backward to make it a bit faster.
    return (
        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def pad_input(hidden_states, indices, batch, seqlen):
    """
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    """
    dim = hidden_states.shape[-1]
    # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
    # output[indices] = hidden_states
    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
    return rearrange(output, "(b s) ... -> b s ...", b=batch)
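A small round-trip sketch of `unpad_input` / `pad_input` from the file above (illustrative shapes; assumes `torch` and `einops` are installed and the functions above are in scope):

```python
import torch

batch, seqlen, dim = 2, 4, 8
hidden_states = torch.randn(batch, seqlen, dim)
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 0, 0]])  # second sequence has 2 valid tokens

unpadded, indices, cu_seqlens, max_seqlen, seqused = unpad_input(hidden_states, attention_mask)
# unpadded: (6, dim), cu_seqlens: tensor([0, 4, 6]), max_seqlen: 4, seqused: tensor([4, 2])

repadded = pad_input(unpadded, indices, batch, seqlen)  # back to (batch, seqlen, dim)
mask = attention_mask.unsqueeze(-1)
assert torch.allclose(repadded * mask, hidden_states * mask)  # padded slots come back as zeros
```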
build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/__init__.py
ADDED
@@ -0,0 +1,393 @@
| 1 |
+
from typing import Optional, List
|
| 2 |
+
import torch
|
| 3 |
+
from ._ops import ops as flash_attn_ops
|
| 4 |
+
from .flash_attn_interface import (
|
| 5 |
+
flash_attn_func,
|
| 6 |
+
flash_attn_kvpacked_func,
|
| 7 |
+
flash_attn_qkvpacked_func,
|
| 8 |
+
flash_attn_varlen_func,
|
| 9 |
+
flash_attn_varlen_kvpacked_func,
|
| 10 |
+
flash_attn_varlen_qkvpacked_func,
|
| 11 |
+
flash_attn_with_kvcache,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def fwd(
|
| 16 |
+
q: torch.Tensor,
|
| 17 |
+
k: torch.Tensor,
|
| 18 |
+
v: torch.Tensor,
|
| 19 |
+
out: Optional[torch.Tensor] = None,
|
| 20 |
+
alibi_slopes: Optional[torch.Tensor] = None,
|
| 21 |
+
p_dropout: float = 0.0,
|
| 22 |
+
softmax_scale: Optional[float] = None,
|
| 23 |
+
is_causal: bool = False,
|
| 24 |
+
window_size_left: int = -1,
|
| 25 |
+
window_size_right: int = -1,
|
| 26 |
+
softcap: float = 0.0,
|
| 27 |
+
return_softmax: bool = False,
|
| 28 |
+
gen: Optional[torch.Generator] = None,
|
| 29 |
+
) -> List[torch.Tensor]:
|
| 30 |
+
"""
|
| 31 |
+
Forward pass for multi-head attention.
|
| 32 |
+
|
| 33 |
+
Args:
|
| 34 |
+
q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
|
| 35 |
+
k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
|
| 36 |
+
v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
|
| 37 |
+
out: Optional output tensor, same shape as q
|
| 38 |
+
alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
|
| 39 |
+
p_dropout: Dropout probability
|
| 40 |
+
softmax_scale: Scale factor for softmax
|
| 41 |
+
is_causal: Whether to use causal attention
|
| 42 |
+
window_size_left: Window size for left context (-1 for unlimited)
|
| 43 |
+
window_size_right: Window size for right context (-1 for unlimited)
|
| 44 |
+
softcap: Soft cap for attention weights
|
| 45 |
+
return_softmax: Whether to return softmax weights
|
| 46 |
+
gen: Optional random number generator
|
| 47 |
+
|
| 48 |
+
Returns:
|
| 49 |
+
List of tensors: [output, softmax_lse, (softmax if return_softmax)]
|
| 50 |
+
"""
|
| 51 |
+
if softmax_scale is None:
|
| 52 |
+
attention_head_dim = q.shape[-1]
|
| 53 |
+
softmax_scale = 1.0 / (attention_head_dim**0.5)
|
| 54 |
+
|
| 55 |
+
return flash_attn_ops.fwd(
|
| 56 |
+
q,
|
| 57 |
+
k,
|
| 58 |
+
v,
|
| 59 |
+
out,
|
| 60 |
+
alibi_slopes,
|
| 61 |
+
p_dropout,
|
| 62 |
+
softmax_scale,
|
| 63 |
+
is_causal,
|
| 64 |
+
window_size_left,
|
| 65 |
+
window_size_right,
|
| 66 |
+
softcap,
|
| 67 |
+
return_softmax,
|
| 68 |
+
gen,
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def varlen_fwd(
|
| 73 |
+
q: torch.Tensor,
|
| 74 |
+
k: torch.Tensor,
|
| 75 |
+
v: torch.Tensor,
|
| 76 |
+
cu_seqlens_q: torch.Tensor,
|
| 77 |
+
cu_seqlens_k: torch.Tensor,
|
| 78 |
+
out: Optional[torch.Tensor] = None,
|
| 79 |
+
seqused_k: Optional[torch.Tensor] = None,
|
| 80 |
+
leftpad_k: Optional[torch.Tensor] = None,
|
| 81 |
+
block_table: Optional[torch.Tensor] = None,
|
| 82 |
+
alibi_slopes: Optional[torch.Tensor] = None,
|
| 83 |
+
max_seqlen_q: int = 0,
|
| 84 |
+
max_seqlen_k: int = 0,
|
| 85 |
+
p_dropout: float = 0.0,
|
| 86 |
+
softmax_scale: Optional[float] = None,
|
| 87 |
+
zero_tensors: bool = False,
|
| 88 |
+
is_causal: bool = False,
|
| 89 |
+
window_size_left: int = -1,
|
| 90 |
+
window_size_right: int = -1,
|
| 91 |
+
softcap: float = 0.0,
|
| 92 |
+
return_softmax: bool = False,
|
| 93 |
+
gen: Optional[torch.Generator] = None,
|
| 94 |
+
) -> List[torch.Tensor]:
|
| 95 |
+
"""
|
| 96 |
+
Forward pass for multi-head attention with variable sequence lengths.
|
| 97 |
+
|
| 98 |
+
Args:
|
| 99 |
+
q: Query tensor of shape [total_q, num_heads, head_size]
|
| 100 |
+
k: Key tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
|
| 101 |
+
v: Value tensor of shape [total_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
|
| 102 |
+
cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
|
| 103 |
+
cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
|
| 104 |
+
out: Optional output tensor of shape [total_q, num_heads, head_size]
|
| 105 |
+
seqused_k: Optional tensor specifying how many keys to use per batch element [batch_size]
|
| 106 |
+
leftpad_k: Optional left padding for keys of shape [batch_size]
|
| 107 |
+
block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
|
| 108 |
+
alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
|
| 109 |
+
max_seqlen_q: Maximum sequence length for queries
|
| 110 |
+
max_seqlen_k: Maximum sequence length for keys
|
| 111 |
+
p_dropout: Dropout probability
|
| 112 |
+
softmax_scale: Scale factor for softmax
|
| 113 |
+
zero_tensors: Whether to zero tensors before computation
|
| 114 |
+
is_causal: Whether to use causal attention
|
| 115 |
+
window_size_left: Window size for left context (-1 for unlimited)
|
| 116 |
+
window_size_right: Window size for right context (-1 for unlimited)
|
| 117 |
+
softcap: Soft cap for attention weights
|
| 118 |
+
return_softmax: Whether to return softmax weights
|
| 119 |
+
gen: Optional random number generator
|
| 120 |
+
|
| 121 |
+
Returns:
|
| 122 |
+
List of tensors: [output, softmax_lse, (softmax if return_softmax)]
|
| 123 |
+
"""
|
| 124 |
+
if softmax_scale is None:
|
| 125 |
+
attention_head_dim = q.shape[-1]
|
| 126 |
+
softmax_scale = 1.0 / (attention_head_dim**0.5)
|
| 127 |
+
|
| 128 |
+
return flash_attn_ops.varlen_fwd(
|
| 129 |
+
q,
|
| 130 |
+
k,
|
| 131 |
+
v,
|
| 132 |
+
out,
|
| 133 |
+
cu_seqlens_q,
|
| 134 |
+
cu_seqlens_k,
|
| 135 |
+
seqused_k,
|
| 136 |
+
leftpad_k,
|
| 137 |
+
block_table,
|
| 138 |
+
alibi_slopes,
|
| 139 |
+
max_seqlen_q,
|
| 140 |
+
max_seqlen_k,
|
| 141 |
+
p_dropout,
|
| 142 |
+
softmax_scale,
|
| 143 |
+
zero_tensors,
|
| 144 |
+
is_causal,
|
| 145 |
+
window_size_left,
|
| 146 |
+
window_size_right,
|
| 147 |
+
softcap,
|
| 148 |
+
return_softmax,
|
| 149 |
+
gen,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def bwd(
|
| 154 |
+
dout: torch.Tensor,
|
| 155 |
+
q: torch.Tensor,
|
| 156 |
+
k: torch.Tensor,
|
| 157 |
+
v: torch.Tensor,
|
| 158 |
+
out: torch.Tensor,
|
| 159 |
+
softmax_lse: torch.Tensor,
|
| 160 |
+
dq: Optional[torch.Tensor] = None,
|
| 161 |
+
dk: Optional[torch.Tensor] = None,
|
| 162 |
+
dv: Optional[torch.Tensor] = None,
|
| 163 |
+
alibi_slopes: Optional[torch.Tensor] = None,
|
| 164 |
+
p_dropout: float = 0.0,
|
| 165 |
+
softmax_scale: Optional[float] = None,
|
| 166 |
+
is_causal: bool = False,
|
| 167 |
+
window_size_left: int = -1,
|
| 168 |
+
window_size_right: int = -1,
|
| 169 |
+
softcap: float = 0.0,
|
| 170 |
+
deterministic: bool = False,
|
| 171 |
+
gen: Optional[torch.Generator] = None,
|
| 172 |
+
rng_state: Optional[torch.Tensor] = None,
|
| 173 |
+
) -> List[torch.Tensor]:
|
| 174 |
+
"""
|
| 175 |
+
Backward pass for multi-head attention.
|
| 176 |
+
|
| 177 |
+
Args:
|
| 178 |
+
dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
|
| 179 |
+
q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
|
| 180 |
+
k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
|
| 181 |
+
v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
|
| 182 |
+
out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
|
| 183 |
+
softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
|
| 184 |
+
dq: Optional gradient tensor for queries, same shape as q
|
| 185 |
+
dk: Optional gradient tensor for keys, same shape as k
|
| 186 |
+
dv: Optional gradient tensor for values, same shape as v
|
| 187 |
+
alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
|
| 188 |
+
p_dropout: Dropout probability
|
| 189 |
+
softmax_scale: Scale factor for softmax
|
| 190 |
+
is_causal: Whether to use causal attention
|
| 191 |
+
window_size_left: Window size for left context (-1 for unlimited)
|
| 192 |
+
window_size_right: Window size for right context (-1 for unlimited)
|
| 193 |
+
softcap: Soft cap for attention weights
|
| 194 |
+
deterministic: Whether to use deterministic algorithms
|
| 195 |
+
gen: Optional random number generator
|
| 196 |
+
rng_state: Optional RNG state from forward pass
|
| 197 |
+
|
| 198 |
+
Returns:
|
| 199 |
+
List of tensors: [dq, dk, dv]
|
| 200 |
+
"""
|
| 201 |
+
if softmax_scale is None:
|
| 202 |
+
attention_head_dim = q.shape[-1]
|
| 203 |
+
softmax_scale = 1.0 / (attention_head_dim**0.5)
|
| 204 |
+
|
| 205 |
+
return flash_attn_ops.bwd(
|
| 206 |
+
dout,
|
| 207 |
+
q,
|
| 208 |
+
k,
|
| 209 |
+
v,
|
| 210 |
+
out,
|
| 211 |
+
softmax_lse,
|
| 212 |
+
dq,
|
| 213 |
+
dk,
|
| 214 |
+
dv,
|
| 215 |
+
alibi_slopes,
|
| 216 |
+
p_dropout,
|
| 217 |
+
softmax_scale,
|
| 218 |
+
is_causal,
|
| 219 |
+
window_size_left,
|
| 220 |
+
window_size_right,
|
| 221 |
+
softcap,
|
| 222 |
+
deterministic,
|
| 223 |
+
gen,
|
| 224 |
+
rng_state,
|
| 225 |
+
)
|


def varlen_bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    dq: Optional[torch.Tensor] = None,
    dk: Optional[torch.Tensor] = None,
    dv: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    max_seqlen_q: int = 0,
    max_seqlen_k: int = 0,
    p_dropout: float = 0.0,
    softmax_scale: Optional[float] = None,
    zero_tensors: bool = False,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    deterministic: bool = False,
    gen: Optional[torch.Generator] = None,
    rng_state: Optional[torch.Tensor] = None,
) -> List[torch.Tensor]:
    """
    Backward pass for multi-head attention with variable sequence lengths.

    Args:
        dout: Gradient tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k: Key tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        v: Value tensor of shape [batch_size, seqlen_k, num_heads_k, head_size]
        out: Output tensor from forward pass of shape [batch_size, seqlen_q, num_heads, head_size]
        softmax_lse: Log-sum-exp values from forward pass of shape [batch_size, num_heads, seqlen_q]
        cu_seqlens_q: Cumulative sequence lengths for queries of shape [batch_size+1]
        cu_seqlens_k: Cumulative sequence lengths for keys of shape [batch_size+1]
        dq: Optional gradient tensor for queries, same shape as q
        dk: Optional gradient tensor for keys, same shape as k
        dv: Optional gradient tensor for values, same shape as v
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        max_seqlen_q: Maximum sequence length for queries
        max_seqlen_k: Maximum sequence length for keys
        p_dropout: Dropout probability
        softmax_scale: Scale factor for softmax
        zero_tensors: Whether to zero tensors before computation
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        deterministic: Whether to use deterministic algorithms
        gen: Optional random number generator
        rng_state: Optional RNG state from forward pass

    Returns:
        List of tensors: [dq, dk, dv]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.varlen_bwd(
        dout,
        q,
        k,
        v,
        out,
        softmax_lse,
        dq,
        dk,
        dv,
        cu_seqlens_q,
        cu_seqlens_k,
        alibi_slopes,
        max_seqlen_q,
        max_seqlen_k,
        p_dropout,
        softmax_scale,
        zero_tensors,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        deterministic,
        gen,
        rng_state,
    )
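A minimal sketch of building the `cu_seqlens` / `max_seqlen` metadata this function expects; the packed tensors and the actual call are only indicated in comments, since they come from the matching varlen forward pass:

```python
import torch
import torch.nn.functional as F

# Two sequences of length 5 and 3, packed back to back (varlen layout).
seqlens = torch.tensor([5, 3], dtype=torch.int32)
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
# cu_seqlens == tensor([0, 5, 8], dtype=torch.int32)
max_seqlen = int(seqlens.max())

# With packed q/k/v/out/softmax_lse/dout from the matching varlen forward pass,
# the backward call would look like:
# dq, dk, dv, *rest = varlen_bwd(
#     dout, q, k, v, out, softmax_lse,
#     cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
#     max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen,
#     is_causal=True,
# )
```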


def fwd_kvcache(
    q: torch.Tensor,
    kcache: torch.Tensor,
    vcache: torch.Tensor,
    k: Optional[torch.Tensor] = None,
    v: Optional[torch.Tensor] = None,
    seqlens_k: Optional[torch.Tensor] = None,
    rotary_cos: Optional[torch.Tensor] = None,
    rotary_sin: Optional[torch.Tensor] = None,
    cache_batch_idx: Optional[torch.Tensor] = None,
    leftpad_k: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    alibi_slopes: Optional[torch.Tensor] = None,
    out: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    is_causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    softcap: float = 0.0,
    is_rotary_interleaved: bool = False,
    num_splits: int = 1,
) -> List[torch.Tensor]:
    """
    Forward pass for multi-head attention with KV cache.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        kcache: Key cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        vcache: Value cache tensor of shape [batch_size_c, seqlen_k, num_heads_k, head_size] or [num_blocks, page_block_size, num_heads_k, head_size]
        k: Optional new keys tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        v: Optional new values tensor of shape [batch_size, seqlen_knew, num_heads_k, head_size]
        seqlens_k: Optional sequence lengths for keys of shape [batch_size]
        rotary_cos: Optional rotary cosine tensor of shape [seqlen_ro, rotary_dim/2]
        rotary_sin: Optional rotary sine tensor of shape [seqlen_ro, rotary_dim/2]
        cache_batch_idx: Optional indices to index into the KV cache
        leftpad_k: Optional left padding for keys of shape [batch_size]
        block_table: Optional block table of shape [batch_size, max_num_blocks_per_seq]
        alibi_slopes: Optional ALiBi slopes tensor of shape [num_heads] or [batch_size, num_heads]
        out: Optional output tensor, same shape as q
        softmax_scale: Scale factor for softmax
        is_causal: Whether to use causal attention
        window_size_left: Window size for left context (-1 for unlimited)
        window_size_right: Window size for right context (-1 for unlimited)
        softcap: Soft cap for attention weights
        is_rotary_interleaved: Whether rotary embeddings are interleaved
        num_splits: Number of splits for computation

    Returns:
        List of tensors: [output, softmax_lse]
    """
    if softmax_scale is None:
        attention_head_dim = q.shape[-1]
        softmax_scale = 1.0 / (attention_head_dim**0.5)

    return flash_attn_ops.fwd_kvcache(
        q,
        kcache,
        vcache,
        k,
        v,
        seqlens_k,
        rotary_cos,
        rotary_sin,
        cache_batch_idx,
        leftpad_k,
        block_table,
        alibi_slopes,
        out,
        softmax_scale,
        is_causal,
        window_size_left,
        window_size_right,
        softcap,
        is_rotary_interleaved,
        num_splits,
    )
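A hedged sketch of a typical decode-step setup for `fwd_kvcache`: a preallocated KV cache, per-sequence cache lengths, and one new token per sequence; the call itself is left commented because it requires the compiled extension and a supported device:

```python
import torch

batch, heads, head_dim, max_cache_len = 2, 8, 64, 1024

# Preallocated KV cache plus the number of tokens already stored per sequence.
kcache = torch.zeros(batch, max_cache_len, heads, head_dim, dtype=torch.float16)
vcache = torch.zeros_like(kcache)
seqlens_k = torch.tensor([17, 42], dtype=torch.int32)

# One new query/key/value token per sequence (a single autoregressive decode step).
q = torch.randn(batch, 1, heads, head_dim, dtype=torch.float16)
k_new, v_new = torch.randn_like(q), torch.randn_like(q)

# On a device supported by this build, the new tokens are appended to the cache
# and attended to in one call:
# out, softmax_lse = fwd_kvcache(
#     q, kcache, vcache,
#     k=k_new, v=v_new, seqlens_k=seqlens_k,
#     is_causal=True,
# )
```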
build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/_flash_attn_c984dd4_dirty.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98c57dda92346f88486e974d74aee9b0fb1b1f663506666929d3c02aa897e528
size 3420928
build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/_ops.py
ADDED
@@ -0,0 +1,9 @@
import torch
from . import _flash_attn_c984dd4_dirty
ops = torch.ops._flash_attn_c984dd4_dirty

def add_op_namespace_prefix(op_name: str):
    """
    Prefix op by namespace.
    """
    return f"_flash_attn_c984dd4_dirty::{op_name}"
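`add_op_namespace_prefix` builds fully qualified op names of the form `_flash_attn_c984dd4_dirty::<op_name>`. A hedged sketch of one common use of such a prefix, registering a Python-side fake (meta) implementation so `torch.compile` can trace the compiled op; the op name `"fwd"` and its signature are assumptions, not taken from this diff:

```python
import torch

# Hypothetical: attach a fake/meta kernel to an op exposed by the compiled
# extension. The real op names and schemas live in the .so and are not shown here.
@torch.library.register_fake(add_op_namespace_prefix("fwd"))
def _(q, k, v, *args, **kwargs):
    # A fake implementation only reports output metadata (shape/dtype/device).
    return torch.empty_like(q)
```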
build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn/bert_padding.py
ADDED
@@ -0,0 +1,218 @@
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py

import torch
import torch.nn.functional as F
from einops import rearrange, repeat


class IndexFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        # return input[indices]
        return torch.gather(
            rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
        ).reshape(-1, *other_shape)

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        grad_output = rearrange(grad_output, "b ... -> b (...)")
        grad_input = torch.zeros(
            [ctx.first_axis_dim, grad_output.shape[1]],
            device=grad_output.device,
            dtype=grad_output.dtype,
        )
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        # grad_input[indices] = grad_output
        grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis = IndexFirstAxis.apply
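`index_first_axis` behaves like integer row selection (`input[indices]`) on the first axis, but with a hand-written backward that scatters gradients into a zero buffer. A minimal sketch, with illustrative shapes:

```python
import torch

x = torch.randn(6, 4, requires_grad=True)   # (rows, features)
indices = torch.tensor([0, 2, 5])            # rows to keep

y = index_first_axis(x, indices)             # same values as x[indices]; shape (3, 4)
y.sum().backward()                           # gradients land only in rows 0, 2, 5
assert x.grad.shape == x.shape
```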


class IndexPutFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, values, indices, first_axis_dim):
        ctx.save_for_backward(indices)
        assert indices.ndim == 1
        assert values.ndim >= 2
        output = torch.zeros(
            first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype
        )
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        output[indices] = values
        # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        grad_values = grad_output[indices]
        # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
        return grad_values, None, None


index_put_first_axis = IndexPutFirstAxis.apply
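`index_put_first_axis` is the inverse operation: it scatters packed rows back into a zero-initialized buffer of the original first-axis size. A minimal sketch:

```python
import torch

rows = torch.randn(3, 4)             # packed rows
indices = torch.tensor([0, 2, 5])    # their positions in the original first axis

full = index_put_first_axis(rows, indices, 6)   # shape (6, 4); rows 1, 3, 4 stay zero
assert torch.equal(full[indices], rows)
```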
+
|
| 66 |
+
|
| 67 |
+
class IndexFirstAxisResidual(torch.autograd.Function):
|
| 68 |
+
@staticmethod
|
| 69 |
+
def forward(ctx, input, indices):
|
| 70 |
+
ctx.save_for_backward(indices)
|
| 71 |
+
assert input.ndim >= 2
|
| 72 |
+
ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
|
| 73 |
+
second_dim = other_shape.numel()
|
| 74 |
+
# TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
|
| 75 |
+
output = input[indices]
|
| 76 |
+
# We don't want to reshape input (b ... -> b (...)) since it could change the channel_last
|
| 77 |
+
# memory format to channel_first. In other words, input might not be contiguous.
|
| 78 |
+
# If we don't detach, Pytorch complains about output being a view and is being modified inplace
|
| 79 |
+
return output, input.detach()
|
| 80 |
+
|
| 81 |
+
@staticmethod
|
| 82 |
+
def backward(ctx, grad_output, grad_residual):
|
| 83 |
+
(indices,) = ctx.saved_tensors
|
| 84 |
+
assert grad_output.ndim >= 2
|
| 85 |
+
other_shape = grad_output.shape[1:]
|
| 86 |
+
assert grad_residual.shape[1:] == other_shape
|
| 87 |
+
grad_input = grad_residual
|
| 88 |
+
# grad_input[indices] += grad_output
|
| 89 |
+
indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
|
| 90 |
+
indices = indices.expand_as(grad_output)
|
| 91 |
+
grad_input.scatter_add_(0, indices, grad_output)
|
| 92 |
+
return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
index_first_axis_residual = IndexFirstAxisResidual.apply
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def unpad_input(hidden_states, attention_mask, unused_mask=None):
|
| 99 |
+
"""
|
| 100 |
+
Arguments:
|
| 101 |
+
hidden_states: (batch, seqlen, ...)
|
| 102 |
+
attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
|
| 103 |
+
unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
|
| 104 |
+
Return:
|
| 105 |
+
hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
|
| 106 |
+
indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
|
| 107 |
+
cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
|
| 108 |
+
max_seqlen_in_batch: int
|
| 109 |
+
seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
|
| 110 |
+
"""
|
| 111 |
+
all_masks = (attention_mask + unused_mask) if unused_mask is not None else attention_mask
|
| 112 |
+
seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
|
| 113 |
+
used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
|
| 114 |
+
indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
|
| 115 |
+
max_seqlen_in_batch = seqlens_in_batch.max().item()
|
| 116 |
+
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
|
| 117 |
+
# TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
|
| 118 |
+
# bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
|
| 119 |
+
# times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
|
| 120 |
+
# index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
|
| 121 |
+
# so we write custom forward and backward to make it a bit faster.
|
| 122 |
+
return (
|
| 123 |
+
index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
|
| 124 |
+
indices,
|
| 125 |
+
cu_seqlens,
|
| 126 |
+
max_seqlen_in_batch,
|
| 127 |
+
used_seqlens_in_batch,
|
| 128 |
+
)
|
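A minimal sketch of calling `unpad_input` on a padded batch; the varlen attention call that would consume `packed` and `cu_seqlens` is elided, and the shapes are illustrative:

```python
import torch

batch, seqlen, hidden = 2, 6, 8
hidden_states = torch.randn(batch, seqlen, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0, 0, 0],
                               [1, 1, 1, 1, 1, 0]])

packed, indices, cu_seqlens, max_seqlen, seqused = unpad_input(hidden_states, attention_mask)
# packed.shape == (8, 8): the 3 + 5 valid tokens, concatenated
# cu_seqlens == tensor([0, 3, 8], dtype=torch.int32); max_seqlen == 5
```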


def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
    """
    Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training on samples of varying lengths (e.g., the supervised fine-tuning task in large language models).
    The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).

    For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
    ```
    [
        [2, 3, 0, 0, 0, 0],
        [3, 2, 0, 0, 0, 0],
        [6, 0, 0, 0, 0, 0]
    ]
    ```
    , which refers to the 3D-attention mask:
    ```
    [
        [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0],
            [0, 0, 1, 1, 0, 0],
            [0, 0, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
        ],
        [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
        ],
        [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [1, 1, 1, 1, 0, 0],
            [1, 1, 1, 1, 1, 0],
            [1, 1, 1, 1, 1, 1]
        ]
    ]
    ```.

    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
    """
    length = attention_mask_in_length.sum(dim=-1)
    seqlen = attention_mask_in_length.size(-1)
    attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1)
    real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
    seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
    indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
    # so we write custom forward and backward to make it a bit faster.
    return (
        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def pad_input(hidden_states, indices, batch, seqlen):
    """
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    """
    dim = hidden_states.shape[-1]
    # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
    # output[indices] = hidden_states
    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
    return rearrange(output, "(b s) ... -> b s ...", b=batch)
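A minimal round-trip sketch: `unpad_input` strips padding, and `pad_input` scatters the packed tokens back into the padded layout; the varlen attention call in between is elided, and the shapes are illustrative:

```python
import torch

batch, seqlen, hidden = 2, 6, 8
hidden_states = torch.randn(batch, seqlen, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0, 0, 0],
                               [1, 1, 1, 1, 1, 0]])

packed, indices, *_ = unpad_input(hidden_states, attention_mask)
# ... run varlen attention on `packed` here ...
restored = pad_input(packed, indices, batch, seqlen)

assert restored.shape == hidden_states.shape
# Valid positions survive the round trip; padded positions come back as zeros.
mask = attention_mask.bool()
assert torch.equal(restored[mask], hidden_states[mask])
```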