koichi12 committed
Commit c1012a5 · verified · 1 parent: 1e8ae5e

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50)
  1. .venv/lib/python3.11/site-packages/xformers/components/__init__.py +86 -0
  2. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/activations.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/input_projection.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/multi_head_dispatch.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/patch_embedding.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/residual.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/reversible.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/xformers/components/__pycache__/simplicial_embedding.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/xformers/components/activations.py +76 -0
  11. .venv/lib/python3.11/site-packages/xformers/components/attention/attention_patterns.py +295 -0
  12. .venv/lib/python3.11/site-packages/xformers/components/attention/core.py +248 -0
  13. .venv/lib/python3.11/site-packages/xformers/components/attention/favor.py +173 -0
  14. .venv/lib/python3.11/site-packages/xformers/components/attention/fourier_mix.py +35 -0
  15. .venv/lib/python3.11/site-packages/xformers/components/attention/lambda_layer.py +78 -0
  16. .venv/lib/python3.11/site-packages/xformers/components/attention/local.py +120 -0
  17. .venv/lib/python3.11/site-packages/xformers/components/attention/nystrom.py +295 -0
  18. .venv/lib/python3.11/site-packages/xformers/components/attention/random.py +126 -0
  19. .venv/lib/python3.11/site-packages/xformers/components/attention/scaled_dot_product.py +134 -0
  20. .venv/lib/python3.11/site-packages/xformers/components/attention/visual.py +96 -0
  21. .venv/lib/python3.11/site-packages/xformers/components/input_projection.py +102 -0
  22. .venv/lib/python3.11/site-packages/xformers/components/multi_head_dispatch.py +271 -0
  23. .venv/lib/python3.11/site-packages/xformers/components/patch_embedding.py +83 -0
  24. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__init__.py +87 -0
  25. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/__init__.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/base.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/param.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/rotary.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/sine.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/vocab.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/base.py +38 -0
  32. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/param.py +54 -0
  33. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/rotary.py +91 -0
  34. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/sine.py +46 -0
  35. .venv/lib/python3.11/site-packages/xformers/components/positional_embedding/vocab.py +65 -0
  36. .venv/lib/python3.11/site-packages/xformers/components/residual.py +192 -0
  37. .venv/lib/python3.11/site-packages/xformers/components/reversible.py +160 -0
  38. .venv/lib/python3.11/site-packages/xformers/components/simplicial_embedding.py +67 -0
  39. .venv/lib/python3.11/site-packages/xformers/ops/__init__.py +130 -0
  40. .venv/lib/python3.11/site-packages/xformers/ops/__pycache__/__init__.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/xformers/ops/_triton/k_index_select_cat.py +184 -0
  42. .venv/lib/python3.11/site-packages/xformers/ops/_triton/k_scaled_index_add.py +365 -0
  43. .venv/lib/python3.11/site-packages/xformers/ops/_triton/rmsnorm_kernels.py +163 -0
  44. .venv/lib/python3.11/site-packages/xformers/ops/_triton/rope_padded_kernels.py +226 -0
  45. .venv/lib/python3.11/site-packages/xformers/ops/_triton/tiled_matmul_kernels.py +430 -0
  46. .venv/lib/python3.11/site-packages/xformers/ops/fmha/__init__.py +893 -0
  47. .venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/__init__.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/attn_bias.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/ck.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/ck_decoder.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/xformers/components/__init__.py ADDED
@@ -0,0 +1,86 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import warnings
+ from dataclasses import fields
+ from pathlib import Path
+ from typing import Any, Dict, Union
+
+ from xformers.utils import import_all_modules
+
+ from .activations import Activation, build_activation  # noqa
+ from .attention import Attention, build_attention  # noqa
+ from .input_projection import InputProjection, InputProjectionConfig  # noqa
+ from .multi_head_dispatch import MultiHeadDispatch  # noqa
+ from .multi_head_dispatch import MultiHeadDispatchConfig
+ from .patch_embedding import PatchEmbeddingConfig  # noqa
+ from .patch_embedding import build_patch_embedding  # noqa
+ from .residual import NormalizationType  # noqa
+ from .residual import PostNorm  # noqa
+ from .residual import PreNorm  # noqa
+ from .residual import RequiresWrappedInputs  # noqa
+ from .residual import Residual  # noqa
+ from .residual import ResidualNormStyle  # noqa
+
+ warnings.warn(
+     "xformers.components is deprecated and is not maintained anymore. "
+     "It might be removed in a future version of xFormers ",
+     FutureWarning,
+     stacklevel=2,
+ )
+
+
+ # automatically import any Python files in the directory
+ import_all_modules(str(Path(__file__).parent), "xformers.components")
+
+
+ def build_multi_head_attention(
+     multi_head_config: Union[MultiHeadDispatchConfig, Dict[str, Any]],
+ ):
+     """Builds a multihead attention from a config.
+
+     This assumes a 'name' key in the config which is used to determine what
+     attention class to instantiate. For instance, a config `{"name": "my_attention",
+     "foo": "bar"}` will find a class that was registered as "my_attention"
+     (see :func:`register_attention`) and call .from_config on it."""
+
+     if not isinstance(multi_head_config, MultiHeadDispatchConfig):
+         # Extract the required fields
+         field_names = list(map(lambda x: x.name, fields(MultiHeadDispatchConfig)))
+
+         # The missing fields get Noned
+         for k in field_names:
+             if k not in multi_head_config.keys():
+                 multi_head_config[k] = None
+
+         # Could be that the attention needs to be instantiated
+         if not isinstance(multi_head_config["attention"], Attention):
+             # Convenience: fill in possible missing fields
+             if "num_heads" not in multi_head_config["attention"]:
+                 multi_head_config["attention"]["num_heads"] = multi_head_config[
+                     "num_heads"
+                 ]
+
+             if "dim_model" not in multi_head_config["attention"]:
+                 multi_head_config["attention"]["dim_model"] = multi_head_config[
+                     "dim_model"
+                 ]
+
+             if (
+                 "dim_features" not in multi_head_config["attention"]
+                 or multi_head_config["attention"]["dim_features"] is None
+             ):
+                 multi_head_config["attention"]["dim_features"] = (
+                     multi_head_config["dim_model"] // multi_head_config["num_heads"]
+                 )
+
+             multi_head_config["attention"] = build_attention(
+                 multi_head_config["attention"]
+             )
+
+         multi_head_config = MultiHeadDispatchConfig(**multi_head_config)
+
+     return MultiHeadDispatch.from_config(multi_head_config)
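As a quick sanity check of the builder above (not part of the commit): a minimal sketch of calling build_multi_head_attention with a plain dict. The "scaled_dot_product" name and the sizes are illustrative; the field names follow MultiHeadDispatchConfig, and residual_dropout is an assumed field beyond what this diff shows.

import torch
from xformers.components import build_multi_head_attention

config = {
    "dim_model": 384,       # model width; propagated into the attention sub-config
    "num_heads": 6,         # also propagated, see the convenience fill-ins above
    "residual_dropout": 0.0,  # assumed MultiHeadDispatchConfig field
    "attention": {
        "name": "scaled_dot_product",  # a registered attention name
        "dropout": 0.1,
        "causal": False,
    },
}
mha = build_multi_head_attention(config)  # a MultiHeadDispatch instance
y = mha(torch.randn(2, 16, 384))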
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (3.57 kB).
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/activations.cpython-311.pyc ADDED
Binary file (4.59 kB).
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/input_projection.cpython-311.pyc ADDED
Binary file (3.97 kB).
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/multi_head_dispatch.cpython-311.pyc ADDED
Binary file (12.6 kB).
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/patch_embedding.cpython-311.pyc ADDED
Binary file (4.53 kB).
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/residual.cpython-311.pyc ADDED
Binary file (9.56 kB).
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/reversible.cpython-311.pyc ADDED
Binary file (9.78 kB).
.venv/lib/python3.11/site-packages/xformers/components/__pycache__/simplicial_embedding.cpython-311.pyc ADDED
Binary file (3.5 kB).
.venv/lib/python3.11/site-packages/xformers/components/activations.py ADDED
@@ -0,0 +1,76 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ from enum import Enum
+ from typing import Optional
+
+ import torch
+ from torch import nn
+
+ from xformers._deprecation_warning import deprecated_function
+
+
+ class Activation(str, Enum):
+     SquaredReLU = "squared_relu"
+     GeLU = "gelu"
+     LeakyReLU = "leaky_relu"
+     ReLU = "relu"
+     SmeLU = "smelu"
+     StarReLU = "star_relu"
+
+
+ # For unit testing / parity comparisons, probably not the fastest way
+ class SquaredReLU(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+         deprecated_function(self)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x_ = torch.nn.functional.relu(x)
+         return x_ * x_
+
+
+ class StarReLU(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+         deprecated_function(self)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x_ = torch.nn.functional.relu(x)
+         return 0.8944 * x_ * x_ - 0.4472
+
+
+ class SmeLU(nn.Module):
+     def __init__(self, beta: float = 2.0) -> None:
+         super().__init__()
+         self.beta = beta
+         deprecated_function(self)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         relu = torch.where(
+             x >= self.beta,
+             x,
+             torch.tensor([0.0], device=x.device, dtype=x.dtype),
+         )
+         return torch.where(
+             torch.abs(x) <= self.beta,
+             ((x + self.beta) ** 2).type_as(x) / (4.0 * self.beta),
+             relu,
+         )
+
+
+ def build_activation(activation: Optional[Activation]):
+     if not activation:
+         return nn.Identity()
+
+     return {
+         Activation.ReLU: nn.ReLU,
+         Activation.GeLU: nn.GELU,
+         Activation.LeakyReLU: nn.LeakyReLU,
+         Activation.SquaredReLU: SquaredReLU,
+         Activation.StarReLU: StarReLU,
+         Activation.SmeLU: SmeLU,
+     }[activation]()
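A quick sketch of how build_activation maps the enum onto a module (not part of the commit; the input sizes are illustrative):

import torch
from xformers.components.activations import Activation, build_activation

act = build_activation(Activation.SquaredReLU)
y = act(torch.randn(2, 4))         # relu(x) ** 2, element-wise
identity = build_activation(None)  # falsy input returns nn.Identity()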
.venv/lib/python3.11/site-packages/xformers/components/attention/attention_patterns.py ADDED
@@ -0,0 +1,295 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import math
+ from typing import List
+
+ import numpy as np
+ import torch
+
+ from xformers.components.attention.sparsity_config import (
+     BigBirdSparsityConfig,
+     BSLongformerSparsityConfig,
+     FixedSparsityConfig,
+     VariableSparsityConfig,
+ )
+
+
+ # generic nd cases
+ def _generate_nd_grid(*sizes):
+     coords = [torch.arange(s) for s in sizes]
+     return torch.meshgrid(*coords)
+
+
+ def local_nd_distance(*sizes, p=2.0, weights=None):
+     if weights is None:
+         weights = (1,) * len(sizes)
+     assert len(sizes) == len(weights)
+     grid = _generate_nd_grid(*sizes)
+     grid = [i.flatten() * w for i, w in zip(grid, weights)]
+     grid = torch.stack(grid, dim=1).float()
+     d = torch.cdist(grid, grid, p=p)
+     return d
+
+
+ def local_nd_gaussian_distribution(*sizes, sigma=1):
+     d = local_nd_distance(*sizes, p=2.0) ** 2
+     d = torch.exp(-0.5 * sigma ** (-2.0) * d)
+     return d
+
+
+ def local_nd_pattern(*sizes, distance, p=2.0):
+     d = local_nd_distance(*sizes, p=p)
+     return d < distance
+
+
+ def axial_nd_pattern(*sizes):
+     # axial is a special case with p=0 and distance=2
+     d = local_nd_distance(*sizes, p=0)
+     return d < 2
+
+
+ def random_pattern_from_probability_matrix(dist_matrix, nnz):
+     att = torch.zeros_like(dist_matrix, dtype=torch.bool)
+     # PyTorch multinomial wrongly doesn't support sampling when number of categories
+     # is > 2^24, arguing that it's because it's the max representable consecutive element
+     # in fp32 and that the kernels use float32. This is actually not true, and the kernels
+     # should work fine if double tensor is passed on CPU. This is a bug that was introduced
+     # in https://github.com/pytorch/pytorch/commit/bf04c2ca2f591d98ce57816f0ef0cd20a21bbf66
+     # when unifying the checks between CPU and CUDA. For now, just fall-back to numpy
+     if dist_matrix.numel() > 2**24:
+         dist_matrix = dist_matrix.double()
+         dist_matrix /= dist_matrix.sum()
+         idxs = np.random.choice(
+             dist_matrix.numel(), nnz, p=dist_matrix.flatten(), replace=False
+         )
+         idxs = torch.as_tensor(idxs)
+     else:
+         idxs = torch.multinomial(dist_matrix.flatten(), nnz, replacement=False)
+     att.view(-1)[idxs] = True
+     return att
+
+
+ def global_token_pattern(attention_query_mask: torch.Tensor) -> torch.Tensor:
+     assert attention_query_mask.ndim == 1
+     assert attention_query_mask.dtype == torch.bool
+     attention_query_mask = attention_query_mask[None, :]
+     mask = attention_query_mask | attention_query_mask.transpose(1, 0)
+     return mask
+
+
+ def random_pattern(attn_size: int, sparsity: float) -> torch.Tensor:
+     assert 0 < sparsity < 1
+     mask = torch.rand(attn_size, attn_size) > sparsity
+     return mask
+
+
+ # 1d-specific cases
+ def local_1d_pattern(attn_size: int, window_size: int) -> torch.Tensor:
+     assert (
+         window_size % 2 == 1
+     ), "The window size is assumed to be odd (counts self-attention + 2 wings)"
+     h_win_size = window_size // 2 + 1
+     return local_nd_pattern(attn_size, distance=h_win_size, p=1.0)
+
+
+ def causal_1d_pattern(attn_size: int) -> torch.Tensor:
+     mask = torch.tril(torch.ones(attn_size, attn_size, dtype=torch.bool))
+     return mask
+
+
+ # 2d-specific cases
+ def horizontal_axial_2d_distance(H, W, p=2.0):
+     d = local_nd_distance(H, W, p=p, weights=(1, 0))
+     return d
+
+
+ def vertical_axial_2d_distance(H, W, p=2.0):
+     d = local_nd_distance(H, W, p=p, weights=(0, 1))
+     return d
+
+
+ def local_2d_distance(H, W, p=2.0):
+     return local_nd_distance(H, W, p=p)
+
+
+ def local_2d_gausian_distribution(H, W, sigma=1):
+     return local_nd_gaussian_distribution(H, W, sigma=sigma)
+
+
+ def local_2d_pattern(H, W, distance, p=2.0):
+     return local_nd_pattern(H, W, distance=distance, p=p)
+
+
+ def axial_2d_pattern(H, W):
+     return axial_nd_pattern(H, W)
+
+
+ def swin_attention_pattern(H, W, window_size, shift_size=0):
+     assert H % window_size == 0
+     assert W % window_size == 0
+     assert 0 <= shift_size < window_size, "shift_size must in 0-window_size"
+
+     # input grid
+     i, j = _generate_nd_grid(H, W)
+     i, j = i + 0.5, j + 0.5
+
+     # anchors grid
+     # if shift is present, add extra element to the grid
+     # to account for the uneven partitioning
+     extra = int(shift_size % window_size != 0)
+     grid_h = H // window_size + extra
+     grid_w = W // window_size + extra
+
+     ii, jj = _generate_nd_grid(grid_h, grid_w)
+     # convert shift to be compatible with the paper representation
+     s = (-shift_size) % window_size
+     offset = window_size / 2 - s
+     ii = ii * window_size + offset
+     jj = jj * window_size + offset
+
+     input_coords = torch.stack([i.flatten(), j.flatten()], 1).float()
+     anchors_coords = torch.stack([ii.flatten(), jj.flatten()], 1).float()
+
+     anchor_id = torch.cdist(input_coords, anchors_coords, p=2).argmin(1)
+     mask = anchor_id[:, None] == anchor_id[None, :]
+     return mask
+
+
+ def dilated_2d_pattern(H, W, k=2):
+     """
+     Returns a 2d pattern that samples 1 every k elements in the attention mask.
+     Can be seen as a form of downsampling, where every pixel attends to a downsampled
+     version of the input.
+     """
+     d_h = local_nd_distance(H, W, p=1, weights=(1, 0))
+     d_w = local_nd_distance(H, W, p=1, weights=(0, 1))
+     d = (d_h.floor() % k == 0) & (d_w.floor() % k == 0)
+     return d
+
+
+ # Block sparse utils
+ def block_sparsify_tensor(x, mask, block_size):
+     """
+     Block sparsify a tensor, given a mask and block size
+     """
+     ret = torch.empty(
+         (x.size(0), mask.sum(), block_size, block_size), dtype=x.dtype, device=x.device
+     )
+
+     for idx, (h, i, j) in enumerate(zip(*mask.nonzero(as_tuple=True))):
+         ret[:, idx, :, :] = x[
+             :,
+             h,
+             i * block_size : (i + 1) * block_size,
+             j * block_size : (j + 1) * block_size,
+         ]
+     return ret
+
+
+ def pattern_to_layout(mask: torch.Tensor, block_size: int) -> torch.Tensor:
+     r"""
+     Given a mask pattern and blocksize, return the corresponding layout
+     which makes sure that all the positives in the mask are covered
+     """
+     assert mask.ndim >= 2, "We're expecting [Heads, Seq, Seq] or [Seq, Seq]"
+     _should_squeeze = False
+
+     if mask.ndim == 2:
+         mask = mask.unsqueeze(0)
+         _should_squeeze = True
+
+     assert (
+         mask.shape[1] % block_size == 0 and mask.shape[2] % block_size == 0
+     ), "We're only handling masks divisible by block_size"
+
+     # Now mark the mask
+     layout = torch.nn.functional.max_pool2d(
+         mask.to(torch.float), kernel_size=block_size, stride=block_size
+     )
+     layout = layout.to(torch.long)
+
+     if _should_squeeze:
+         layout.squeeze_(0)
+
+     return layout
+
+
+ def alibi_pattern(threshold: float, mask_shape: torch.Size) -> torch.Tensor:
+     r"""
+     Use the additive bias computation from ALiBi_ to generate a mask.
+     Note that this mask can in turn be used to generate a blocksparse attention computation layout
+
+     .. note: mask_shape is expected to hold the [heads, seq, seq] dimensions
+
+     .. _ALiBi: https://arxiv.org/pdf/2108.12409.pdf
+     """
+
+     # CREDITS: code snippet from Ofir Press, one of the authors
+
+     def get_slopes(n: int):
+         def get_slopes_power_of_2(n: int) -> List[float]:
+             start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+             ratio = start
+             return [start * ratio**i for i in range(n)]
+
+         # In the paper, we only train models that have 2^a heads for some a. This function has
+         # some good properties that only occur when the input is a power of 2. To maintain that even
+         # when the number of heads is not a power of 2, we use this workaround.
+         if math.log2(n).is_integer():
+             return get_slopes_power_of_2(n)
+         else:
+             closest_power_of_2 = 2 ** math.floor(math.log2(n))
+             return (
+                 get_slopes_power_of_2(closest_power_of_2)
+                 + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+             )
+
+     maxpos = mask_shape[1]
+     attn_heads = mask_shape[0]
+     slopes = torch.Tensor(get_slopes(attn_heads))
+
+     # In the next line, the part after the * is what constructs the diagonal matrix
+     # (right matrix in Figure 3 in the paper).
+     # If you run it you'll see that it doesn't exactly print out the same matrix as we have in Figure 3,
+     # but one where all rows are identical.
+     # This works because the softmax operation is invariant to translation,
+     # and our bias functions are always linear.
+     alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(maxpos).unsqueeze(
+         0
+     ).unsqueeze(0).expand(attn_heads, -1, -1)
+     alibi = alibi.view(attn_heads, 1, maxpos)
+
+     # Now threshold arbitrarily, report the mask
+     return alibi < threshold
+
+
+ def quick_fixed_layout(num_heads: int, block_size: int, seq_len: int):
+     config = FixedSparsityConfig(num_heads=num_heads, block_size=block_size)
+     return config.make_layout(seq_len)
+
+
+ def quick_variable_layout(num_heads: int, block_size: int, seq_len: int):
+     config = VariableSparsityConfig(num_heads=num_heads, block_size=block_size)
+     return config.make_layout(seq_len)
+
+
+ def quick_bigbird_layout(num_heads: int, block_size: int, seq_len: int):
+     config = BigBirdSparsityConfig(num_heads=num_heads, block_size=block_size)
+     return config.make_layout(seq_len)
+
+
+ def quick_bslongformer_layout(num_heads: int, block_size: int, seq_len: int):
+     config = BSLongformerSparsityConfig(num_heads=num_heads, block_size=block_size)
+     return config.make_layout(seq_len)
+
+
+ def layout_to_pattern(layout: torch.Tensor, block_size: int):
+     r"""
+     create a pattern of shape [heads, seq, seq] out of a blocksparse
+     layout of shape [heads, seq/block_size, seq/block_size]
+     """
+     return torch.kron(layout, torch.ones(block_size, block_size))
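The pattern helpers compose; a small sketch (not part of the commit, sizes illustrative) of going from a dense boolean pattern to a blocksparse layout and back:

import torch
from xformers.components.attention.attention_patterns import (
    layout_to_pattern,
    local_1d_pattern,
    pattern_to_layout,
)

mask = local_1d_pattern(attn_size=64, window_size=5)  # [64, 64] bool, 2-token wings
layout = pattern_to_layout(mask, block_size=16)       # [4, 4] long, 1 where a block has any True
coarse = layout_to_pattern(layout, block_size=16)     # [64, 64] block-granular cover of mask
assert bool(coarse.bool()[mask].all())                # every True position is covered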
.venv/lib/python3.11/site-packages/xformers/components/attention/core.py ADDED
@@ -0,0 +1,248 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import logging
+ import math
+ from contextlib import nullcontext
+ from typing import Optional, Union
+
+ import torch
+
+ from xformers import _has_cpp_library
+ from xformers.components.attention.attention_mask import AttentionMask
+
+ if _has_cpp_library:
+     from ._sputnik_sparse import SparseCS
+
+ logger = logging.getLogger("xformers")
+
+
+ def _create_random_sparsity(matrix, sparsity, divisible_by=4):
+     assert matrix.ndim == 3
+     keep = torch.rand_like(matrix[0], dtype=torch.float32) > sparsity
+     nonzero = torch.nonzero(keep)
+     nnz = nonzero.shape[0]
+     # NOTE: need to make it a multiple of 4 for sputnik
+     nonzero = nonzero[: (nnz - nnz % divisible_by)]
+     i, j = nonzero.unbind(1)
+     output = torch.zeros_like(matrix)
+     bdim = torch.arange(matrix.shape[0], device=matrix.device)[:, None]
+     output[bdim, i, j] = matrix[bdim, i, j]
+     return output
+
+
+ def _broadcast_batch(mask, batch_size):
+     if mask.ndim == 3:
+         return mask
+     assert mask.ndim == 2
+
+     mask = mask.coalesce()
+     values = mask.values()
+     indices = mask.indices()
+     nnz = len(values)
+     # strategy: repeat the indices and append the extra batch dimension to the indices
+     indices = indices.repeat(1, batch_size)
+     # now create the batch indices
+     batch_indices = torch.arange(batch_size, device=indices.device)
+     batch_indices = batch_indices[:, None].expand(batch_size, nnz).flatten()
+
+     # put them together
+     indices = torch.cat([batch_indices[None, :], indices], dim=0)
+
+     # now repeat the values
+     values = values.repeat(batch_size)
+
+     size = (batch_size,) + mask.shape
+
+     return torch.sparse_coo_tensor(indices, values, size)
+
+
+ def _matmul_with_mask(
+     a: torch.Tensor,
+     b: torch.Tensor,
+     mask: Optional[Union[torch.Tensor, "SparseCS"]],
+ ) -> torch.Tensor:
+     if mask is None:
+         return a @ b
+
+     if _has_cpp_library and mask.dtype == torch.bool:
+         if isinstance(mask, SparseCS):
+             return mask.matmul_with_mask(a, b)
+         if mask.is_sparse:
+             # perform broadcasting if needed
+             mask = _broadcast_batch(mask, a.shape[0])
+
+             # coalesced is not implemented for bool tensors, so need to cast
+             mask = mask.to(dtype=a.dtype)  # type: ignore  # mypy is missing the catch above
+
+         return torch.ops.xformers.matmul_with_mask(a, b, mask)
+
+     # Non optimized codepath
+     if _has_cpp_library:
+         assert not isinstance(mask, SparseCS)
+
+     att = a @ b
+     if mask.dtype == torch.bool:
+         assert not isinstance(mask, SparseCS)
+         if mask.ndim == 2:
+             mask = mask.unsqueeze(0).expand(att.shape[0], -1, -1)
+         # mask is presumed false == ignore
+         att[~mask] = float("-inf")
+     else:
+         # mask is presumed additive
+         # repeat if batch sizes don't match
+         if (
+             not isinstance(mask, SparseCS)
+             and mask.ndim == 3
+             and mask.shape[0] != att.shape[0]
+             and (att.shape[0] % mask.shape[0]) == 0
+         ):
+             repeat_factor = att.shape[0] // mask.shape[0]
+             mask = mask.repeat([repeat_factor, 1, 1])
+             logger.info("Mismatched batch dimensions for mask, repeating mask.")
+         att += mask
+     return att
+
+
+ def _softmax(a: torch.Tensor, causal: bool = False) -> torch.Tensor:
+     if _has_cpp_library and isinstance(a, SparseCS):
+         return a.softmax()
+
+     if a.is_sparse:
+         return torch.sparse.softmax(a, dim=a.ndim - 1)
+
+     return torch.softmax(a, dim=a.ndim - 1)
+
+
+ if _has_cpp_library:
+
+     class SparseBMM(torch.autograd.Function):
+         @staticmethod
+         def forward(ctx, a, b):
+             a = a.coalesce()
+             r = torch.bmm(a, b)
+             ctx.save_for_backward(a, b)
+             return r
+
+         @staticmethod
+         def backward(ctx, grad):
+             a, b = ctx.saved_tensors
+
+             # gradients w.r.t. a
+             ga = None
+             if ctx.needs_input_grad[0]:
+                 ga = torch.ops.xformers.matmul_with_mask(grad, b.transpose(-2, -1), a)
+
+             # gradients w.r.t. b
+             gb = None
+             if ctx.needs_input_grad[1]:
+                 gb = a.transpose(1, 2).bmm(grad)
+
+             return ga, gb
+
+     def _sparse_bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+         """
+         Batch matrix multiply between a sparse matrix and a dense matrix
+         """
+         assert a.ndim == b.ndim == 3
+         assert a.shape[0] == b.shape[0]
+         assert a.shape[2] == b.shape[1]
+         return SparseBMM.apply(a, b)
+
+
+ def bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+     if _has_cpp_library:
+         if isinstance(a, SparseCS):
+             return a.spmm(b)
+         if a.is_sparse:
+             return _sparse_bmm(a, b)
+     return a @ b
+
+
+ def _apply_dropout(att, dropout):
+     if dropout is None:
+         return att
+
+     # Dropout chokes on sparse tensors
+     if _has_cpp_library:
+         if isinstance(att, SparseCS):
+             values = att.values.clone()
+             values = dropout(values)
+             att = SparseCS.wrap(
+                 att.shape,
+                 values,
+                 att.row_indices,
+                 att.row_offsets,
+                 att.column_indices,
+                 att._transp_info,
+             )
+         elif att.is_sparse:
+             att = att.coalesce()
+             values = att.values().clone()  # protect against in-place dropout
+             values = dropout(values)
+             att = torch.sparse_coo_tensor(att.indices(), values, att.shape)
+         else:
+             # Simple dense case
+             att = dropout(att)
+
+         return att
+
+     # Non optimized vanilla dropout
+     att = dropout(att)
+     return att
+
+
+ def scaled_query_key_softmax(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     att_mask: Optional[Union[AttentionMask, "SparseCS", torch.Tensor]],
+ ) -> torch.Tensor:
+     # TODO assume we have (N, S, hs) instead of (B, nh, S, hs), with N = B x nh
+     # this is needed due to limitations in sparse_bmm for now
+
+     # Self-attend: (N, S, hs) x (N, hs, S) -> (N, S, S)
+     q = q / math.sqrt(k.size(-1))
+
+     # Matmul with mask
+     if att_mask is not None and isinstance(att_mask, AttentionMask):
+         # Additive mask
+         mask: Optional[Union[SparseCS, torch.Tensor]] = att_mask.values
+     else:
+         mask = att_mask
+
+     att = _matmul_with_mask(q, k.transpose(-2, -1), mask)
+
+     # Softmax to get the attention probabilities
+     is_causal = isinstance(att_mask, AttentionMask) and att_mask.is_causal
+     att = _softmax(att, causal=is_causal)
+     return att
+
+
+ def scaled_dot_product_attention(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     att_mask: Optional[Union[AttentionMask, "SparseCS", torch.Tensor]],
+     dropout: Optional[torch.nn.Module] = None,
+ ) -> torch.Tensor:
+     autocast_disabled = (
+         _has_cpp_library
+         and isinstance(att_mask, SparseCS)
+         or (att_mask is not None and att_mask.is_sparse)
+     )
+     with torch.amp.autocast("cuda", enabled=False) if autocast_disabled else nullcontext():  # type: ignore
+         if autocast_disabled:
+             q, k, v = q.float(), k.float(), v.float()
+
+         att = scaled_query_key_softmax(q, k, att_mask=att_mask)
+
+         # Optional dropout, could be part of the masking in the future
+         att = _apply_dropout(att, dropout)
+
+         # Get to the predicted values, for all heads
+         # y = att @ v  # (N, S, S) x (N, S, hs) -> (N, S, hs)
+         y = bmm(att, v)
+     return y
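Putting core.py together: per the TODO above, scaled_dot_product_attention expects inputs already folded to (batch x heads, seq, head_dim). A dense-path sketch (not part of the commit; sizes illustrative):

import torch
from xformers.components.attention.core import scaled_dot_product_attention

N, S, hs = 2, 16, 32  # (batch x heads, seq, head_dim)
q, k, v = (torch.randn(N, S, hs) for _ in range(3))
y = scaled_dot_product_attention(q, k, v, att_mask=None)  # (2, 16, 32)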
.venv/lib/python3.11/site-packages/xformers/components/attention/favor.py ADDED
@@ -0,0 +1,173 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+ import math
+ from dataclasses import dataclass
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ from torch.amp import autocast
+
+ from xformers.components.attention import Attention, AttentionConfig, register_attention
+ from xformers.components.attention.feature_maps import (
+     FeatureMap,
+     FeatureMapType,
+     SMHyperbolic,
+     SMOrf,
+     SMReg,
+ )
+
+ logger = logging.getLogger("xformers")
+
+
+ @dataclass
+ class FavorAttentionConfig(AttentionConfig):
+     causal: Optional[bool]
+     dim_features: Optional[int] = None  # The dimensions of the random features
+     dim_head: Optional[
+         int
+     ] = None  # The embedding dimension of the inputs. Only useful to get a dim_features estimate
+     iter_before_redraw: Optional[
+         int
+     ] = None  # The number of iterations before the random features are re-drawn from scratch
+     feature_map: Optional[FeatureMapType] = None
+
+
+ @register_attention("favor", FavorAttentionConfig)
+ class FavorAttention(Attention):
+     def __init__(
+         self,
+         causal: bool = False,
+         dropout: float = 0.0,
+         dim_features: Optional[int] = None,
+         dim_head: Optional[int] = None,
+         iter_before_redraw: Optional[int] = None,
+         feature_map_type: FeatureMapType = FeatureMapType.SMReg,
+         normalize_inputs: bool = False,
+         *_,
+         **__,
+     ):
+         r"""
+         Kernelized attention, as proposed in Performers_
+         ("Rethinking attention with performers." K. Choromanski et al. (2020).).
+
+         FAVOR stands for "Fast Attention Via positive Orthogonal Random features"
+
+         Args:
+             dropout (float): the probability of an output to be randomly dropped at training time
+             dim_features (int): the dimension of the random features space
+             iter_before_redraw (int): the number of steps (forward calls) before a redraw of the features
+             feature_map_type (FeatureMapType): the type of feature map being used,
+                 for instance orthogonal random features.
+
+         .. _Performers: https://arxiv.org/pdf/2009.14794v1.pdf
+         """
+         super().__init__()
+
+         self.causal = causal
+         self.iter_before_redraw = (
+             (2 * iter_before_redraw)
+             if iter_before_redraw is not None
+             else iter_before_redraw
+         )  # This will be used for both key and query
+         self.normalize_inputs = normalize_inputs
+         self.feature_map_type = feature_map_type
+         self.attn_drop = nn.Dropout(dropout, inplace=True)
+
+         # Setup dimension-dependent variables
+         # Reasonable dimension default
+         if dim_features is None:
+             assert dim_head is not None, "dim_features or dim_head needs to be passed"
+             self.dim_features = math.ceil(dim_head * (1 + math.log2(dim_head)))
+             self.dim_features = 2 * (
+                 self.dim_features // 2
+             )  # needs to be even for some variants
+             logger.info(
+                 f"FAVOR: Automatically setting the random mapping dimension to {self.dim_features} from {dim_head}"
+             )
+         else:
+             self.dim_features = dim_features
+
+         feature_map_constructor = {
+             FeatureMapType.SMHyp: SMHyperbolic,
+             FeatureMapType.SMReg: SMReg,
+             FeatureMapType.SMOrf: SMOrf,
+         }[self.feature_map_type]
+
+         feature_settings = {
+             "dim_features": self.dim_features,
+             "iter_before_redraw": self.iter_before_redraw,
+             "normalize_inputs": self.normalize_inputs,
+         }
+
+         self.feature_map: FeatureMap = feature_map_constructor(**feature_settings)  # type: ignore
+
+         # Properties specific to this attention mechanism
+         self.supports_attention_mask = False
+         self.supports_key_padding_mask = False
+
+     @staticmethod
+     def _maybe_promote(x: torch.Tensor) -> torch.Tensor:
+         # Only promote fp16 buffers, bfloat16 would be fine for instance
+         return x.float() if x.dtype == torch.float16 else x
+
+     @staticmethod
+     def _causal_attention(
+         k_prime: torch.Tensor, q_prime: torch.Tensor, v: torch.Tensor
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         # Algorithm 1 in the paper
+         ref_v = torch.ones_like(v.unsqueeze(2))  # BATCH x SEQ x 1 x EMB
+         Gps = k_prime.unsqueeze(3) * v.unsqueeze(2)
+         Grenorm = k_prime.unsqueeze(3) * ref_v
+
+         # Consolidate against the feature dimension
+         att_raw = torch.einsum("bcfe,bcf->bce", Gps, q_prime)
+         att_norm = torch.einsum("bcfe,bcf->bce", Grenorm, q_prime)
+
+         # Cumulative sum over the sequence
+         att_raw = att_raw.cumsum(2)
+         att_norm = att_norm.cumsum(2)
+
+         return att_raw, att_norm
+
+     def forward(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         *_,
+         **__,
+     ):
+
+         # Project key and queries onto the feature map space
+         k_prime = self.feature_map(k)
+         q_prime = self.feature_map(q)
+
+         with autocast("cuda", enabled=False):
+             # The softmax kernel approximation for Favor will easily overflow
+             # Force the computations here to stay in fp32 for numerical stability
+             # Note that the dimensions are vastly reduced when compared to scaled_dot_product
+             k_prime = self._maybe_promote(k_prime)
+             q_prime = self._maybe_promote(q_prime)
+             v = self._maybe_promote(v)
+
+             if not self.causal:
+                 att_normalization = q_prime @ (
+                     k_prime.transpose(-2, -1) @ torch.ones_like(v)
+                 )
+                 att_raw = q_prime @ (k_prime.transpose(-2, -1) @ v)
+             else:
+                 # Actually compute attention
+                 att_raw, att_normalization = self._causal_attention(k_prime, q_prime, v)
+
+             # Normalize
+             att = att_raw / att_normalization
+
+         if self.attn_drop is not None:
+             att = self.attn_drop(att)
+
+         return att
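A sketch of FavorAttention in isolation (not part of the commit; shapes illustrative). With dim_features unset, dim_head drives the automatic feature dimension computed in __init__; the feature map is drawn internally, so outputs are stochastic across redraws:

import torch
from xformers.components.attention.favor import FavorAttention

attn = FavorAttention(causal=False, dim_head=32)  # dim_features derived from dim_head
q = k = v = torch.randn(2, 16, 32)                # (batch x heads, seq, head_dim)
out = attn(q, k, v)                               # same shape as v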
.venv/lib/python3.11/site-packages/xformers/components/attention/fourier_mix.py ADDED
@@ -0,0 +1,35 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+ from torch.amp import autocast
+
+ from xformers.components.attention import Attention, AttentionConfig, register_attention
+
+
+ @register_attention("fourier_mix", AttentionConfig)
+ class FourierMix(Attention):
+     def __init__(self, dropout: float, *_, **__):
+         """
+         FFT-based pseudo-attention mechanism, from
+         "
+         "FNet: Mixing Tokens with Fourier Transforms"
+         Lee-Thorp et al., 2021, https://arxiv.org/pdf/2105.03824.pdf
+         """
+         super().__init__()
+         self.attn_drop = torch.nn.Dropout(dropout, inplace=False)
+
+         # Properties specific to this attention mechanism
+         self.supports_attention_mask = False
+         self.requires_input_projection = False
+
+     def forward(self, q: torch.Tensor, *_, **__):
+         # Guard against autocast / fp16, not supported by torch.fft.fft2
+         with autocast("cuda", enabled=False):
+             att = torch.fft.fft2(q).real
+
+         att = self.attn_drop(att)
+
+         return att
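FourierMix ignores k and v entirely (requires_input_projection is False); token mixing is just the real part of a 2D FFT over the last two dimensions. A sketch with illustrative sizes (not part of the commit):

import torch
from xformers.components.attention.fourier_mix import FourierMix

mixer = FourierMix(dropout=0.0)
x = torch.randn(2, 16, 32)  # (batch, seq, emb)
out = mixer(x)              # torch.fft.fft2(x).real, then dropout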
.venv/lib/python3.11/site-packages/xformers/components/attention/lambda_layer.py ADDED
@@ -0,0 +1,78 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ from dataclasses import dataclass
+
+ import torch
+
+ from xformers.components.attention import Attention, AttentionConfig, register_attention
+
+
+ def calc_rel_pos(n: int):
+     # Adapted from LucidRains
+     # https://github.com/lucidrains/lambda-networks/blob/main/lambda_networks/lambda_networks.py
+     rel_pos = torch.arange(n)[None, :] - torch.arange(n)[:, None]  # [n, n]
+     rel_pos += n - 1  # shift value range from [-n+1, n-1] to [0, 2n-2]
+     return rel_pos
+
+
+ @dataclass
+ class LambdaLayerConfig(AttentionConfig):
+     seq_len: int  # dimension of the input sequence
+     dim_head: int
+
+
+ @register_attention("lambda", LambdaLayerConfig)
+ class LambdaLayer(Attention):
+     def __init__(self, dropout: float, seq_len: int, dim_head: int, *_, **__):
+         """
+         Attention approximation using Lambda layers, from
+         "Lambda networks: modeling long-range interactions without attention.", Bello, I. (2021).
+         """
+         super().__init__()
+
+         # Possible extensions:
+         # - support different dimensions for key and queries
+         # - support varying dimensions in between inputs and outputs
+         # - support u hyperparam
+
+         self.rel_pos_emb = torch.nn.Parameter(
+             torch.randn(2 * seq_len - 1, int(dim_head))
+         )
+         self.rel_pos = calc_rel_pos(seq_len)
+         self.attn_drop = torch.nn.Dropout(dropout, inplace=True)
+
+         # Properties specific to this attention mechanism
+         self.requires_same_k_q_dimensions = True
+         self.supports_attention_mask = False
+         self.supports_key_padding_mask = False
+
+     def forward(
+         self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, *args, **kwargs
+     ):
+         """..NOTE: We're reusing the einsum notation suggested by the paper, changed in that
+         heads are folded in the batch dimension"""
+
+         content_lambda = torch.einsum("bnk,bnv->bkv", torch.softmax(k, dim=-1), v)
+         content_output = torch.einsum("bnk,bkv->bnv", q, content_lambda)
+
+         rel_pos_emb = self.rel_pos_emb[self.rel_pos]
+
+         # Handle real sequence length being possibly smaller
+         seq_len = q.shape[1]
+         rel_pos_emb = rel_pos_emb[:seq_len, :seq_len, :]
+
+         # Compute the position lambda for every possible combination in one go, then compute the
+         # position related contribution
+         position_lambdas = torch.einsum(
+             "mnk,bnv->bnkv", rel_pos_emb, v
+         )  # one lambda per position
+         position_output = (q.unsqueeze(2) @ position_lambdas).squeeze()
+         att = content_output + position_output
+
+         att = self.attn_drop(att)
+
+         return att
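LambdaLayer fixes the maximum sequence length at construction time via the relative positional embedding; shorter inputs are handled by the slicing in forward. A sketch (not part of the commit; sizes illustrative, heads folded into the batch dimension per the docstring):

import torch
from xformers.components.attention.lambda_layer import LambdaLayer

attn = LambdaLayer(dropout=0.0, seq_len=16, dim_head=32)
q = torch.randn(2, 16, 32)
k = torch.randn(2, 16, 32)
v = torch.randn(2, 16, 32)
out = attn(q, k, v)  # (2, 16, 32)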
.venv/lib/python3.11/site-packages/xformers/components/attention/local.py ADDED
@@ -0,0 +1,120 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ from dataclasses import dataclass
+ from typing import Optional, Union
+
+ import torch
+ import torch.nn as nn
+
+ from xformers.components.attention import (
+     Attention,
+     AttentionConfig,
+     AttentionMask,
+     maybe_sparsify,
+     register_attention,
+     sparsify,
+ )
+ from xformers.components.attention.attention_patterns import (
+     causal_1d_pattern,
+     local_1d_pattern,
+ )
+ from xformers.components.attention.core import scaled_dot_product_attention
+
+
+ @dataclass
+ class LocalAttentionConfig(AttentionConfig):
+     causal: Optional[bool] = None
+     window_size: Optional[int] = None
+     force_sparsity: Optional[bool] = None
+
+
+ @register_attention("local", LocalAttentionConfig)
+ class LocalAttention(Attention):
+     def __init__(
+         self,
+         dropout: float = 0.0,
+         causal: bool = False,
+         window_size: int = 5,
+         force_sparsity: bool = False,
+         *args,
+         **kwargs,
+     ):
+
+         r"""
+         An implementation of a sliding window attention, as proposed in RoutingTransformer_, LongFormer_ or BigBird_
+
+
+         Args:
+             dropout (float): the probability of an output to be randomly dropped at training time
+             causal (bool): apply a causal mask, in that the attention cannot be applied to the future
+             window_size (int): the overall window size for local attention.
+                 Odd number is expected if the mask is not causal, as the window size will be evenly
+                 distributed on both sides of each query
+
+
+         .. _RoutingTransformer: https://arxiv.org/pdf/2003.05997.pdf
+
+         .. _BigBird: https://arxiv.org/pdf/2007.14062.pdf
+
+         .. _Longformer: https://arxiv.org/pdf/2004.05150.pdf
+
+         """
+         super().__init__()
+
+         self.attn_drop = nn.Dropout(dropout, inplace=False)
+         self.causal = causal
+         self.force_sparsity = force_sparsity
+
+         if not self.causal:
+             assert (
+                 window_size % 2 == 1
+             ), "The window size is assumed to be odd (counts self-attention + 2 wings)"
+
+         self.window_size = window_size
+         self.attention_mask: Optional[torch.Tensor] = None
+         self.requires_same_k_q_dimensions = True
+
+         # Properties specific to this attention mechanism
+         self.supports_attention_mask = True
+         self.supports_key_padding_mask = False
+
+     def _get_local_mask(self, shape: torch.Size) -> torch.Tensor:
+         window_size = self.window_size * 2 + 1 if self.causal else self.window_size
+         mask = local_1d_pattern(shape[1], window_size)
+
+         if self.causal:
+             mask &= causal_1d_pattern(shape[1])
+
+         mask = sparsify(mask) if self.force_sparsity else maybe_sparsify(mask)
+
+         return mask
+
+     def forward(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         att_mask: Optional[Union[torch.Tensor, AttentionMask]] = None,
+         *args,
+         **kwargs,
+     ):
+         # Local window attention masking
+         if self.attention_mask is None or self.attention_mask.shape[1] != q.shape[1]:
+             self.attention_mask = self._get_local_mask(q.shape).to(q.device)
+
+         # Take into account the optional user mask
+         if att_mask is None:
+             mask = self.attention_mask
+         else:
+             if isinstance(att_mask, AttentionMask):
+                 # Needed because & op not defined for SparseCS with AttentionMask
+                 att_mask = att_mask.to_bool()
+             mask = self.attention_mask & att_mask
+
+         return scaled_dot_product_attention(
+             q=q, k=k, v=v, att_mask=mask, dropout=self.attn_drop
+         )
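A sketch of LocalAttention (not part of the commit; sizes illustrative). The local mask is built lazily on the first forward and rebuilt when the sequence length changes; window_size=5 means each query sees itself plus two tokens per side:

import torch
from xformers.components.attention.local import LocalAttention

attn = LocalAttention(dropout=0.0, causal=False, window_size=5)
q = k = v = torch.randn(2, 16, 32)  # (batch x heads, seq, head_dim)
out = attn(q, k, v)                 # sliding-window attention, (2, 16, 32)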
.venv/lib/python3.11/site-packages/xformers/components/attention/nystrom.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2
+ #
3
+ # This source code is licensed under the BSD license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import logging
8
+ from dataclasses import dataclass
9
+ from typing import Optional
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ from xformers.components.attention import Attention, AttentionConfig, register_attention
15
+ from xformers.components.attention.core import (
16
+ scaled_dot_product_attention,
17
+ scaled_query_key_softmax,
18
+ )
19
+ from xformers.components.attention.utils import (
20
+ bool_mask_to_additive,
21
+ iterative_pinv,
22
+ reshape_key_padding_mask,
23
+ )
24
+
25
+ logger = logging.getLogger("xformers")
26
+
27
+
28
+ @dataclass
29
+ class NystromSelfAttentionConfig(AttentionConfig):
30
+ """
31
+ num_heads Number of heads.
32
+ num_landmarks Number of landmarks to use for softmax approximation. 64 often sufficient for a good
33
+ approximation according to https://arxiv.org/pdf/2102.03902.pdf.
34
+ causal Apply a causal mask, in that the attention cannot be applied to the future.
35
+ use_razavi_pinverse If true, use iterative method from (Razavi et al. 2014) to approximate the Moore-Penrose
36
+ inverse, otherwise use standard torch inverse.
37
+ pinverse_original_init True if using original initialization when calculating Moore-Penrose pseudo inverse using
38
+ method from (Razavi et al. 2014).
39
+ False if using exact coefficient computation (leads to faster convergence).
40
+ inv_iterations Number of iterations for calculating the Moore-Penrose pseudo inverse.
41
+ v_skip_connection A module that will take V as input and will be added as a skip connection to the
42
+ softmax approximation. A skip connection is added in the paper to help with training.
43
+ conv_kernel_size Kernel size for convolution optionally added to help in training.
44
+ If v_skip_connection is not specified, this will be used to define the default
45
+ depth wise convolution used as a skip connection.
46
+ If both conv_kernel_size and v_skip_connection are None, no skip connection will
47
+ be added.
48
+ landmark_pooling Which module to use when computing landmarks. Default is AdaptiveAvgPool2d.
49
+ """
50
+
51
+ num_heads: int
52
+ num_landmarks: Optional[int]
53
+ landmark_pooling: Optional[nn.Module]
54
+ causal: Optional[bool]
55
+ pinverse_original_init: Optional[bool]
56
+ inv_iterations: Optional[int]
57
+ v_skip_connection: Optional[nn.Module]
58
+ conv_kernel_size: Optional[int]
59
+ use_razavi_pinverse: Optional[bool]
60
+
61
+
62
+ class AvgPool(nn.Module):
63
+ def __init__(self, n: int):
64
+ super().__init__()
65
+ self.n = n
66
+
67
+ def forward(self, x: torch.Tensor):
68
+ # Average independently for every segment in the sequence dimension
69
+ seq_len = x.shape[1]
70
+ head_dim = x.shape[2]
71
+ segments = seq_len // self.n
72
+ assert segments > 0, "num_landmarks should be smaller than the sequence length"
73
+
74
+ # Dimensions are a match
75
+ if seq_len % self.n == 0:
76
+ return x.reshape(
77
+ -1,
78
+ self.n,
79
+ segments,
80
+ head_dim,
81
+ ).mean(dim=-2)
82
+
83
+ # Handle the last segment boundary being off
84
+ n_round = self.n - seq_len % self.n
85
+
86
+ x_avg_round = (
87
+ x[:, : n_round * segments, :]
88
+ .reshape(-1, n_round, segments, head_dim)
89
+ .mean(dim=-2)
90
+ )
91
+ x_avg_off = (
92
+ x[:, n_round * segments :, :]
93
+ .reshape(-1, self.n - n_round, segments + 1, head_dim)
94
+ .mean(dim=-2)
95
+ )
96
+ return torch.cat((x_avg_round, x_avg_off), dim=-2)
97
+
98
+
99
+ @register_attention("nystrom", NystromSelfAttentionConfig)
100
+ class NystromAttention(Attention):
101
+ # TODO: update defaults for use_razavi_pinverse and inv_iterations
102
+ def __init__(
103
+ self,
104
+ dropout: float,
105
+ num_heads: int,
106
+ num_landmarks: int = 64,
107
+ landmark_pooling: Optional[nn.Module] = None,
108
+ causal: bool = False,
109
+ use_razavi_pinverse: bool = True,
110
+ pinverse_original_init: bool = False,
111
+ inv_iterations: int = 6, # recommended default in paper was 6.
112
+ v_skip_connection: Optional[nn.Module] = None,
113
+ conv_kernel_size: Optional[int] = None,
114
+ *args,
115
+ **kwargs,
116
+ ):
117
+ """
118
+ Nystrom attention mechanism, from Nystromformer_.
119
+ ::
120
+
121
+ "A Nystrom-based Algorithm for Approximating Self-Attention."
122
+ Xiong, Y., Zeng, Z., Chakraborty, R., Tan, M., Fung, G., Li, Y., Singh, V. (2021)
123
+
124
+ Reference codebase: https://github.com/mlpen/Nystromformer
125
+
126
+ .. _Nystromformer: https://arxiv.org/pdf/2102.03902.pdf
127
+
128
+ """
129
+ super().__init__()
130
+ # merged key padding mask and attention mask is not accepted
131
+ self.requires_separate_masks = True
132
+ self.num_landmarks = num_landmarks
133
+ # TODO: should be able to not have to pass in num_heads
134
+ self.num_heads = num_heads
135
+ self.use_razavi_pinverse = use_razavi_pinverse
136
+ self.pinverse_original_init = pinverse_original_init
137
+ self.inv_iterations = inv_iterations
138
+ self.attn_drop = nn.Dropout(dropout)
139
+ self.skip_connection = v_skip_connection
140
+ self.causal = causal
141
+
142
+ if self.skip_connection is None and conv_kernel_size is not None:
143
+ self.skip_connection = nn.Conv2d(
144
+ in_channels=self.num_heads,
145
+ out_channels=self.num_heads,
146
+ kernel_size=(conv_kernel_size, 1),
147
+ padding=(conv_kernel_size // 2, 0),
148
+ bias=False,
149
+ groups=self.num_heads,
150
+ )
151
+
152
+ if landmark_pooling is not None:
153
+ self.landmark_pooling = landmark_pooling
154
+ else:
155
+ self.landmark_pooling = AvgPool(n=self.num_landmarks)
156
+
157
+ # Optional lower triangular masks for causal attention
158
+ self.causal_mask_1: Optional[torch.Tensor] = None
159
+ self.causal_mask_2: Optional[torch.Tensor] = None
160
+ self.causal_mask_3: Optional[torch.Tensor] = None
161
+
+        # This attention does not support attention masks
+        self.supports_attention_mask = False
+        self.supports_key_padding_mask = True
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        key_padding_mask: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ):
+        r"""
+        key_padding_mask    Only a key padding mask is accepted here. The size must be (batch size, sequence length) or
+                            (batch size * num_heads, 1, sequence length). If dimensions are not correct, the mask will
+                            be ignored. An additive mask is expected, meaning float values using "-inf" to mask values
+        """
+
+        batched_dim = k.size(0)
+        seq_len = k.size(-2)
+        tt = {"dtype": q.dtype, "device": q.device}
+
+        if key_padding_mask is not None:
+            if key_padding_mask.dtype == torch.bool:
+                logger.warning(
+                    "Bool mask found, but an additive mask is expected. Converting but this is slow"
+                )
+
+                key_padding_mask = bool_mask_to_additive(key_padding_mask)
+
+            if key_padding_mask.ndim == 2:
+                key_padding_mask = reshape_key_padding_mask(
+                    key_padding_mask, batched_dim
+                )
+
+            zeros = torch.zeros_like(key_padding_mask)
+            ones = torch.ones_like(key_padding_mask)
+            is_masked = torch.isinf(-key_padding_mask)
+
+            # _mask takes 1 if the token is not padded, otherwise 0.
+            _mask = torch.where(is_masked, zeros, ones)
+            _mask = _mask.transpose(2, 1)
+            assert _mask.shape == (batched_dim, q.shape[1], 1)
+
+            # Mask q and k before pooling
+            # https://github.com/mlpen/Nystromformer/blob/main/code/attention_nystrom.py#L31
+            q = q * _mask
+            k = k * _mask
+
+            assert key_padding_mask.size() == (batched_dim, 1, seq_len), (
+                f"key_padding_mask has invalid dimensions {key_padding_mask.size()}."
+                f" Must have dimensions {batched_dim, 1, seq_len} or (batch_size, {seq_len})."
+            )
+
+        if self.num_landmarks >= seq_len:
+            mask: Optional[torch.Tensor] = None
+
+            if self.causal:
+                mask = self._triu_mask(batched_dim, seq_len, seq_len, **tt)
+
+            if key_padding_mask is not None:
+                mask = key_padding_mask if mask is None else mask + key_padding_mask
+
+            x = scaled_dot_product_attention(q=q, k=k, v=v, att_mask=mask)
+
+        else:
+            q_landmarks = self.landmark_pooling(q)
+            k_landmarks = self.landmark_pooling(k)
+
+            if self.causal and (
+                self.causal_mask_1 is None
+                or (batched_dim, seq_len, self.num_landmarks)
+                != self.causal_mask_1.size()
+            ):
+                self.causal_mask_1 = self._triu_mask(
+                    batched_dim, seq_len, self.num_landmarks, **tt
+                )
+                self.causal_mask_2 = self._triu_mask(
+                    batched_dim, self.num_landmarks, self.num_landmarks, **tt
+                )
+                self.causal_mask_3 = self._triu_mask(
+                    batched_dim, self.num_landmarks, seq_len, **tt
+                )
+
+            mask_3: Optional[torch.Tensor] = self.causal_mask_3
+            if key_padding_mask is not None:
+                mask_3 = (
+                    key_padding_mask if mask_3 is None else mask_3 + key_padding_mask
+                )
+
+            kernel_1 = scaled_query_key_softmax(q=q, k=k_landmarks, att_mask=None)
+            kernel_2 = scaled_query_key_softmax(
+                q=q_landmarks, k=k_landmarks, att_mask=None
+            )
+            kernel_3 = scaled_dot_product_attention(
+                q=q_landmarks, k=k, v=v, att_mask=mask_3
+            )
+
+            kernel_2_inv = (
+                iterative_pinv(
+                    kernel_2, self.inv_iterations, self.pinverse_original_init
+                )
+                if self.use_razavi_pinverse
+                else torch.linalg.pinv(kernel_2)
+            )
+
+            x = torch.matmul(
+                torch.matmul(
+                    kernel_1,
+                    kernel_2_inv,
+                ),
+                kernel_3,
+            )
+
+        if self.skip_connection:
+            # Assumption here is that v is 3D.
+            v_conv = self.skip_connection(
+                v.reshape(-1, self.num_heads, v.size(-2), v.size(-1))
+            )
+            x += v_conv.reshape(-1, v_conv.size(-2), v_conv.size(-1))
+        x = self.attn_drop(x)
+        return x
+
+    def _triu_mask(self, dim_1: int, dim_2: int, dim_3: int, **kwargs) -> torch.Tensor:
+        device = kwargs["device"]
+        dtype = kwargs["dtype"]
+
+        return torch.triu(
+            torch.ones(dim_2, dim_3, dtype=dtype, device=device) * float("-inf"),
+            diagonal=1,
+        ).expand(
+            dim_1, -1, -1
+        )  # micro optim, save memory on the batch dimension
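For reference, the three-kernel product computed above amounts to softmax(q k_l^T) * pinv(softmax(q_l k_l^T)) * softmax(q_l k^T) v. A minimal sketch of that approximation, assuming 3D (batch * heads, seq, head_dim) inputs, a sequence length divisible by the landmark count, and segment-wise mean pooling as a stand-in for self.landmark_pooling:

import torch

def nystrom_approximation(q, k, v, num_landmarks: int = 16):
    # Landmarks through segment-wise mean pooling (a stand-in, not the exact library op)
    b, s, d = q.shape
    q_l = q.reshape(b, num_landmarks, s // num_landmarks, d).mean(dim=-2)
    k_l = k.reshape(b, num_landmarks, s // num_landmarks, d).mean(dim=-2)

    scale = d ** -0.5
    kernel_1 = torch.softmax(scale * q @ k_l.transpose(-2, -1), dim=-1)
    kernel_2 = torch.softmax(scale * q_l @ k_l.transpose(-2, -1), dim=-1)
    kernel_3 = torch.softmax(scale * q_l @ k.transpose(-2, -1), dim=-1) @ v

    # torch.linalg.pinv stands in for the iterative Razavi pseudo-inverse used above
    return kernel_1 @ torch.linalg.pinv(kernel_2) @ kernel_3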
.venv/lib/python3.11/site-packages/xformers/components/attention/random.py ADDED
@@ -0,0 +1,126 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from xformers.components.attention import (
+    Attention,
+    AttentionConfig,
+    AttentionMask,
+    maybe_sparsify,
+    register_attention,
+    sparsify,
+)
+from xformers.components.attention.attention_patterns import (
+    causal_1d_pattern,
+    random_pattern,
+)
+from xformers.components.attention.core import scaled_dot_product_attention
+
+
+@dataclass
+class RandomAttentionConfig(AttentionConfig):
+    r: Optional[
+        float
+    ]  # the ratio of keys that the query can attend to. 1.0 means dense attention
+    constant_masking: Optional[
+        bool
+    ]  # whether the randomness is per query or defined at construction time
+    force_sparsity: Optional[bool]  # use sparsity in any case (potentially slower)
+
+
+@register_attention("random", RandomAttentionConfig)
+class RandomAttention(Attention):
+    def __init__(
+        self,
+        dropout: float,
+        causal: bool = False,
+        r: float = 0.01,
+        constant_masking: bool = True,
+        force_sparsity: bool = False,
+        *args,
+        **kwargs,
+    ):
+        """
+        "Random" attention, as proposed for instance in BigBird_.
+        "Random" means here that each query can attend to a random set of keys.
+        This implementation is sparse-aware, meaning that the empty attention parts will not be represented in memory.
+
+        Args:
+            r (float): the ratio in [0,1] of keys that the query can attend to
+            constant_masking (bool): if true, keep the same random set for all queries.
+
+        .. _BigBird: https://arxiv.org/pdf/2007.14062.pdf
+
+        """
+        super().__init__()
+
+        self.attn_drop = nn.Dropout(dropout, inplace=False)
+        self.causal = causal
+        self.r = r
+        self.rand_attention_mask: Optional[torch.Tensor] = None
+        self.constant_masking = constant_masking
+        self.force_sparsity = force_sparsity
+
+        # Properties specific to this attention mechanism
+        self.supports_attention_mask = True
+        self.supports_key_padding_mask = False
+
+        self.requires_same_k_q_dimensions = True
+
+    def _get_rand_mask(self, shape: torch.Size) -> torch.Tensor:
+        sparsity = 1 - self.r
+        mask = random_pattern(shape[1], sparsity=sparsity)
+
+        if self.causal:
+            mask &= causal_1d_pattern(shape[1])
+
+        mask = sparsify(mask) if self.force_sparsity else maybe_sparsify(mask)
+
+        return mask
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        att_mask: Optional[Union[torch.Tensor, AttentionMask]] = None,
+        *args,
+        **kwargs,
+    ):
+        # Random masking
+        if not self.constant_masking or self.rand_attention_mask is None:
+            self.rand_attention_mask = self._get_rand_mask(q.shape).to(q.device)
+
+        # Mask-aware attention
+        if att_mask is not None:
+            if att_mask.dtype == torch.bool and isinstance(
+                self.rand_attention_mask, AttentionMask
+            ):
+                mask = self.rand_attention_mask + AttentionMask.from_bool(att_mask)
+            else:
+                if isinstance(att_mask, AttentionMask):
+                    # Needed because the & operator is not defined between SparseCS and AttentionMask
+                    att_mask = att_mask.to_bool()
+                mask = self.rand_attention_mask & att_mask
+        else:
+            mask = self.rand_attention_mask
+
+        # Handle q/k/v which would not fit the mask
+        seq_len = q.shape[-2]
+        q_, k_, v_ = map(lambda x: self._maybe_pad_sequence(x, mask), (q, k, v))
+
+        # Normal attention with the random mask
+        att = scaled_dot_product_attention(
+            q=q_, k=k_, v=v_, att_mask=mask, dropout=self.attn_drop
+        )
+
+        # Take into account a hypothetical padding
+        return att[:, :seq_len, :]
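A minimal usage sketch, assuming inputs already folded to (batch * heads, seq, head_dim); with r=0.05 each query attends to roughly 5% of the keys:

import torch
from xformers.components.attention.random import RandomAttention

attn = RandomAttention(dropout=0.0, causal=False, r=0.05)
q = k = v = torch.randn(2, 128, 64)  # (batch * heads, seq, head_dim)
out = attn(q, k, v)
assert out.shape == q.shape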
.venv/lib/python3.11/site-packages/xformers/components/attention/scaled_dot_product.py ADDED
@@ -0,0 +1,134 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from xformers.components.attention import (
+    Attention,
+    AttentionConfig,
+    AttentionMask,
+    register_attention,
+)
+from xformers.components.attention.core import scaled_dot_product_attention
+
+logger = logging.getLogger("xformers")
+
+
+@dataclass
+class ScaledDotProductConfig(AttentionConfig):
+    causal: Optional[bool]
+    seq_len: Optional[int]
+    to_seq_len: Optional[int]
+
+
+@register_attention("scaled_dot_product", ScaledDotProductConfig)
+class ScaledDotProduct(Attention):
+    r"""
+    Implementing the Scaled Dot-Product attention proposed in
+    `Attention is all you need`_, Vaswani et al.
+
+    .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762v5
+    """
+
+    mask: Optional[AttentionMask]
+
+    def __init__(
+        self,
+        dropout: float = 0.0,
+        causal: bool = False,
+        seq_len: Optional[int] = None,
+        to_seq_len: Optional[int] = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.attn_drop = nn.Dropout(dropout, inplace=False)
+        self.causal = causal
+        self.seq_len = seq_len
+
+        if causal and seq_len is not None:
+            self.mask = AttentionMask.make_causal(seq_len, to_seq_len)
+        else:
+            self.mask = None
+
+        # Properties specific to this attention mechanism
+        self.supports_attention_mask = True
+        self.supports_key_padding_mask = False
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        att_mask: Optional[Union[AttentionMask, torch.Tensor]] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        att_mask    A 2D or 3D mask which ignores attention at certain positions.
+
+                    - If the mask is boolean, a value of True will keep the value,
+                      while a value of False will mask the value.
+
+                      Key padding masks (dimension: batch x sequence length) and attention masks
+                      (dimension: sequence length x sequence length OR batch x sequence length x sequence length)
+                      can be combined and passed in here. The method maybe_merge_masks provided in the utils can be
+                      used for that merging.
+
+                    - If the mask has the float type, then an additive mask is expected (masked values are -inf)
+
+        """
+
+        # Convenience, create an attention mask if a tensor was passed
+        if att_mask is not None and isinstance(att_mask, torch.Tensor):
+            # By default we don't know of the causality, and a check would be expensive
+            att_mask = (
+                AttentionMask.from_bool(att_mask)
+                if att_mask.dtype == torch.bool
+                else AttentionMask(att_mask, is_causal=False)
+            )
+
+        # Handle a possibly deferred causal mask creation
+        mask = self.mask
+        if self.causal and self.mask is None:
+            mask = AttentionMask.make_causal(
+                seq_len=q.shape[-2],
+                to_seq_len=q.shape[-2],
+                device=q.device,
+                dtype=q.dtype,
+            )
+
+        # Merge the optional causal mask and the user-provided mask
+        if mask is not None:
+            mask = mask.to(dtype=q.dtype, device=q.device)
+
+            att_mask = att_mask + mask if att_mask is not None else mask
+
+        # Try to handle a case where the sequence is smaller than the mask
+        if (
+            att_mask is not None
+            and q.shape[-2] == k.shape[-2]
+            and q.shape[-2] < att_mask.shape[1]
+        ):
+            if isinstance(att_mask, AttentionMask):
+                att_mask = att_mask.make_crop(seq_len=q.shape[-2])
+            else:
+                logger.error(
+                    "Mismatching sparse attention mask and sequence length."
+                    + " Please pad the inputs or adjust the attention mask"
+                )
+                raise NotImplementedError
+
+        # Attend: (B x nh, S, hs) x (B x nh, hs, S) -> (B x nh, S, S)
+        y = scaled_dot_product_attention(
+            q=q, k=k, v=v, att_mask=att_mask, dropout=self.attn_drop
+        )
+        return y
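A minimal usage sketch; with causal=True and no seq_len given at construction, the causal mask is built lazily from the query's sequence length inside forward:

import torch
from xformers.components.attention.scaled_dot_product import ScaledDotProduct

attn = ScaledDotProduct(dropout=0.1, causal=True)
q = k = v = torch.randn(4, 16, 32)  # (batch * heads, seq, head_dim)
y = attn(q, k, v)                   # the causal mask is created on the fly
assert y.shape == (4, 16, 32)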
.venv/lib/python3.11/site-packages/xformers/components/attention/visual.py ADDED
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import math
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+
+from xformers.components.attention import Attention, AttentionConfig, register_attention
+
+
+@dataclass
+class VisualAttentionConfig(AttentionConfig):
+    dim_model: int  # dimension of the input sequence
+
+
+class LKA(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.conv0 = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
+        self.conv_spatial = nn.Conv2d(
+            dim, dim, 7, stride=1, padding=9, groups=dim, dilation=3
+        )
+        self.conv1 = nn.Conv2d(dim, dim, 1)
+
+    def forward(self, x: torch.Tensor):
+        u = x.clone()
+        attn = self.conv0(x)
+        attn = self.conv_spatial(attn)
+        attn = self.conv1(attn)
+
+        return u * attn
+
+
+@register_attention("visual", VisualAttentionConfig)
+class Visual(Attention):
+    def __init__(
+        self,
+        dim_model: int,
+        *_,
+        **__,
+    ):
+        """
+        Large kernel attention mechanism, as proposed in `Visual Attention Network`_, Guo et al (2022).
+        The original notation is tentatively kept as is. See https://github.com/Visual-Attention-Network
+        for the reference implementation
+
+        .. Note: compared to the paper, this block contains the LKA (Large Kernel Attention)
+            and the prior and posterior transformations (Conv2d and activation)
+
+        .. _`Visual Attention Network` : https://arxiv.org/pdf/2202.09741.pdf
+        """
+        super().__init__()
+
+        self.block = nn.Sequential(
+            nn.Conv2d(dim_model, dim_model, 1),
+            nn.GELU(),
+            LKA(dim_model),
+            nn.Conv2d(dim_model, dim_model, 1),
+        )
+
+        # MHA related flags:
+        self.requires_same_k_q_dimensions = (
+            True  # This mechanism only really supports self attention
+        )
+        self.supports_attention_mask = False
+        self.requires_skip_multi_head = (
+            True  # This mechanism skips the multihead attention altogether
+        )
+        self.requires_squared_context = (
+            True  # Recovering the 2D structure from the context assumes a squared context
+        )
+
+        self.requires_input_projection = (
+            False  # This mechanism does not require that the MHA projects inputs
+        )
+
+    def forward(self, q: torch.Tensor, *_, **__):
+        # Expose the 2D token structure
+        B, HW, C = q.shape
+        H = int(math.sqrt(HW))
+        assert H * H == HW
+
+        x = q.transpose(-2, -1).reshape(B, C, H, H)
+
+        # Large kernel attention
+        residual = x.clone()
+        x = self.block(x)
+        x = x + residual
+
+        # Get back to B HW C
+        return x.flatten(2, 3).transpose(-2, -1)
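The LKA stack (5x5 depthwise, then 7x7 depthwise with dilation 3, then 1x1) approximates the 21x21 large kernel decomposition from the paper. A minimal usage sketch, assuming a square token grid:

import torch
from xformers.components.attention.visual import Visual

va = Visual(dim_model=64)
tokens = torch.randn(2, 14 * 14, 64)  # a squared context: 14 x 14 patches
out = va(tokens)                      # reshaped to 2D, attended, flattened back
assert out.shape == tokens.shape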
.venv/lib/python3.11/site-packages/xformers/components/input_projection.py ADDED
@@ -0,0 +1,102 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+# CREDITS: Inspired by https://github.com/pytorch/text/blob/master/torchtext/nn/modules/multiheadattention.py
+# and the MultiHeadAttention implementation from PyTorch
+
+
+import logging
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+
+from xformers._deprecation_warning import deprecated_function
+
+logger = logging.getLogger("xformers")
+
+
+@dataclass
+class InputProjectionConfig:
+    in_features: int
+    out_features: int
+    bias: bool
+
+
+class InputProjection(nn.Module):
+    """
+    Handle all the input projections in one go, opportunistically fuse some operations.
+    """
+
+    def __init__(
+        self,
+        query_proj_params: InputProjectionConfig,
+        key_proj_params: Optional[InputProjectionConfig],
+        value_proj_params: Optional[InputProjectionConfig],
+        use_separate_proj_weight: bool = True,
+    ):
+        super().__init__()
+        deprecated_function(self)
+
+        self.out_features = query_proj_params.out_features
+
+        # Each input gets a separate projection
+        self.q_proj = nn.Linear(
+            query_proj_params.in_features,
+            query_proj_params.out_features,
+            query_proj_params.bias,
+        )
+
+        if key_proj_params is not None:
+            self.k_proj = nn.Linear(
+                key_proj_params.in_features,
+                key_proj_params.out_features,
+                key_proj_params.bias,
+            )
+        else:
+            logger.info(
+                "No Key projection parameters were passed, assuming that the weights"
+                + " are shared with the query projection"
+            )
+            self.k_proj = self.q_proj
+
+        if value_proj_params is not None:
+            self.v_proj = nn.Linear(
+                value_proj_params.in_features,
+                value_proj_params.out_features,
+                value_proj_params.bias,
+            )
+        else:
+            logger.info(
+                "No Value projection parameters were passed, assuming that the weights"
+                + " are shared with the query projection"
+            )
+            self.v_proj = self.q_proj
+
+        if not use_separate_proj_weight:
+            # Compute optimization used at times, share the parameters in between Q/K/V
+            with torch.no_grad():
+                self.k_proj.weight = self.q_proj.weight
+                self.v_proj.weight = self.q_proj.weight
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # One projection per input tensor
+
+        # NOTE: Would it make sense to catch self attention + shared weights, to skip a projection step?
+        q, k, v = map(
+            lambda fn, x: fn(x),
+            [self.q_proj, self.k_proj, self.v_proj],
+            [query, key, value],
+        )
+
+        return q, k, v
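A minimal usage sketch; passing None for the key and value configs shares the query projection, as logged above:

import torch
from xformers.components.input_projection import InputProjection, InputProjectionConfig

proj = InputProjection(
    query_proj_params=InputProjectionConfig(in_features=64, out_features=64, bias=True),
    key_proj_params=None,    # shares q_proj
    value_proj_params=None,  # shares q_proj
)
x = torch.randn(2, 16, 64)
q, k, v = proj(x, x, x)      # here q, k and v are identical by construction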
.venv/lib/python3.11/site-packages/xformers/components/multi_head_dispatch.py ADDED
@@ -0,0 +1,271 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+from dataclasses import asdict, dataclass
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch.nn.init import constant_
+
+from xformers._deprecation_warning import deprecated_function
+from xformers.components.attention import Attention
+from xformers.components.input_projection import InputProjection, InputProjectionConfig
+from xformers.components.positional_embedding import RotaryEmbedding
+
+logger = logging.getLogger("xformers")
+
+
+@dataclass
+class MultiHeadDispatchConfig:
+    dim_model: int
+    num_heads: int
+    attention: Attention
+    bias: bool
+    residual_dropout: float
+    dim_key: Optional[int]
+    dim_value: Optional[int]
+    in_proj_container: Optional[InputProjection]
+    use_separate_proj_weight: Optional[bool]
+    use_rotary_embeddings: Optional[bool]
+    out_proj: Optional[nn.Module]
+
+    def __getitem__(self, item):
+        return getattr(self, item)
+
+
+# Move the head dimension forward and fold it into the batch dimension. Dimensions become (B * nh, S, hs)
+def _fold_heads(t: torch.Tensor, B: int, S: int, H: int, Hs: int):
+    return t.view(B, S, H, Hs).transpose(1, 2).flatten(start_dim=0, end_dim=1)
+
+
+# Move the head dimension forward and keep it exposed. Dimensions become (B, nh, S, hs)
+def _split_heads(t: torch.Tensor, B: int, S: int, H: int, Hs: int):
+    return t.view(B, S, H, Hs).transpose(1, 2)
+
+
+class MultiHeadDispatch(nn.Module):
+    """
+    A multi-head masked self-attention dispatch mechanism, with a projection at the end,
+    following the architecture proposed in `Attention is all you need`_, Vaswani et al.
+
+    The actual attention mechanism can vary, as well as the projections.
+    This can be used to wrap the proposed attention mechanisms and make them multi-head aware,
+    but it is optional.
+
+    Args:
+        dim_model: The model/embedding dimension
+        num_heads: The number of heads being used
+        attention: The attention mechanism (needs to be registered to the xformers library)
+        bias: Whether to use bias for the projections: (Q, K, V, Output)
+        residual_dropout: Amount of dropout on the residual path
+        use_separate_proj_weight: Use different weights for the Q, K, V projections
+        dim_key: Optionally use a different dimension for the key
+        dim_value: Optionally use a different dimension for the value
+        in_proj_container: Optionally provide the input projection module
+        use_rotary_embeddings: Use rotary embeddings
+        out_proj: Optionally provide the output projection module
+
+
+    .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762v5
+    """
+
+    def __init__(
+        self,
+        dim_model: int,
+        num_heads: int,
+        attention: Attention,
+        bias: Tuple[bool, bool, bool, bool] = (True, True, True, True),
+        residual_dropout: float = 0.0,
+        use_separate_proj_weight: bool = True,
+        dim_key: Optional[int] = None,
+        dim_value: Optional[int] = None,
+        in_proj_container: Optional[InputProjection] = None,
+        use_rotary_embeddings: Optional[bool] = False,
+        out_proj: Optional[nn.Module] = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+        deprecated_function(self)
+
+        if isinstance(bias, bool):
+            logger.warning(
+                "Single bias value provided for the MHA projections."
+                + f" Assuming the same parameter ({bias}) is to be used everywhere"
+            )
+            bias = (bias, bias, bias, bias)
+
+        assert (
+            dim_model % num_heads == 0
+        )  # static preset for now, each head works on 1/d of the embeddings, could be relaxed
+        assert num_heads > 0
+
+        # Popular default is that all latent dimensions are the same
+        dim_key, dim_value = map(lambda x: x if x else dim_model, (dim_key, dim_value))
+
+        self.num_heads = num_heads
+        self.dim_key_head = dim_key // num_heads
+        self.dim_value_head = dim_value // num_heads
+        self.dim_model = dim_model
+        self.attention = attention
+
+        # key, query, value projections for all heads
+        # critical options are
+        # - are we sharing weights?
+        # - are we adding biases?
+        if attention.requires_input_projection:
+            self.in_proj_container = (
+                in_proj_container
+                if in_proj_container is not None
+                else InputProjection(
+                    query_proj_params=InputProjectionConfig(
+                        dim_model, dim_key, bias=bias[0]
+                    ),
+                    key_proj_params=InputProjectionConfig(
+                        dim_model, dim_key, bias=bias[1]
+                    ),
+                    value_proj_params=InputProjectionConfig(
+                        dim_model, dim_value, bias=bias[2]
+                    ),
+                    use_separate_proj_weight=use_separate_proj_weight,
+                )
+            )
+
+        # Optional rotary embeddings
+        self.rotary_embeddings = (
+            RotaryEmbedding(self.dim_key_head) if use_rotary_embeddings else None
+        )
+
+        # Regularization
+        self.resid_drop = nn.Dropout(residual_dropout, inplace=False)
+
+        # Output projection
+        self.proj = (
+            out_proj if out_proj else nn.Linear(dim_model, dim_model, bias=bias[3])
+        )
+        if isinstance(self.proj, nn.Linear) and self.proj.bias is not None:
+            constant_(self.proj.bias, 0.0)
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: Optional[torch.Tensor] = None,
+        value: Optional[torch.Tensor] = None,
+        att_mask: Optional[torch.Tensor] = None,
+        key_padding_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Expected input dimensions are [batch size, sequence length, embed dim]
+        Output dimensions are [batch size, sequence length, embed dim]
+        """
+
+        if key is None:
+            key = query
+        if value is None:
+            value = query
+
+        if query.shape[0] != key.shape[0] or query.shape[0] != value.shape[0]:
+            max_batch = max((query.shape[0], key.shape[0], value.shape[0]))
+            query, key, value = map(
+                lambda x: x.expand(max_batch, -1, -1), [query, key, value]
+            )
+
+        B, S_Q, _ = query.size()  # Batch x Sequence x Embedding (latent)
+        _, S_K, _ = key.size()  # K's and Q's sequence lengths could differ
+
+        # Catch different query and key lengths but a causal attention
+        if S_Q != S_K:
+            assert (
+                not self.attention.requires_same_k_q_dimensions
+            ), "This attention mechanism requires query and key to have the same sequence (context) lengths"
+
+            if hasattr(self.attention, "causal"):
+                assert not self.attention.causal, (
+                    "Causal attention is not supported when key and query have different sequence lengths.\n"
+                    + "In that case causality is ill-determined. Please pad your sequences accordingly"
+                )
+
+        kw_mask_args = {}
+        if att_mask is not None:
+            assert (
+                self.attention.supports_attention_mask
+            ), "This attention does not support attention masks"
+            kw_mask_args["att_mask"] = att_mask
+
+        if key_padding_mask is not None:
+            assert (
+                self.attention.supports_key_padding_mask
+            ), "This attention does not support key padding masks"
+            kw_mask_args["key_padding_mask"] = key_padding_mask
+
+        if self.attention.requires_skip_multi_head:
+            return self.attention(query, key, value, **kw_mask_args)
+
+        # Calculate query, key, values for all heads in batch
+        if self.attention.requires_input_projection:
+            q, k, v = self.in_proj_container(query=query, key=key, value=value)
+        else:
+            k, q, v = key, query, value
+
+        # Check the dimensions properly
+        def check(t, name):
+            assert (
+                t.shape[2] % self.num_heads == 0
+            ), f"the {name} embeddings need to be divisible by the number of heads"
+
+        check(q, "projected query")
+        check(v, "projected value")
+        check(k, "projected key")
+
+        # Optional: rotary embedding, add relative positioning information
+        if self.rotary_embeddings:
+            # rotary requires the head dimension
+            q = _split_heads(q, B, S_Q, self.num_heads, self.dim_key_head)
+            k = _split_heads(k, B, S_K, self.num_heads, self.dim_key_head)
+            v = _split_heads(v, B, S_K, self.num_heads, self.dim_value_head)
+
+            q, k = self.rotary_embeddings(q=q, k=k)
+
+            if not self.attention.requires_head_dimension:
+                q, k, v = q.flatten(0, 1), k.flatten(0, 1), v.flatten(0, 1)
+
+        else:
+            # Reshape k/q/v to either expose the heads, or fold the head dimension into the batch
+            reshape_fn = (
+                _split_heads if self.attention.requires_head_dimension else _fold_heads
+            )
+
+            q = reshape_fn(q, B, S_Q, self.num_heads, self.dim_key_head)
+            k = reshape_fn(k, B, S_K, self.num_heads, self.dim_key_head)
+            v = reshape_fn(v, B, S_K, self.num_heads, self.dim_value_head)
+
+        # Self-attend
+        y = self.attention(q, k, v, **kw_mask_args)
+
+        # Re-assemble all head outputs side by side
+        y = (
+            y.view(B, self.num_heads, S_Q, self.dim_value_head)
+            .transpose(1, 2)
+            .flatten(start_dim=2, end_dim=3)
+        )
+
+        # Output projection, dropout and good to go
+        y = self.resid_drop(self.proj(y))
+
+        # Return the same sequence size as the input
+        return y
+
+    @classmethod
+    def from_config(cls, config: MultiHeadDispatchConfig):
+        # Generate the class inputs from the config
+        fields = asdict(config)
+
+        # Skip all Nones so that default values are used
+        fields = {k: v for k, v in fields.items() if v is not None}
+
+        return cls(**fields)
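A minimal self-attention sketch wiring the dispatcher to the ScaledDotProduct mechanism defined earlier in this diff:

import torch
from xformers.components.attention.scaled_dot_product import ScaledDotProduct
from xformers.components.multi_head_dispatch import MultiHeadDispatch

mha = MultiHeadDispatch(
    dim_model=64,
    num_heads=4,
    attention=ScaledDotProduct(dropout=0.0, causal=False),
)
x = torch.randn(2, 16, 64)  # (batch, sequence, embedding)
y = mha(query=x)            # key and value default to the query (self-attention)
assert y.shape == x.shape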
.venv/lib/python3.11/site-packages/xformers/components/patch_embedding.py ADDED
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass
+from enum import Enum
+
+import torch
+
+from xformers._deprecation_warning import deprecated_function
+
+
+class PoolType(str, Enum):
+    Conv2D = "CONV_2D"
+    # ...
+    # TODO: Support more cases?
+
+
+@dataclass
+class PatchEmbeddingConfig:
+    """
+    The configuration for the patch embedding layer, which takes the raw tokens passed in
+    and returns a pooled representation along a given embedding dimension.
+
+    This typically trades the spatial (context length) representation for the embedding size.
+
+    This is canonically used by ViT, but other papers (like MetaFormer or other hierarchical transformers)
+    propose a more general use case for this
+    """
+
+    in_channels: int
+    out_channels: int
+    kernel_size: int
+    stride: int
+    padding: int = 0
+    pool_type: PoolType = PoolType.Conv2D
+
+
+class ConditionalReshape(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        deprecated_function(self)
+
+    def forward(self, x):
+        if x.ndim == 3:
+            B, HW, C = x.shape
+            # NOTE: We're assuming a square sample here
+            H = int(math.sqrt(HW))
+            assert H * H == HW, f"{H, HW}"
+            x = x.transpose(1, 2).reshape(B, C, H, H)
+
+        return x
+
+
+class PatchToSequence(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        deprecated_function(self)
+
+    def forward(self, x):
+        return x.flatten(2, 3).transpose(1, 2).contiguous()  # B HW C
+
+
+def build_patch_embedding(config: PatchEmbeddingConfig):
+    if not isinstance(config, PatchEmbeddingConfig):
+        config = PatchEmbeddingConfig(**config)
+
+    if config.pool_type == PoolType.Conv2D:
+        pool = torch.nn.Conv2d(
+            config.in_channels,
+            config.out_channels,
+            kernel_size=config.kernel_size,
+            stride=config.stride,
+            padding=config.padding,
+        )
+    else:
+        raise NotImplementedError
+
+    # The patch embedding supposes that the input really is 2D in essence
+    # If this block is in the middle of a stack, we need to reshape
+    return torch.nn.Sequential(ConditionalReshape(), pool, PatchToSequence())
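A minimal ViT-style usage sketch; 224x224 images with 16x16 non-overlapping patches yield 14 x 14 = 196 tokens:

import torch
from xformers.components.patch_embedding import PatchEmbeddingConfig, build_patch_embedding

patchify = build_patch_embedding(
    PatchEmbeddingConfig(in_channels=3, out_channels=96, kernel_size=16, stride=16)
)
images = torch.randn(2, 3, 224, 224)
tokens = patchify(images)  # (2, 196, 96): B HW C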
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__init__.py ADDED
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from pathlib import Path
+from typing import Any, Callable, Dict, Set, Union
+
+from xformers.utils import (
+    generate_matching_config,
+    get_registry_decorator,
+    import_all_modules,
+)
+
+from .base import PositionEmbedding, PositionEmbeddingConfig  # noqa
+
+# CREDITS: Classy Vision registry mechanism
+
+POSITION_EMBEDDING_REGISTRY: Dict[str, Any] = {}
+POSITION_EMBEDDING_CLASS_NAMES: Set[str] = set()
+
+
+def build_positional_embedding(config: Union[Dict[str, Any], PositionEmbeddingConfig]):
+    """Builds a position encoding from a config.
+
+    This assumes a 'name' key in the config which is used to determine which
+    position encoding class to instantiate. For instance, a config `{"name": "my_position_encoding",
+    "foo": "bar"}` will find a class that was registered as "my_position_encoding"
+    (see :func:`register_positional_embedding`) and call .from_config on it."""
+
+    if not isinstance(config, PositionEmbeddingConfig):
+        config_instance = generate_matching_config(
+            config, POSITION_EMBEDDING_REGISTRY[config["name"]].config
+        )
+    else:
+        config_instance = config
+
+    return POSITION_EMBEDDING_REGISTRY[config_instance.name].constructor.from_config(
+        config_instance
+    )
+
+
+"""Registers a PositionEncoding subclass.
+
+This decorator allows xFormers to instantiate a subclass of PositionEncoding
+from a configuration file, even if the class itself is not part of the
+xFormers framework. To use it, apply this decorator to a `PositionEncoding`
+subclass, like this:
+
+.. code-block:: python
+
+    @dataclass
+    class MyConfig:
+        ...
+
+    @register_positional_embedding('my_encoding', MyConfig)
+    class MyEncoding(PositionEncoding):
+        ...
+
+To instantiate a position encoding from a configuration file, see :func:`build_positional_embedding`."""
+register_positional_embedding: Callable[
+    [str, Any], Callable[[Any], Any]
+] = get_registry_decorator(
+    POSITION_EMBEDDING_REGISTRY,
+    POSITION_EMBEDDING_CLASS_NAMES,
+    PositionEmbedding,
+    PositionEmbeddingConfig,
+)
+
+
+from .rotary import RotaryEmbedding  # noqa
+from .sine import SinePositionalEmbedding  # type: ignore  # noqa
+from .vocab import VocabEmbedding  # noqa
+
+__all__ = [
+    "RotaryEmbedding",
+    "SinePositionalEmbedding",
+    "VocabEmbedding",
+    "build_positional_embedding",
+    "register_positional_embedding",
+]
+
+# automatically import any Python files in the directory
+import_all_modules(
+    str(Path(__file__).parent), "xformers.components.positional_embedding"
+)
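A minimal registry usage sketch; "sine" is registered by the sine.py module added below, and the dict is matched against its config dataclass before .from_config is called:

from xformers.components.positional_embedding import build_positional_embedding

pos_emb = build_positional_embedding(
    {"name": "sine", "dim_model": 128, "seq_len": 256}
)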
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.53 kB)
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/base.cpython-311.pyc ADDED
Binary file (2.38 kB)
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/param.cpython-311.pyc ADDED
Binary file (2.87 kB)
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/rotary.cpython-311.pyc ADDED
Binary file (4.85 kB)
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/sine.cpython-311.pyc ADDED
Binary file (2.67 kB)
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/__pycache__/vocab.cpython-311.pyc ADDED
Binary file (3.52 kB)
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/base.py ADDED
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from abc import ABCMeta, abstractmethod
+from dataclasses import asdict, dataclass
+from typing import Type, TypeVar
+
+import torch.nn as nn
+
+from xformers._deprecation_warning import deprecated_function
+
+Self = TypeVar("Self", bound="PositionEmbedding")
+
+
+@dataclass
+class PositionEmbeddingConfig:
+    name: str
+    dim_model: int
+    seq_len: int
+
+
+class PositionEmbedding(nn.Module, metaclass=ABCMeta):
+    @abstractmethod
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__()
+        deprecated_function(self)
+
+    @classmethod
+    def from_config(cls: Type[Self], config: PositionEmbeddingConfig) -> Self:
+        # Generate the class inputs from the config
+        fields = asdict(config)
+
+        # Skip all Nones so that default values are used
+        fields = {k: v for k, v in fields.items() if v is not None}
+        return cls(**fields)
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/param.py ADDED
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass
+
+import torch
+
+from xformers.components.positional_embedding import (
+    PositionEmbedding,
+    PositionEmbeddingConfig,
+    register_positional_embedding,
+)
+
+
+@dataclass
+class LearnablePositionalEmbeddingConfig(PositionEmbeddingConfig):
+    name: str
+    seq_len: int
+    dim_model: int
+    add_class_token: bool
+
+
+@register_positional_embedding("learnable", LearnablePositionalEmbeddingConfig)
+class LearnablePositionalEmbedding(PositionEmbedding):
+    def __init__(
+        self, seq_len: int, dim_model: int, add_class_token: bool = False, *_, **__
+    ):
+        super().__init__()
+
+        # 0.02 is BERT initialization
+        self.pos_emb = torch.nn.Parameter(
+            torch.randn(1, seq_len + int(add_class_token), dim_model) * 0.02
+        )
+
+        self.class_token = (
+            torch.nn.Parameter(torch.zeros(dim_model)) if add_class_token else None
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.class_token is not None:
+            # Prepend the class token
+            clf_token = (
+                torch.ones(x.shape[0], 1, self.pos_emb.shape[-1], device=x.device)
+                * self.class_token
+            )
+            x = torch.cat([clf_token, x], dim=1)
+
+        if x.ndim == 2:
+            x = x.unsqueeze(-1)
+
+        return x + self.pos_emb
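A minimal usage sketch; with add_class_token=True the output grows by one position:

import torch
from xformers.components.positional_embedding.param import LearnablePositionalEmbedding

emb = LearnablePositionalEmbedding(seq_len=16, dim_model=32, add_class_token=True)
x = torch.randn(2, 16, 32)
y = emb(x)  # class token prepended: (2, 17, 32)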
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/rotary.py ADDED
@@ -0,0 +1,91 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# CREDITS: This implementation is inspired by GPT-NeoX https://github.com/EleutherAI/gpt-neox
+# NOTE: Almost the same right now, moving parts to Triton is the next step
+
+from typing import Tuple
+
+import torch
+
+
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+@torch.jit.script
+def apply_rotary_pos_emb(x, cos, sin):
+    # NOTE: This could probably be moved to Triton
+
+    # Handle a possible sequence length mismatch in between q and k
+    cos = cos[:, :, : x.shape[-2], :]
+    sin = sin[:, :, : x.shape[-2], :]
+
+    return (x * cos) + (rotate_half(x) * sin)
+
+
+class RotaryEmbedding(torch.nn.Module):
+    """
+    The rotary position embeddings from RoFormer_ (Su et al.).
+    A crucial insight from the method is that the queries and keys are
+    transformed by rotation matrices which depend on the relative positions.
+
+    Other implementations are available in the Rotary Transformer repo_ and in
+    GPT-NeoX_, which inspired this one.
+
+    .. _RoFormer: https://arxiv.org/abs/2104.09864
+    .. _repo: https://github.com/ZhuiyiTechnology/roformer
+    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
+
+
+    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
+        (it does not create the embedding dimension) and will likely be picked up (imported) on an ad-hoc basis
+    """
+
+    def __init__(self, dim_model: int, *_, **__):
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim_model, 2).float() / dim_model))
+        self.register_buffer("inv_freq", inv_freq)
+
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+
+    def _update_cos_sin_tables(self, x, seq_dimension=1):
+        seq_len = x.shape[seq_dimension]
+
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seq_len != self._seq_len_cached
+            or self._cos_cached.device != x.device
+            or self._cos_cached.dtype != x.dtype
+        ):
+            self._seq_len_cached = seq_len
+            t = torch.arange(
+                x.shape[seq_dimension], device=x.device, dtype=torch.float32
+            )
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+
+            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
+            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
+
+        return self._cos_cached, self._sin_cached
+
+    def forward(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
+            k, seq_dimension=-2
+        )
+
+        return (
+            apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached),
+            apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached),
+        )
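A minimal usage sketch on (batch, heads, seq, head_dim) tensors; since the two rotated halves share the same frequencies, the transform is norm-preserving per position:

import torch
from xformers.components.positional_embedding import RotaryEmbedding

rope = RotaryEmbedding(dim_model=32)  # per-head dimension
q = torch.randn(2, 4, 16, 32)
k = torch.randn(2, 4, 16, 32)
q_rot, k_rot = rope(q, k)
assert torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5)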
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/sine.py ADDED
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# Silence Mypy errors in this file.
+# type: ignore
+
+import math
+
+import torch
+
+from xformers.components.positional_embedding import (
+    PositionEmbedding,
+    PositionEmbeddingConfig,
+    register_positional_embedding,
+)
+
+
+@register_positional_embedding("sine", PositionEmbeddingConfig)
+class SinePositionalEmbedding(PositionEmbedding):
+    def __init__(self, dim_model: int, *args, **kwargs):
+        super().__init__()
+        self.dim_model = dim_model
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        seq_len = x.shape[1]
+        pos = (
+            torch.arange(0, seq_len, device=x.device, dtype=torch.float32)
+            .unsqueeze(1)
+            .repeat(1, self.dim_model)
+        )
+        dim = (
+            torch.arange(0, self.dim_model, device=x.device, dtype=torch.float32)
+            .unsqueeze(0)
+            .repeat(seq_len, 1)
+        )
+        div = torch.exp(-math.log(10000) * (2 * (dim // 2) / self.dim_model))
+        pos *= div
+        pos[:, 0::2] = torch.sin(pos[:, 0::2])
+        pos[:, 1::2] = torch.cos(pos[:, 1::2])
+
+        output = x.unsqueeze(-1) if x.ndim == 2 else x
+
+        return output + pos.unsqueeze(0)
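This computes the classic sinusoidal encoding PE[pos, 2i] = sin(pos / 10000^(2i/d)) and PE[pos, 2i+1] = cos(pos / 10000^(2i/d)). A quick check of the closed form, using a zero input so that only the encoding remains:

import math
import torch
from xformers.components.positional_embedding.sine import SinePositionalEmbedding

emb = SinePositionalEmbedding(dim_model=4)
pe = emb(torch.zeros(1, 8, 4))
assert math.isclose(pe[0, 1, 0].item(), math.sin(1.0), rel_tol=1e-5)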
.venv/lib/python3.11/site-packages/xformers/components/positional_embedding/vocab.py ADDED
@@ -0,0 +1,65 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from xformers.components.positional_embedding import (
+    PositionEmbedding,
+    PositionEmbeddingConfig,
+    register_positional_embedding,
+)
+
+
+@dataclass
+class VocabEmbeddingConfig(PositionEmbeddingConfig):
+    vocab_size: int
+    dropout: float
+
+
+@register_positional_embedding("vocab", VocabEmbeddingConfig)
+class VocabEmbedding(PositionEmbedding):
+    def __init__(
+        self,
+        dim_model: int,
+        seq_len: int,
+        vocab_size: int,
+        dropout: float = 0.0,
+        *args,
+        **kwargs
+    ):
+        super().__init__()
+
+        self.vocab_size = vocab_size
+        self.dim_model = dim_model
+
+        self.dropout = torch.nn.Dropout(p=dropout)
+        self.position_embeddings = nn.Embedding(seq_len, self.dim_model)
+        self.word_embeddings = nn.Embedding(self.vocab_size, self.dim_model)
+
+        self.position_ids: Optional[torch.Tensor] = None
+
+        self.init_weights()
+
+    def init_weights(self, gain: float = 1.0):
+        torch.nn.init.normal_(self.position_embeddings.weight, std=0.02 * gain)
+        torch.nn.init.normal_(self.word_embeddings.weight, std=0.02 * gain)
+
+    def forward(self, x: torch.Tensor):
+        position_ids = torch.arange(x.shape[1], dtype=torch.long, device=x.device)[
+            None, :
+        ].repeat(x.shape[0], 1)
+
+        X_token = self.word_embeddings(x)
+        X_pos = self.position_embeddings(position_ids)
+
+        X = X_token + X_pos
+        X = self.dropout(X)
+
+        return X
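A minimal usage sketch on integer token ids:

import torch
from xformers.components.positional_embedding.vocab import VocabEmbedding

emb = VocabEmbedding(dim_model=64, seq_len=128, vocab_size=1000, dropout=0.1)
ids = torch.randint(0, 1000, (2, 32))
h = emb(ids)  # token + position embeddings, then dropout: (2, 32, 64)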
.venv/lib/python3.11/site-packages/xformers/components/residual.py ADDED
@@ -0,0 +1,192 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from collections import namedtuple
+from enum import Enum
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from xformers._deprecation_warning import deprecated_function
+
+
+class ResidualNormStyle(str, Enum):
+    """Support different residual path and norm styles.
+    See "On Layer Normalization in the Transformer Architecture",
+    Xiong et al., https://arxiv.org/pdf/2002.04745v1.pdf
+    """
+
+    Pre = "pre"
+    Post = "post"
+    DeepNorm = "deepnorm"
+
+
+class NormalizationType(str, Enum):
+    LayerNorm = "layernorm"
+    Skip = "skip"
+    # TODO: BatchNorm = "batchnorm"
+    # TODO: GroupNorm = "groupnorm"
+
+
+def get_normalization_layer(normalization_type: NormalizationType):
+    class Skip(nn.Module):
+        def __init__(self, *_, **__) -> None:
+            super().__init__()
+            deprecated_function(self)
+
+        def forward(self, x: torch.Tensor, **_):
+            return x
+
+    return {
+        NormalizationType.LayerNorm: nn.LayerNorm,
+        NormalizationType.Skip: Skip,
+    }[normalization_type]
+
+
+class RequiresWrappedInputs:
+    """Used to mark, through inheritance,
+    the fact that this class will require inputs to be passed as a single list"""
+
+    pass
+
+
+# CREDITS: the following is inspired by FastAI's Transformer implementation
+class Residual(nn.Module, RequiresWrappedInputs):
+    """
+    Object-oriented handling of the residual path
+
+    This supports scaling of the residual path, as proposed by DeepNet_
+
+    .. _DeepNet: https://arxiv.org/pdf/2203.00555v1.pdf
+
+    .. Note: the wrapped layers must accept all the inputs as a single list
+    """
+
+    def __init__(self, layer: nn.Module, scale: Optional[float] = None):
+        super().__init__()
+        deprecated_function(self)
+        self.layer = layer
+        self.scale = scale
+
+        # PreNorm and PostNorm require all the tensors to be passed as a list
+        self.wrap_inputs = isinstance(layer, RequiresWrappedInputs)
+
+    def forward(self, inputs: List[torch.Tensor], **kwargs):
+        if self.scale is not None:
+            residue = inputs[0] * self.scale
+        else:
+            residue = inputs[0]
+
+        if self.wrap_inputs:
+            return residue + self.layer(inputs=inputs, **kwargs)
+        else:
+            return residue + self.layer(*inputs, **kwargs)
+
+
+class PreNorm(nn.Module, RequiresWrappedInputs):
+    """Adds a normalization before computing attention
+
+    ..Note: If a list of inputs is passed, all of them get normalized"""
+
+    def __init__(
+        self,
+        d_norm: int,
+        sublayer: nn.Module,
+        normalization: NormalizationType,
+        use_triton: bool = True,
+    ):
+        super().__init__()
+        deprecated_function(self)
+        self.norm = get_normalization_layer(normalization)(d_norm)
+
+        self.sublayer = sublayer
+        self.wrap_inputs = isinstance(sublayer, RequiresWrappedInputs)
+
+    def forward(self, inputs: List[torch.Tensor], **kwargs):
+        assert len(inputs) > 0
+
+        # Perf improvement: if the inputs are all the same, only norm once
+        ids = [id(x) for x in inputs]
+        if ids.count(ids[0]) == len(ids):
+            # The same tensor is passed multiple times
+            x_norm = self.norm(inputs[0])
+            inputs_normed = [x_norm for _ in inputs]
+        else:
+            # The inputs differ, norm them all
+            inputs_normed = [self.norm(x_) for x_ in inputs]
+
+        if self.wrap_inputs:
+            return self.sublayer(inputs=inputs_normed, **kwargs)
+        else:
+            return self.sublayer(*inputs_normed, **kwargs)
+
+
+class PostNorm(nn.Module, RequiresWrappedInputs):
+    """Adds a normalization after computing attention"""
+
+    def __init__(
+        self,
+        d_norm: int,
+        sublayer: nn.Module,
+        normalization: NormalizationType,
+        use_triton: bool = True,
+    ):
+        super().__init__()
+        deprecated_function(self)
+        self.norm = get_normalization_layer(normalization)(d_norm)
+
+        self.sublayer = sublayer
+        self.wrap_inputs = isinstance(sublayer, RequiresWrappedInputs)
+
+    def forward(self, inputs: List[torch.Tensor], **kwargs):
+        if self.wrap_inputs:
+            x = self.sublayer(inputs=inputs, **kwargs)
+        else:
+            x = self.sublayer(*inputs, **kwargs)
+        return self.norm(x)
+
+
+DeepNormCoefficients = namedtuple("DeepNormCoefficients", ["alpha", "beta"])
+
+
+def get_deepnorm_coefficients(
+    encoder_layers: int, decoder_layers: int
+) -> Tuple[Optional[DeepNormCoefficients], Optional[DeepNormCoefficients]]:
+    """
+    See DeepNet_.
+
+    Returns alpha and beta depending on the number of encoder and decoder layers,
+    the first tuple entry is for the encoder and the second for the decoder
+
+    .. _DeepNet: https://arxiv.org/pdf/2203.00555v1.pdf
+    """
+
+    N = encoder_layers
+    M = decoder_layers
+
+    if decoder_layers == 0:
+        # Encoder only
+        return (
+            DeepNormCoefficients(alpha=(2 * N) ** 0.25, beta=(8 * N) ** -0.25),
+            None,
+        )
+    elif encoder_layers == 0:
+        # Decoder only
+        return None, DeepNormCoefficients(alpha=(2 * M) ** 0.25, beta=(12 * M) ** -0.25 if False else (8 * M) ** -0.25)
+    else:
+        # Encoder/decoder
+        encoder_coeffs = DeepNormCoefficients(
+            alpha=0.81 * ((N**4) * M) ** 0.0625, beta=0.87 * ((N**4) * M) ** -0.0625
+        )
+
+        decoder_coeffs = DeepNormCoefficients(
+            alpha=(3 * M) ** 0.25, beta=(12 * M) ** -0.25
+        )
+
+        return (encoder_coeffs, decoder_coeffs)
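As a quick worked example of the encoder-only branch above, alpha = (2N)^0.25 and beta = (8N)^-0.25:

from xformers.components.residual import get_deepnorm_coefficients

enc, dec = get_deepnorm_coefficients(encoder_layers=12, decoder_layers=0)
assert dec is None
# alpha = 24 ** 0.25 ~= 2.21, beta = 96 ** -0.25 ~= 0.32
print(enc.alpha, enc.beta)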
.venv/lib/python3.11/site-packages/xformers/components/reversible.py ADDED
@@ -0,0 +1,160 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import List
+
+import torch
+import torch.nn as nn
+from torch.autograd.function import Function
+from torch.utils.checkpoint import get_device_states, set_device_states
+
+from xformers._deprecation_warning import deprecated_function
+from xformers.components import RequiresWrappedInputs
+
+# CREDITS: Code adapted from
+# https://github.com/lucidrains/reformer-pytorch/blob/master/reformer_pytorch/reversible.py
+# https://github.com/RobinBruegger/RevTorch/blob/master/revtorch/revtorch.py,
+# https://pytorch.org/docs/stable/_modules/torch/utils/checkpoint.html
+
+
+# pyre-fixme[13]: `cpu_state` is not initialized in the constructor.
+class Deterministic(nn.Module):
+    def __init__(self, net: nn.Module):
+        super().__init__()
+        deprecated_function(self)
+        self.net = net
+        self.cpu_state: torch.Tensor = torch.get_rng_state()
+        self.cuda_in_fwd: bool = False
+        self.gpu_devices: List[int] = []
+        self.gpu_states: List[torch.Tensor] = []
+        self.wrap_inputs = isinstance(net, RequiresWrappedInputs)
+
+    def record_rng(self, *args):
+        self.cpu_state = torch.get_rng_state()
+        if torch.cuda._initialized:
+            self.cuda_in_fwd = True
+            self.gpu_devices, self.gpu_states = get_device_states(*args)
+
+    def forward(self, *args, record_rng: bool = False, set_rng: bool = False, **kwargs):
+        if record_rng:
+            self.record_rng(*args)
+
+        if not set_rng:
+            # Normal FW run
+            if self.wrap_inputs:
+                return self.net(inputs=args, **kwargs)
+            else:
+                return self.net(*args, **kwargs)
+        else:  # pragma: no cover  # this is called in the backward pass, not picked up
+            # This is analogous to checkpointing, reset the original random state
+            rng_devices: List[int] = []
+            if self.cuda_in_fwd:
+                rng_devices = self.gpu_devices
+
+            with torch.random.fork_rng(devices=rng_devices, enabled=True):
+                torch.set_rng_state(self.cpu_state)
+                if self.cuda_in_fwd:
+                    set_device_states(self.gpu_devices, self.gpu_states)
+
+                if self.wrap_inputs:
+                    return self.net(inputs=args, **kwargs)
+                else:
+                    return self.net(*args, **kwargs)
+
+
+class ReversibleBlock(nn.Module):
+    def __init__(self, f: nn.Module, g: nn.Module, split_dim: int = -1):
+        super().__init__()
+        self.f = Deterministic(f)
+        self.g = Deterministic(g)
+        self.split_dim = split_dim
+
+    def forward(self, x: torch.Tensor, f_args={}, g_args={}):
+        x1, x2 = torch.chunk(x, 2, dim=self.split_dim)
+        y1, y2 = None, None
+
+        with torch.no_grad():
+            y1 = x1 + self.f(x2, record_rng=self.training, **f_args)
+            y2 = x2 + self.g(y1, record_rng=self.training, **g_args)
+
+        return torch.cat([y1, y2], dim=self.split_dim)
+
+    def backward_pass(
+        self, y: torch.Tensor, dy: torch.Tensor, f_args={}, g_args={}
+    ):  # pragma: no cover  # this is covered, but called directly from C++
+        y1, y2 = torch.chunk(y, 2, dim=self.split_dim)
+        del y
+
+        dy1, dy2 = torch.chunk(dy, 2, dim=self.split_dim)
+        del dy
+
+        with torch.enable_grad():
+            y1.requires_grad = True
+            gy1 = self.g(y1, set_rng=True, **g_args)
+            torch.autograd.backward(gy1, dy2)
+
+        with torch.no_grad():
+            x2 = y2 - gy1
+            del y2, gy1
+
+            dx1 = dy1 + y1.grad
+            del dy1
+            y1.grad = None
+
+        with torch.enable_grad():
+            x2.requires_grad = True
+            fx2 = self.f(x2, set_rng=True, **f_args)
+            torch.autograd.backward(fx2, dx1)
+
+        with torch.no_grad():
+            x1 = y1 - fx2
+            del y1, fx2
+
+            dx2 = dy2 + x2.grad
+            del dy2
+            x2.grad = None
+
+            x = torch.cat([x1, x2.detach()], dim=self.split_dim)
+            dx = torch.cat([dx1, dx2], dim=self.split_dim)
+
+        return x, dx
+
+
+class _ReversibleFunction(Function):
+    @staticmethod
+    def forward(ctx, x, blocks, kwargs):
+        ctx.kwargs = kwargs
+        for block in blocks:
+            x = block(x, **kwargs)
+        ctx.y = x.detach()
+        ctx.blocks = blocks
+        return x
+
+    @staticmethod
+    def backward(
+        ctx, dy
+    ):  # pragma: no cover  # this is covered, but called directly from C++
+        y = ctx.y
+        kwargs = ctx.kwargs
+        for block in ctx.blocks[::-1]:
+            y, dy = block.backward_pass(y, dy, **kwargs)
+        return dy, None, None
+
+
+class ReversibleSequence(nn.Module):
+    def __init__(self, blocks: nn.ModuleList):
+        super().__init__()
+        deprecated_function(self)
+
+        # pyre-fixme[23]: Unable to unpack `torch.nn.Module` into 2 values.
+        self.blocks = nn.ModuleList([ReversibleBlock(f, g) for f, g in blocks])
+
+    def forward(self, x, arg_route=(True, False), **kwargs):
+        f_args, g_args = map(lambda route: kwargs if route else {}, arg_route)
+        block_kwargs = {"f_args": f_args, "g_args": g_args}
+
+        return _ReversibleFunction.apply(x, self.blocks, block_kwargs)
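A minimal usage sketch; the input channels are split in two halves, so each f/g sub-block sees half of the model dimension:

import torch
import torch.nn as nn
from xformers.components.reversible import ReversibleSequence

blocks = nn.ModuleList(
    [nn.ModuleList([nn.Linear(64, 64), nn.Linear(64, 64)]) for _ in range(3)]
)
seq = ReversibleSequence(blocks)
x = torch.randn(2, 16, 128)  # 128 channels -> two halves of 64
y = seq(x)
assert y.shape == x.shape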
.venv/lib/python3.11/site-packages/xformers/components/simplicial_embedding.py ADDED
@@ -0,0 +1,67 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from dataclasses import asdict, dataclass
+ from typing import Optional, Type, TypeVar
+
+ import torch
+
+ from xformers._deprecation_warning import deprecated_function
+
+ Self = TypeVar("Self", bound="SimplicialEmbedding")
+
+
+ @dataclass
+ class SimplicialEmbeddingConfig:
+     L: int
+     temperature: float
+
+
+ class SimplicialEmbedding(torch.nn.Module):
+     """
+     An implementation of the "Simplicial Embeddings"_, as proposed by Lavoie et al.
+
+     Arguments:
+         - L: the number of embedding chunks
+         - temperature: optional scaling parameter for the softmax operation.
+           A small (<1.) temperature will lead to a sparse representation (up to one-hot),
+           while a large (>1.) temperature will make the vector more uniform
+
+     _"Simplicial Embeddings": https://arxiv.org/pdf/2204.00616.pdf
+     """
+
+     def __init__(self, L: int, temperature: Optional[float] = None) -> None:
+         super().__init__()
+         deprecated_function(self)
+         self.L = L
+         self.temperature = temperature
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         assert (
+             x.shape[-1] % self.L == 0
+         ), f"The embedding dimension {x.shape[-1]} is not divisible by the chosen L parameter {self.L}"
+
+         # Separate the input tensor into L chunks of size V
+         B, C, E = x.shape
+         V = E // self.L
+
+         Vs = x.reshape(B, C, self.L, V)
+
+         # Softmax-normalize them with the proposed temperature.
+         # This is done over the last dimension, so only within each chunk.
+         if self.temperature is not None:
+             Vs = Vs / self.temperature
+
+         Vs = torch.nn.functional.softmax(Vs, dim=-1)
+
+         # Concatenate back and return
+         return Vs.reshape(B, C, E)
+
+     @classmethod
+     def from_config(cls: Type[Self], config: SimplicialEmbeddingConfig) -> Self:
+         # Generate the class inputs from the config
+         fields = asdict(config)
+
+         return cls(**fields)
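A quick shape check on `SimplicialEmbedding`, assuming an illustrative (batch, context, embedding) input whose embedding dimension is divisible by L: each of the L chunks of size V = E // L becomes a probability vector after the softmax.

import torch

se = SimplicialEmbedding(L=4, temperature=0.5)
x = torch.randn(2, 10, 64)  # E = 64, so V = 16
y = se(x)
assert y.shape == x.shape

# every length-16 chunk now sums to 1
chunks = y.reshape(2, 10, 4, 16)
print(chunks.sum(dim=-1))  # ~1.0 everywhere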
.venv/lib/python3.11/site-packages/xformers/ops/__init__.py ADDED
@@ -0,0 +1,130 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+
+ from .fmha import (
+     AttentionBias,
+     AttentionOp,
+     AttentionOpBase,
+     LowerTriangularMask,
+     MemoryEfficientAttentionCkOp,
+     MemoryEfficientAttentionCutlassFwdFlashBwOp,
+     MemoryEfficientAttentionCutlassOp,
+     MemoryEfficientAttentionFlashAttentionOp,
+     MemoryEfficientAttentionSplitKCkOp,
+     memory_efficient_attention,
+     memory_efficient_attention_backward,
+     memory_efficient_attention_forward,
+     memory_efficient_attention_forward_requires_grad,
+ )
+ from .indexing import index_select_cat, scaled_index_add
+ from .ipc import init_ipc
+ from .modpar_layers import ColumnParallelLinear, RowParallelLinear
+ from .rmsnorm import RMSNorm
+ from .rope_padded import rope_padded
+ from .seqpar import sequence_parallel_leading_matmul, sequence_parallel_trailing_matmul
+ from .sequence_parallel_fused_ops import (
+     fused_allgather_and_anything,
+     fused_allgather_and_linear,
+     fused_anything_and_reducescatter,
+     fused_linear_and_reducescatter,
+ )
+ from .sp24 import Sparse24Tensor, sparsify24, sparsify24_like
+ from .swiglu_op import (
+     SwiGLU,
+     SwiGLUEagerOp,
+     SwiGLUFusedOp,
+     SwiGLUOp,
+     SwiGLUOpDispatch,
+     SwiGLUPackedFusedOp,
+     swiglu,
+ )
+ from .tiled_matmul import tiled_matmul
+ from .unbind import get_stack_strides, stack_or_none, unbind
+
+ # BW compatibility
+ AttentionMask = AttentionBias
+
+
+ def masked_matmul(a, b, mask=None):
+     if torch.overrides.has_torch_function((a, b, mask)):
+         return torch.overrides.handle_torch_function(
+             masked_matmul, (a, b, mask), a, b, mask
+         )
+
+     att = a @ b
+
+     if mask is None:
+         return att
+
+     if mask.dtype == torch.bool:
+         if mask.ndim == 2:
+             mask = mask.unsqueeze(0).expand(att.shape[0], -1, -1)
+         # mask is presumed false == ignore
+         att[~mask] = float("-inf")
+     else:
+         # mask is presumed additive
+         att += mask
+     return att
+
+
+ __all__ = [
+     # fmha
+     "AttentionBias",
+     "AttentionMask",
+     "AttentionOp",
+     "AttentionOpBase",
+     "LowerTriangularMask",
+     "MemoryEfficientAttentionCutlassFwdFlashBwOp",
+     "MemoryEfficientAttentionCutlassOp",
+     "MemoryEfficientAttentionFlashAttentionOp",
+     "MemoryEfficientAttentionCkOp",
+     "MemoryEfficientAttentionSplitKCkOp",
+     "memory_efficient_attention",
+     "memory_efficient_attention_backward",
+     "memory_efficient_attention_forward",
+     "memory_efficient_attention_forward_requires_grad",
+     # indexing
+     "index_select_cat",
+     "scaled_index_add",
+     # ipc
+     "init_ipc",
+     # modpar_layers
+     "ColumnParallelLinear",
+     "RowParallelLinear",
+     # rmsnorm
+     "RMSNorm",
+     # rope_padded
+     "rope_padded",
+     # seqpar
+     "sequence_parallel_leading_matmul",
+     "sequence_parallel_trailing_matmul",
+     # sequence_parallel_fused_ops
+     "fused_allgather_and_anything",
+     "fused_allgather_and_linear",
+     "fused_anything_and_reducescatter",
+     "fused_linear_and_reducescatter",
+     # swiglu_op
+     "SwiGLU",
+     "SwiGLUEagerOp",
+     "SwiGLUFusedOp",
+     "SwiGLUOp",
+     "SwiGLUOpDispatch",
+     "SwiGLUPackedFusedOp",
+     "swiglu",
+     # tiled_matmul
+     "tiled_matmul",
+     # unbind
+     "get_stack_strides",
+     "stack_or_none",
+     "unbind",
+     # sp24
+     "sparsify24",
+     "sparsify24_like",
+     "Sparse24Tensor",
+     # .
+     "masked_matmul",
+ ]
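`masked_matmul` treats a boolean mask as "False means drop this entry" (written as -inf, e.g. ahead of a softmax) and any other dtype as an additive bias. A small illustration with assumed shapes:

import torch

a = torch.randn(2, 4, 8)
b = torch.randn(2, 8, 4)

bool_mask = torch.rand(4, 4) > 0.5    # 2D boolean masks broadcast over the batch
att = masked_matmul(a, b, bool_mask)  # masked-out entries become -inf

bias = torch.zeros(2, 4, 4)
bias[:, :, 0] = -1e4                  # non-bool masks are simply added
att2 = masked_matmul(a, b, bias)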
.venv/lib/python3.11/site-packages/xformers/ops/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (3.34 kB).
.venv/lib/python3.11/site-packages/xformers/ops/_triton/k_index_select_cat.py ADDED
@@ -0,0 +1,184 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def index_select_cat_fwd_kernel(
+     output_ptr,  # *Pointer* to output tensor.
+     source_ptr,  # *Pointer* to source tensor.
+     index_ptr,  # *Pointer* to index tensor.
+     num_indices,
+     num_cols,
+     stride0,  # Stride information of source tensor.
+     stride1,
+     BLOCK_SIZE_INDEX: tl.constexpr,  # Number of indices each program should process.
+     BLOCK_SIZE_COL: tl.constexpr,  # Number of cols each program should process.
+ ):
+     pid0 = tl.program_id(axis=0)  # We use a 2D launch grid
+     pid1 = tl.program_id(axis=1)
+
+     indices = pid0 * BLOCK_SIZE_INDEX + tl.arange(0, BLOCK_SIZE_INDEX)
+     rows = tl.load(index_ptr + indices, mask=(indices < num_indices))
+     cols = pid1 * BLOCK_SIZE_COL + tl.arange(0, BLOCK_SIZE_COL)
+
+     source_offsets = source_ptr + rows[:, None] * stride0 + cols[None, :] * stride1
+     mask = (indices[:, None] < num_indices) & (cols[None, :] < num_cols)
+     output = tl.load(source_offsets, mask=mask)
+
+     output_offsets = output_ptr + indices[:, None] * stride0 + cols[None, :] * stride1
+     tl.store(output_offsets, output, mask=mask)
+
+
+ def index_select_cat_fwd(
+     output: torch.Tensor,
+     source: torch.Tensor,
+     index: torch.Tensor,
+ ):
+     if not (source.is_cuda and index.is_cuda):
+         raise ValueError("The index tensor and the source tensor must be of type CUDA!")
+
+     if not source.ndim == 2:
+         raise ValueError(f"Expected 2-dimensional tensor, got {source.ndim}.")
+     if not index.ndim == 1:
+         raise ValueError(f"Expected 1-dimensional tensor, got {index.ndim}.")
+
+     num_rows, num_cols = source.shape
+     num_indices = index.shape[0]
+
+     if not num_indices < num_rows:
+         raise ValueError(
+             "The number of indices must be smaller than the number of rows in the source matrix."
+         )
+
+     stride0, stride1 = source.stride(0), source.stride(1)
+
+     def grid(meta):
+         return (
+             triton.cdiv(num_indices, meta["BLOCK_SIZE_INDEX"]),
+             triton.cdiv(num_cols, meta["BLOCK_SIZE_COL"]),
+         )
+
+     index_select_cat_fwd_kernel[grid](
+         output,
+         source,
+         index,
+         num_indices,
+         num_cols,
+         stride0,
+         stride1,
+         BLOCK_SIZE_INDEX=1,
+         BLOCK_SIZE_COL=512,
+     )
+
+     return output
+
+
+ @triton.jit
+ def index_select_cat_bwd_kernel(
+     grad_source_ptr,  # *Pointer* to grad_source tensor.
+     index_ptr,  # *Pointer* to index tensor.
+     grad_output_ptr,  # *Pointer* to grad_output tensor.
+     num_rows,
+     num_indices,
+     num_cols,
+     stride0,  # Stride information of input and source tensor.
+     stride1,
+     BLOCK_SIZE_INDEX: tl.constexpr,  # Number of indices each program should process.
+     BLOCK_SIZE_COL: tl.constexpr,  # Number of cols each program should process.
+ ):
+     pid0 = tl.program_id(axis=0)  # We use a 2D launch grid
+     pid1 = tl.program_id(axis=1)
+
+     cols = pid1 * BLOCK_SIZE_COL + tl.arange(0, BLOCK_SIZE_COL)
+
+     # load grad_output
+     grad_output_indices = pid0 * BLOCK_SIZE_INDEX + tl.arange(0, BLOCK_SIZE_INDEX)
+     grad_output_offsets = (
+         grad_output_ptr
+         + grad_output_indices[:, None] * stride0
+         + cols[None, :] * stride1
+     )
+     grad_output_mask = (grad_output_indices[:, None] < num_indices) & (
+         cols[None, :] < num_cols
+     )
+     grad_output = tl.load(grad_output_offsets, mask=grad_output_mask).to(tl.float32)
+
+     # select indices from grad_source
+     grad_source_indices = tl.load(
+         index_ptr + grad_output_indices, mask=(grad_output_indices < num_indices)
+     )
+     grad_source_offsets = (
+         grad_source_ptr
+         + grad_source_indices[:, None] * stride0
+         + cols[None, :] * stride1
+     )
+
+     # scatter the grad_output rows into grad_source at the selected indices
+     tl.store(grad_source_offsets, grad_output, mask=grad_output_mask)
+
+
+ def index_select_cat_bwd(
+     grad_source: torch.Tensor,
+     index: torch.Tensor,
+     grad_output: torch.Tensor,
+ ):
+     if not (grad_source.is_cuda and grad_output.is_cuda):
+         raise ValueError("The grad_source and grad_output tensor must be of type CUDA!")
+
+     if not (grad_source.ndim == 2 and grad_output.ndim == 2):
+         raise ValueError(
+             f"The grad_source and grad_output must be two-dimensional "
+             f"(got {grad_source.ndim} and {grad_output.ndim})!"
+         )
+     if not grad_source.shape[1] == grad_output.shape[1]:
+         raise ValueError(
+             f"The number of elements along dimension 1 of grad_source and grad_output must be the same "
+             f"(got {grad_source.shape[1]} and {grad_output.shape[1]})"
+         )
+
+     num_rows, num_cols = grad_source.shape
+     num_indices, num_cols = grad_output.shape
+     if not num_rows >= num_indices:
+         raise ValueError(
+             f"The number of elements along dimension 0 of grad_source must be larger than that of grad_output "
+             f"(got {num_rows} and {num_indices})!"
+         )
+     if not index.shape[0] == num_indices:
+         raise ValueError(
+             f"The number of indices and the number of elements along dimension 0 of grad_output must match "
+             f"(got {index.shape[0]} and {num_indices})!"
+         )
+
+     stride0, stride1 = grad_source.stride(0), grad_source.stride(1)
+     if not (grad_output.stride(0) == stride0 and grad_output.stride(1) == stride1):
+         raise ValueError(
+             f"The strides of the grad_source and grad_output tensors must match "
+             f"(got {stride0} vs. {grad_output.stride(0)}, {stride1} vs. {grad_output.stride(1)})!"
+         )
+
+     def grid(meta):
+         return (
+             triton.cdiv(num_indices, meta["BLOCK_SIZE_INDEX"]),
+             triton.cdiv(num_cols, meta["BLOCK_SIZE_COL"]),
+         )
+
+     index_select_cat_bwd_kernel[grid](
+         grad_source,
+         index,
+         grad_output,
+         num_rows,
+         num_indices,
+         num_cols,
+         grad_source.stride(0),
+         grad_source.stride(1),
+         BLOCK_SIZE_INDEX=1,
+         BLOCK_SIZE_COL=512,
+     )
+
+     return
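Functionally, the forward kernel above gathers rows: `output[i] = source[index[i]]`. A hedged sanity check against eager PyTorch (requires a CUDA device and Triton; sizes are arbitrary, and the index count must stay below the row count as enforced above):

import torch

source = torch.randn(100, 512, device="cuda")
index = torch.arange(30, device="cuda")
output = torch.empty(30, 512, device="cuda")

index_select_cat_fwd(output, source, index)           # Triton path
torch.testing.assert_close(output, source.index_select(0, index))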
.venv/lib/python3.11/site-packages/xformers/ops/_triton/k_scaled_index_add.py ADDED
@@ -0,0 +1,365 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from typing import Optional
+
+ import torch
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def scaled_index_add_fwd_kernel(
+     input_ptr,  # *Pointer* to input tensor.
+     index_ptr,  # *Pointer* to index tensor.
+     source_ptr,  # *Pointer* to source tensor.
+     scaling_ptr,  # *Pointer* to the scaling tensor.
+     alpha,
+     num_inp_indices,
+     num_src_indices,
+     num_rows,
+     num_cols,
+     stride0,  # Stride information of input and source tensor.
+     stride1,
+     stride2,
+     BLOCK_SIZE_INDEX: tl.constexpr,  # Number of indices each program should process.
+     BLOCK_SIZE_ROW: tl.constexpr,  # Number of rows each program should process.
+     BLOCK_SIZE_COL: tl.constexpr,  # Number of cols each program should process.
+     HAS_SCALING: tl.constexpr,  # Boolean indicating if the scaling factor is present.
+ ):
+     pid0 = tl.program_id(axis=0)  # We use a 3D launch grid
+     pid1 = tl.program_id(axis=1)
+     pid2 = tl.program_id(axis=2)
+
+     rows = pid1 * BLOCK_SIZE_ROW + tl.arange(0, BLOCK_SIZE_ROW)
+     cols = pid2 * BLOCK_SIZE_COL + tl.arange(0, BLOCK_SIZE_COL)
+
+     # load source
+     source_indices = pid0 * BLOCK_SIZE_INDEX + tl.arange(0, BLOCK_SIZE_INDEX)
+     source_offsets = (
+         source_ptr
+         + source_indices[:, None, None] * stride0
+         + rows[None, :, None] * stride1
+         + cols[None, None, :] * stride2
+     )
+     source_mask = (
+         (source_indices[:, None, None] < num_src_indices)
+         & (rows[None, :, None] < num_rows)
+         & (cols[None, None, :] < num_cols)
+     )
+     source = tl.load(source_offsets, mask=source_mask).to(tl.float32)
+
+     # load input
+     input_indices = tl.load(
+         index_ptr + source_indices, mask=(source_indices < num_src_indices)
+     )
+     input_offsets = (
+         input_ptr
+         + input_indices[:, None, None] * stride0
+         + rows[None, :, None] * stride1
+         + cols[None, None, :] * stride2
+     )
+     x = tl.load(input_offsets, mask=source_mask).to(tl.float32)
+
+     # compute scaled index add and save
+     if HAS_SCALING:
+         scaling = tl.load(
+             scaling_ptr + cols[None, None, :] * stride2,
+             mask=(cols[None, None, :] < num_cols),
+         ).to(tl.float32)
+         tl.store(input_offsets, x + alpha * scaling * source, mask=source_mask)
+     else:
+         tl.store(input_offsets, x + alpha * source, mask=source_mask)
+
+
+ def scaled_index_add_fwd(
+     x: torch.Tensor,
+     index: torch.Tensor,
+     source: torch.Tensor,
+     scaling: Optional[torch.Tensor],
+     alpha: float,
+ ):
+     if not (x.is_cuda and index.is_cuda and source.is_cuda):
+         raise ValueError(
+             "The input tensor, the index tensor and the source tensor must be of type CUDA!"
+         )
+
+     if not (x.ndim == 3 and source.ndim == 3):
+         raise ValueError(
+             f"The input and source must be three-dimensional (got {x.ndim} and {source.ndim})!"
+         )
+     if not x.shape[1] == source.shape[1]:
+         raise ValueError(
+             f"The number of elements along dimension 1 of the input and source must be the same "
+             f"(got {x.shape[1]} and {source.shape[1]})!"
+         )
+     if not x.shape[2] == source.shape[2]:
+         raise ValueError(
+             f"The number of elements along dimension 2 of the input and source must be the same "
+             f"(got {x.shape[2]} and {source.shape[2]})!"
+         )
+
+     num_inp_indices, num_rows, num_cols = x.shape
+     num_src_indices, num_rows, num_cols = source.shape
+     if not num_inp_indices >= num_src_indices:
+         raise ValueError(
+             f"The number of elements along dimension 0 of the input must be larger than that of source "
+             f"(got {num_inp_indices} and {num_src_indices})!"
+         )
+     if not index.shape[0] == num_src_indices:
+         raise ValueError(
+             f"The number of indices and source tensors must match (got {len(index)} and {len(source)})!"
+         )
+
+     stride0, stride1, stride2 = x.stride(0), x.stride(1), x.stride(2)
+     if not (
+         source.stride(0) == stride0
+         and source.stride(1) == stride1
+         and source.stride(2) == stride2
+     ):
+         raise ValueError(
+             f"The strides of the source and input tensors must match (got {source.stride(0)} vs. {stride0}, "
+             f"{source.stride(1)} vs. {stride1}, {source.stride(2)} vs. {stride2})!"
+         )
+
+     if scaling is None:
+         HAS_SCALING = False
+     else:
+         HAS_SCALING = True
+         if not scaling.is_cuda:
+             raise ValueError("The scaling tensor must be of type CUDA!")
+         if not (scaling.ndim == 1 and scaling.numel() == num_cols):
+             raise ValueError(
+                 f"The scaling tensor must be a 1-dimensional tensor (got {scaling.ndim}) and its size "
+                 f"must be equal to the size of dimension 2 of source (got {scaling.numel()} vs. {num_cols})."
+             )
+         if not scaling.stride(0) == stride2:
+             raise ValueError(
+                 f"The stride of scaling must match the stride2 of input (got {scaling.stride(0)} vs. {stride2})"
+             )
+
+     if not index.ndim == 1:
+         raise ValueError(f"The index must be one-dimensional (got {index.ndim})!")
+
+     def grid(meta):
+         return (
+             triton.cdiv(num_src_indices, meta["BLOCK_SIZE_INDEX"]),
+             triton.cdiv(num_rows, meta["BLOCK_SIZE_ROW"]),
+             triton.cdiv(num_cols, meta["BLOCK_SIZE_COL"]),
+         )
+
+     scaled_index_add_fwd_kernel[grid](
+         x,
+         index,
+         source,
+         scaling,
+         alpha,
+         num_inp_indices,
+         num_src_indices,
+         num_rows,
+         num_cols,
+         x.stride(0),
+         x.stride(1),
+         x.stride(2),
+         BLOCK_SIZE_INDEX=1,
+         BLOCK_SIZE_ROW=1,
+         BLOCK_SIZE_COL=512,
+         HAS_SCALING=HAS_SCALING,
+     )
+
+     return
+
+
+ @triton.jit
+ def scaled_index_add_bwd_kernel(
+     grad_output_ptr,  # *Pointer* to grad_output tensor.
+     grad_source_ptr,  # *Pointer* to grad_source tensor.
+     grad_scaling_ptr,  # *Pointer* to grad_scaling tensor.
+     source_ptr,  # *Pointer* to the source tensor.
+     scaling_ptr,  # *Pointer* to the scaling tensor.
+     index_ptr,
+     alpha,
+     num_inp_indices,
+     num_src_indices,
+     num_rows,
+     num_cols,
+     stride0,  # Stride information of input and source tensor.
+     stride1,
+     stride2,
+     BLOCK_SIZE_INDEX: tl.constexpr,  # Number of indices each program should process.
+     BLOCK_SIZE_ROW: tl.constexpr,  # Number of rows each program should process.
+     BLOCK_SIZE_COL: tl.constexpr,  # Number of cols each program should process.
+     HAS_SCALING: tl.constexpr,  # Boolean indicating if the scaling factor is present.
+ ):
+     pid0 = tl.program_id(axis=0)  # We use a 3D launch grid
+     pid1 = tl.program_id(axis=1)
+     pid2 = tl.program_id(axis=2)
+
+     rows = pid1 * BLOCK_SIZE_ROW + tl.arange(0, BLOCK_SIZE_ROW)
+     cols = pid2 * BLOCK_SIZE_COL + tl.arange(0, BLOCK_SIZE_COL)
+
+     # load source
+     source_indices = pid0 * BLOCK_SIZE_INDEX + tl.arange(0, BLOCK_SIZE_INDEX)
+     source_offsets = (
+         source_ptr
+         + source_indices[:, None, None] * stride0
+         + rows[None, :, None] * stride1
+         + cols[None, None, :] * stride2
+     )
+     source_mask = (
+         (source_indices[:, None, None] < num_src_indices)
+         & (rows[None, :, None] < num_rows)
+         & (cols[None, None, :] < num_cols)
+     )
+     source = tl.load(source_offsets, mask=source_mask).to(tl.float32)
+
+     # load grad_output
+     grad_output_indices = tl.load(
+         index_ptr + source_indices, mask=(source_indices < num_src_indices)
+     )
+     grad_output_offsets = (
+         grad_output_ptr
+         + grad_output_indices[:, None, None] * stride0
+         + rows[None, :, None] * stride1
+         + cols[None, None, :] * stride2
+     )
+     grad_output = tl.load(grad_output_offsets, mask=source_mask).to(tl.float32)
+
+     # compute gradient
+     grad_source_offsets = (
+         grad_source_ptr
+         + source_indices[:, None, None] * stride0
+         + rows[None, :, None] * stride1
+         + cols[None, None, :] * stride2
+     )
+     if HAS_SCALING:
+         scaling = tl.load(
+             scaling_ptr + cols[None, None, :] * stride2,
+             mask=(cols[None, None, :] < num_cols),
+         ).to(tl.float32)
+
+         tl.store(grad_source_offsets, alpha * grad_output * scaling, mask=source_mask)
+
+         grad_scaling_offsets = (
+             grad_scaling_ptr
+             + source_indices[:, None, None] * stride0
+             + rows[None, :, None] * stride1
+             + cols[None, None, :] * stride2
+         )
+         tl.store(grad_scaling_offsets, alpha * grad_output * source, mask=source_mask)
+     else:
+         tl.store(grad_source_offsets, alpha * grad_output, mask=source_mask)
+
+
+ def scaled_index_add_bwd(
+     grad_output: torch.Tensor,
+     grad_source: torch.Tensor,
+     grad_scaling: Optional[torch.Tensor],
+     source: torch.Tensor,
+     scaling: Optional[torch.Tensor],
+     index: torch.Tensor,
+     alpha: float,
+ ):
+     if not (grad_output.is_cuda and grad_source.is_cuda):
+         raise ValueError(
+             "The grad_output tensor and grad_source tensor must be of type CUDA!"
+         )
+
+     if not (grad_output.ndim == 3 and source.ndim == 3):
+         raise ValueError(
+             f"The input and source must be three-dimensional (got {grad_output.ndim} and {source.ndim})!"
+         )
+
+     if not grad_output.shape[1] == source.shape[1]:
+         raise ValueError(
+             f"The number of elements along dimension 1 of the input and source must be the same "
+             f"(got {grad_output.shape[1]} and {source.shape[1]})!"
+         )
+     if not grad_output.shape[2] == source.shape[2]:
+         raise ValueError(
+             f"The number of elements along dimension 2 of the input and source must be the same "
+             f"(got {grad_output.shape[2]} and {source.shape[2]})!"
+         )
+
+     num_inp_indices, num_rows, num_cols = grad_output.shape
+     num_src_indices, num_rows, num_cols = source.shape
+     if not num_inp_indices >= num_src_indices:
+         raise ValueError(
+             f"The number of elements along dimension 0 of the input must be larger than that of source "
+             f"(got {num_inp_indices} and {num_src_indices})!"
+         )
+
+     stride0, stride1, stride2 = source.stride(0), source.stride(1), source.stride(2)
+     if not (
+         grad_output.stride(0) == stride0
+         and grad_output.stride(1) == stride1
+         and grad_output.stride(2) == stride2
+     ):
+         raise ValueError(
+             f"The strides of grad_output and source must match "
+             f"(got {grad_output.stride(0)} vs {stride0}, {grad_output.stride(1)} vs {stride1}, "
+             f"{grad_output.stride(2)} vs {stride2})!"
+         )
+     if not (
+         grad_source.stride(0) == stride0
+         and grad_source.stride(1) == stride1
+         and grad_source.stride(2) == stride2
+     ):
+         raise ValueError(
+             f"The strides of grad_source and source must match "
+             f"(got {grad_source.stride(0)} vs {stride0}, {grad_source.stride(1)} vs {stride1}, "
+             f"{grad_source.stride(2)} vs {stride2})!"
+         )
+
+     if scaling is not None and grad_scaling is not None:
+         HAS_SCALING = True
+         if not grad_scaling.is_cuda:
+             raise ValueError("The scaling tensor must be of type CUDA!")
+         if not (
+             grad_scaling.stride(0) == stride0
+             and grad_scaling.stride(1) == stride1
+             and grad_scaling.stride(2) == stride2
+         ):
+             raise ValueError(
+                 f"The strides of grad_scaling and source must match "
+                 f"(got {grad_scaling.stride(0)} vs {stride0}, {grad_scaling.stride(1)} vs {stride1}, "
+                 f"{grad_scaling.stride(2)} vs {stride2})!"
+             )
+         if not scaling.stride(0) == stride2:
+             raise ValueError(
+                 f"The stride of scaling must match stride2 of source (got {scaling.stride(0)} vs. {stride2})!"
+             )
+     else:
+         HAS_SCALING = False
+
+     def grid(meta):
+         return (
+             triton.cdiv(num_src_indices, meta["BLOCK_SIZE_INDEX"]),
+             triton.cdiv(num_rows, meta["BLOCK_SIZE_ROW"]),
+             triton.cdiv(num_cols, meta["BLOCK_SIZE_COL"]),
+         )
+
+     scaled_index_add_bwd_kernel[grid](
+         grad_output,
+         grad_source,
+         grad_scaling,
+         source,
+         scaling,
+         index,
+         alpha,
+         num_inp_indices,
+         num_src_indices,
+         num_rows,
+         num_cols,
+         stride0,
+         stride1,
+         stride2,
+         BLOCK_SIZE_INDEX=1,
+         BLOCK_SIZE_ROW=1,
+         BLOCK_SIZE_COL=512,
+         HAS_SCALING=HAS_SCALING,
+     )
+
+     return
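In eager terms the forward path performs `x[index] += alpha * scaling * source` in place; a hedged reference check under the kernel's contiguity/stride assumptions (CUDA plus Triton required, sizes arbitrary):

import torch

x = torch.randn(16, 8, 512, device="cuda")
source = torch.randn(4, 8, 512, device="cuda")
index = torch.tensor([0, 2, 5, 7], device="cuda")
scaling = torch.randn(512, device="cuda")

expected = x.clone()
expected[index] += 0.1 * scaling * source  # eager semantics

scaled_index_add_fwd(x, index, source, scaling, 0.1)  # in-place Triton path
torch.testing.assert_close(x, expected)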
.venv/lib/python3.11/site-packages/xformers/ops/_triton/rmsnorm_kernels.py ADDED
@@ -0,0 +1,163 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+ import torch
+ import triton
+ import triton.language as tl
+
+ try:
+     from triton.language.extra.cuda.libdevice import rsqrt
+ except ImportError:
+     try:
+         from triton.language.math import rsqrt
+     except ImportError:
+         from triton.language.libdevice import rsqrt
+
+
+ @triton.jit
+ def _rms_norm_kernel(
+     x_ptr,
+     h1_ptr,
+     w_ptr,
+     eps,
+     stride,
+     N_COLS: tl.constexpr,
+     BLOCK_SIZE: tl.constexpr,
+     INCLUDE_WEIGHT: tl.constexpr,
+ ):
+     row = tl.program_id(0).to(tl.int64)
+     x_ptr += row * stride
+     h1_ptr += row * stride
+
+     _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+     for offset in range(0, N_COLS, BLOCK_SIZE):
+         cols = offset + tl.arange(0, BLOCK_SIZE)
+         a = tl.load(
+             x_ptr + cols, mask=cols < N_COLS, other=0.0, eviction_policy="evict_last"
+         ).to(tl.float32)
+         _mean += a * a
+     rstd = rsqrt((tl.sum(_mean, axis=0) / N_COLS) + eps)
+     for offset in range(0, N_COLS, BLOCK_SIZE):
+         cols = offset + tl.arange(0, BLOCK_SIZE)
+         mask = cols < N_COLS
+         a = tl.load(
+             x_ptr + cols, mask=mask, other=0.0, eviction_policy="evict_first"
+         ).to(tl.float32)
+         if INCLUDE_WEIGHT:
+             w = tl.load(w_ptr + cols, mask=mask)
+             tl.store(h1_ptr + cols, a * rstd * w, mask=mask)
+         else:
+             tl.store(h1_ptr + cols, a * rstd, mask=mask)
+
+
+ @triton.jit
+ def _rms_norm_add_kernel(
+     x_ptr,
+     y_ptr,
+     h1_ptr,
+     w_ptr,
+     eps,
+     stride,
+     N_COLS: tl.constexpr,
+     BLOCK_SIZE: tl.constexpr,
+     INCLUDE_WEIGHT: tl.constexpr,
+ ):
+     row = tl.program_id(0)
+     x_ptr += row * stride
+     y_ptr += row * stride
+     h1_ptr += row * stride
+
+     _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+     for offset in range(0, N_COLS, BLOCK_SIZE):
+         cols = offset + tl.arange(0, BLOCK_SIZE)
+         mask = cols < N_COLS
+         ax = tl.load(
+             x_ptr + cols, mask=mask, other=0.0, eviction_policy="evict_last"
+         ).to(tl.float32)
+         ay = tl.load(
+             y_ptr + cols, mask=mask, other=0.0, eviction_policy="evict_first"
+         ).to(tl.float32)
+         a = ax + ay
+         tl.store(x_ptr + cols, a, mask=mask)
+         _mean += a * a
+     rstd = rsqrt((tl.sum(_mean, axis=0) / N_COLS) + eps)
+     for offset in range(0, N_COLS, BLOCK_SIZE):
+         cols = offset + tl.arange(0, BLOCK_SIZE)
+         mask = cols < N_COLS
+         a = tl.load(
+             x_ptr + cols, mask=mask, other=0.0, eviction_policy="evict_first"
+         ).to(tl.float32)
+         if INCLUDE_WEIGHT:
+             w = tl.load(w_ptr + cols, mask=mask)
+             tl.store(h1_ptr + cols, a * rstd * w, mask=mask)
+         else:
+             tl.store(h1_ptr + cols, a * rstd, mask=mask)
+
+
+ def _rms_norm_forward(x, attn_norm_weights, eps):
+     if not x.is_contiguous():
+         raise ValueError("data must be contiguous")
+     if attn_norm_weights is not None:
+         if not attn_norm_weights.is_contiguous():
+             raise ValueError("weights must be contiguous")
+     out = torch.empty_like(x)
+     x_arg = x.reshape(-1, x.shape[-1])
+     M, N = x_arg.shape
+     # Less than 64KB per feature: enqueue fused kernel
+     MAX_FUSED_SIZE = 65536 // x.element_size()
+     BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
+     BLOCK_SIZE = max(BLOCK_SIZE, 128)
+     BLOCK_SIZE = min(BLOCK_SIZE, 8192)
+     # heuristics for number of warps
+     num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
+     with torch.cuda.device(x.device):
+         _rms_norm_kernel[(M,)](
+             x_arg,
+             out,
+             attn_norm_weights,
+             eps,
+             x_arg.stride(0),
+             N,
+             BLOCK_SIZE=BLOCK_SIZE,
+             num_warps=num_warps,
+             INCLUDE_WEIGHT=attn_norm_weights is not None,
+         )
+     return out
+
+
+ def _rms_norm_add_forward(x, y, attn_norm_weights, eps):
+     # x, y contiguous of same shape [..., n]
+     # output of same shape, normed over the last dim.
+     if not x.is_contiguous():
+         raise ValueError("x must be contiguous")
+     if not y.is_contiguous():
+         raise ValueError("y must be contiguous")
+     if attn_norm_weights is not None:
+         if not attn_norm_weights.is_contiguous():
+             raise ValueError("weights must be contiguous")
+     out = torch.empty_like(x)
+     x_arg = x.reshape(-1, x.shape[-1])
+     y_arg = y.reshape(-1, x.shape[-1])
+     M, N = x_arg.shape
+     # Less than 64KB per feature: enqueue fused kernel
+     MAX_FUSED_SIZE = 65536 // x.element_size()
+     BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
+     BLOCK_SIZE = max(BLOCK_SIZE, 128)
+     BLOCK_SIZE = min(BLOCK_SIZE, 8192)
+     # heuristics for number of warps
+     num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
+     with torch.cuda.device(x.device):
+         _rms_norm_add_kernel[(M,)](
+             x_arg,
+             y_arg,
+             out,
+             attn_norm_weights,
+             eps,
+             x_arg.stride(0),
+             N,
+             BLOCK_SIZE=BLOCK_SIZE,
+             num_warps=num_warps,
+             INCLUDE_WEIGHT=attn_norm_weights is not None,
+         )
+     return out
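The kernels normalize each row by the reciprocal root of its mean square plus `eps`, optionally scaled by a weight. For reference, the same computation in eager PyTorch (a sketch; the tolerances account for the kernel's float32 accumulation):

import torch

def rms_norm_reference(x, weight, eps):
    # mean of squares over the last dim, matching the kernel's accumulation
    rstd = torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + eps)
    out = x.float() * rstd
    if weight is not None:
        out = out * weight.float()
    return out.to(x.dtype)

x = torch.randn(4, 1024, device="cuda", dtype=torch.float16)
w = torch.randn(1024, device="cuda", dtype=torch.float16)
torch.testing.assert_close(
    _rms_norm_forward(x, w, 1e-6),
    rms_norm_reference(x, w, 1e-6),
    rtol=1e-2,
    atol=1e-2,
)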
.venv/lib/python3.11/site-packages/xformers/ops/_triton/rope_padded_kernels.py ADDED
@@ -0,0 +1,226 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+ import triton  # type: ignore
+ import triton.language as tl  # type: ignore
+
+ try:
+     from triton.language.extra.cuda.libdevice import pow
+ except ImportError:
+     try:
+         from triton.language.math import pow
+     except ImportError:
+         from triton.language.libdevice import pow
+
+
+ @triton.jit
+ def _rope_padded_kernel(
+     xq,
+     xk,
+     xv,
+     out_q,
+     cache_k,
+     cache_v,
+     seqstartq,
+     seqstartk,
+     seqlenk,
+     theta,
+     linear_scale,
+     use_dynamic_scaling: tl.constexpr,
+     dynamic_old_context_len: tl.constexpr,
+     dynamic_scale_factor: tl.constexpr,
+     dynamic_low_freq_factor: tl.constexpr,
+     dynamic_high_freq_factor: tl.constexpr,
+     first_seqpos,
+     seqpos,
+     k_start: tl.constexpr,
+     v_start: tl.constexpr,
+     n_groups,
+     dim: tl.constexpr,  # dimension of each head
+     stride_xqM,
+     stride_xqG,
+     stride_xqH,
+     stride_xkM,
+     stride_xkG,
+     stride_xkH,
+     stride_xvM,
+     stride_xvG,
+     stride_xvH,
+     stride_cachekM,
+     stride_cachekG,
+     stride_cachekH,
+     stride_cachevM,
+     stride_cachevG,
+     stride_cachevH,
+     stride_seqstartq,
+     stride_seqstartk,
+     stride_seqlenk,
+     stride_outqM,
+     stride_outqG,
+     stride_outqH,
+     stride_seqpos,
+     internal_dtype: tl.constexpr,
+     # If True, seqstartq and seqstartk are not used but rather we
+     # assume that every batch element has the same number of
+     # queries (i.e. num_queries := tl.num_programs(1) )
+     # and the same cache space cache_padding_length.
+     # Always False when called below.
+     const_batch_strides: tl.constexpr,
+     # If const_batch_strides==True, the common cache length for each batch element.
+     # (Only the first seqlenk[i] elements are actually in use, and only the last
+     # num_queries of those are actually written to.)
+     cache_padding_length,
+     # offset added to all values in seqlenk before using them.
+     # Always 0 when called below.
+     seqlenk_shift: tl.constexpr,
+     BLOCK_SIZE: tl.constexpr,
+     adjacents: tl.constexpr,
+ ):
+     """
+     Each letter in this diagram is a whole row of length dim.
+
+     INPUT      xq          xk   xv
+
+             head_dim ─►
+
+     batch     qqqqqq       kk   vv
+       │       qqqqqq       kk   vv
+       ▼       qqqqqq       kk   vv
+
+     head_idx (goes across all heads of all 3 inputs):
+       ▲           ▲            ▲            ▲
+       │           │            │            │
+       0        k_start      v_start    n_total_heads
+
+     Output is to out_q (same shape as xq), an xk-shaped part
+     of cache_k and an xv-shaped part of cache_v
+     """
+     query_pos_in_batch_elt = tl.program_id(0)
+     batch_elt = tl.program_id(1)
+     group_head_idx = tl.program_id(2)
+     group_idx = group_head_idx % n_groups
+     head_idx = group_head_idx // n_groups
+
+     if internal_dtype == "f32":
+         theta = theta.to(tl.float32)
+     elif internal_dtype == "f64":
+         theta = theta.to(tl.float64)
+
+     if const_batch_strides:
+         query_pos = query_pos_in_batch_elt + tl.num_programs(1) * batch_elt
+         end_query_pos = tl.num_programs(1) * (batch_elt + 1)
+     else:
+         query_pos = query_pos_in_batch_elt + tl.load(
+             seqstartq + batch_elt * stride_seqstartq
+         )
+         end_query_pos = tl.load(seqstartq + (batch_elt + 1) * stride_seqstartq)
+         if query_pos >= end_query_pos:
+             return
+
+     is_q = head_idx < k_start
+     is_v = head_idx >= v_start
+
+     xq += query_pos * stride_xqM + head_idx * stride_xqH + group_idx * stride_xqG
+     out_q += (
+         query_pos * stride_outqM + head_idx * stride_outqH + group_idx * stride_outqG
+     )
+
+     if const_batch_strides:
+         cache_start = cache_padding_length * batch_elt
+     else:
+         cache_start = tl.load(seqstartk + batch_elt * stride_seqstartk)
+     end_of_batch_elt_cache = (
+         cache_start + tl.load(seqlenk + batch_elt * stride_seqlenk) + seqlenk_shift
+     )
+
+     cache_pos = end_of_batch_elt_cache - (end_query_pos - query_pos)
+     if seqpos is not None:
+         seq_pos = tl.load(seqpos + query_pos * stride_seqpos)
+     else:
+         seq_pos = cache_pos - cache_start
+         if first_seqpos is not None:
+             seq_pos += tl.load(first_seqpos + batch_elt * stride_seqpos)
+     cache_k += (
+         (head_idx - k_start) * stride_cachekH
+         + cache_pos * stride_cachekM
+         + group_idx * stride_cachekG
+     )
+     xk += (
+         query_pos * stride_xkM
+         + (head_idx - k_start) * stride_xkH
+         + group_idx * stride_xkG
+     )
+     in_qk = tl.where(is_q, xq, xk)
+     out_qk = tl.where(is_q, out_q, cache_k)
+
+     cache_v += (
+         (head_idx - v_start) * stride_cachevH
+         + cache_pos * stride_cachevM
+         + group_idx * stride_cachevG
+     )
+     xv += (
+         query_pos * stride_xvM
+         + (head_idx - v_start) * stride_xvH
+         + group_idx * stride_xvG
+     )
+
+     out = tl.where(is_v, cache_v, out_qk)
+     x_in = tl.where(is_v, xv, in_qk)
+
+     for offset in range(0, dim // 2, BLOCK_SIZE // 2):
+         c = tl.arange(0, BLOCK_SIZE // 2)
+         powers = (offset + c) * 2.0
+         if adjacents:
+             cols_re = (offset + c) * 2
+             cols_im = cols_re + 1
+         else:
+             cols_re = offset + c
+             cols_im = cols_re + dim // 2
+
+         mask = cols_im < dim
+
+         re_x = tl.load(x_in + cols_re, mask=mask)
+         im_x = tl.load(x_in + cols_im, mask=mask)
+         # freqs = seq_pos / (theta ** (powers / dim))
+         freqs = pow(theta, powers / (-dim))
+
+         if use_dynamic_scaling:
+             lo_freq_wavelen = dynamic_old_context_len / dynamic_low_freq_factor
+             hi_freq_wavelen = dynamic_old_context_len / dynamic_high_freq_factor
+
+             wavelens = 6.28318530718 / freqs  # 2*pi
+             is_low_freq = wavelens > lo_freq_wavelen
+             freqs = tl.where(is_low_freq, freqs / dynamic_scale_factor, freqs)
+
+             is_mid_freq = hi_freq_wavelen <= wavelens and wavelens <= lo_freq_wavelen
+
+             smooth = (dynamic_old_context_len / wavelens - dynamic_low_freq_factor) / (
+                 dynamic_high_freq_factor - dynamic_low_freq_factor
+             )
+             freqs = tl.where(
+                 is_mid_freq,
+                 (1 - smooth) * freqs / dynamic_scale_factor + smooth * freqs,
+                 freqs,
+             )
+
+         freqs = seq_pos * freqs / linear_scale
+         sines = tl.sin(freqs)
+         cosines = tl.cos(freqs)
+         re_out = re_x * cosines - im_x * sines
+         im_out = im_x * cosines + re_x * sines
+
+         re_out_ = tl.where(is_v, re_x, re_out)
+         im_out_ = tl.where(is_v, im_x, im_out)
+         if internal_dtype == "f64":
+             if re_x.dtype == tl.bfloat16:
+                 # triton 2.0.0 crashes if you try to convert
+                 # float64 directly to bfloat16, so make an intermediate step.
+                 re_out_ = re_out_.to(tl.float32)
+                 im_out_ = im_out_.to(tl.float32)
+         tl.store(out + cols_re, re_out_, mask=mask)
+         tl.store(out + cols_im, im_out_, mask=mask)
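Stripping away the cache bookkeeping, the rotation itself is standard RoPE: pair i is rotated by the angle seq_pos * theta ** (-2i / dim). A hedged eager sketch of the adjacent-pairs case (all names below are illustrative, not part of this file):

import torch

def rope_reference(x, seq_pos, theta=10000.0):
    # x: (seq, dim) with dim even; pairs are (x[:, 2i], x[:, 2i + 1])
    dim = x.shape[-1]
    freqs = theta ** (-torch.arange(0, dim, 2, dtype=torch.float32) / dim)
    angles = seq_pos[:, None].float() * freqs[None, :]  # (seq, dim // 2)
    cos, sin = angles.cos(), angles.sin()
    re, im = x[:, 0::2], x[:, 1::2]
    out = torch.empty_like(x)
    out[:, 0::2] = re * cos - im * sin
    out[:, 1::2] = im * cos + re * sin
    return out

y = rope_reference(torch.randn(5, 64), torch.arange(5))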
.venv/lib/python3.11/site-packages/xformers/ops/_triton/tiled_matmul_kernels.py ADDED
@@ -0,0 +1,430 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import itertools
+ from typing import List, Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+ from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time
+
+
+ def init_to_zero(*names):
+     def result(nargs):
+         for name in names:
+             nargs[name].zero_()
+
+     return result
+
+
+ def gen_config(
+     block_m: int,
+     block_n: int,
+     block_k: int,
+     stages: int,
+     warps: int,
+     split_k: int = 1,
+     group_m: int = 8,
+ ) -> triton.Config:
+     """A more compact way to define a triton.Config, so it fits on one line"""
+
+     return triton.Config(
+         {
+             "BLOCK_M": block_m,
+             "BLOCK_N": block_n,
+             "BLOCK_K": block_k,
+             "SPLIT_K": split_k,
+             "GROUP_M": group_m,
+         },
+         num_stages=stages,
+         num_warps=warps,
+         pre_hook=init_to_zero(*[f"C{i+1}{j+1}" for i in range(3) for j in range(3)])
+         if split_k > 1
+         else init_to_zero(),
+     )
+
+
+ BASIC_MATMUL_CONFIGS = [
+     gen_config(block_m=128, block_n=256, block_k=32, stages=3, warps=8),
+     gen_config(block_m=256, block_n=128, block_k=32, stages=3, warps=8),
+     gen_config(block_m=256, block_n=64, block_k=32, stages=4, warps=4),
+     gen_config(block_m=64, block_n=256, block_k=32, stages=4, warps=4),
+     gen_config(block_m=128, block_n=128, block_k=32, stages=4, warps=4),
+     gen_config(block_m=128, block_n=64, block_k=32, stages=4, warps=4),
+     gen_config(block_m=64, block_n=128, block_k=32, stages=4, warps=4),
+     gen_config(block_m=128, block_n=32, block_k=32, stages=4, warps=4),
+     gen_config(block_m=64, block_n=32, block_k=32, stages=5, warps=2),
+ ]
+
+
+ INT8_MATMUL_CONFIGS = [
+     gen_config(block_m=128, block_n=256, block_k=128, stages=3, warps=8),
+     gen_config(block_m=256, block_n=128, block_k=128, stages=3, warps=8),
+     gen_config(block_m=256, block_n=64, block_k=128, stages=4, warps=4),
+     gen_config(block_m=64, block_n=256, block_k=128, stages=4, warps=4),
+     gen_config(block_m=128, block_n=128, block_k=128, stages=4, warps=4),
+     gen_config(block_m=128, block_n=64, block_k=64, stages=4, warps=4),
+     gen_config(block_m=64, block_n=128, block_k=64, stages=4, warps=4),
+     gen_config(block_m=128, block_n=32, block_k=64, stages=4, warps=4),
+     gen_config(block_m=64, block_n=32, block_k=64, stages=5, warps=2),
+ ]
+
+
+ IO_BOUND_MATMUL_CONFIGS_STAGES = [2, 3, 4, 5, 6]
+ IO_BOUND_MATMUL_CONFIGS_BLOCK_M = [16, 32]
+ IO_BOUND_MATMUL_CONFIGS_BLOCK_K = [32, 64]
+ IO_BOUND_MATMUL_CONFIGS_BLOCK_N = [32, 64, 128, 256]
+ IO_BOUND_MATMUL_CONFIGS_SPLIT_K = [1, 2, 4, 8, 16]
+
+
+ IO_BOUND_MATMUL_CONFIGS = [
+     gen_config(
+         block_m=block_m,
+         block_n=block_n,
+         block_k=block_k,
+         stages=stages,
+         warps=2 if block_n <= 64 else 4,
+         split_k=split_k,
+     )
+     for stages, block_m, block_k, block_n, split_k in itertools.product(
+         IO_BOUND_MATMUL_CONFIGS_STAGES,
+         IO_BOUND_MATMUL_CONFIGS_BLOCK_M,
+         IO_BOUND_MATMUL_CONFIGS_BLOCK_K,
+         IO_BOUND_MATMUL_CONFIGS_BLOCK_N,
+         IO_BOUND_MATMUL_CONFIGS_SPLIT_K,
+     )
+ ]
+
+
+ TRITON_CONFIGS = BASIC_MATMUL_CONFIGS + INT8_MATMUL_CONFIGS + IO_BOUND_MATMUL_CONFIGS
+
+
+ def our_estimate_matmul_time(
+     A11, B11, C11, M1, M2, M3, N1, N2, N3, K1, K2, K3, **kwargs
+ ):
+     """Call into Triton's upstream cost model, with the right args
+
+     The upstream function expects arguments to have certain names. Since we
+     renamed a few of them in our implementation, we rename them back.
+
+     At the time of writing (July 2023) the arguments that Triton expects are:
+     M, N, K, A, B, C, BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages.
+     """
+     return estimate_matmul_time(
+         M=M1 + M2 + M3, N=N1 + N2 + N3, K=K1 + K2 + K3, A=A11, B=B11, C=C11, **kwargs
+     )
+
+
+ def our_early_config_prune(config, named_args, **kwargs):
+     new_named_args = named_args.copy()
+     new_named_args["M"] = named_args["M1"] + named_args["M2"] + named_args["M3"]
+     new_named_args["N"] = named_args["N1"] + named_args["N2"] + named_args["N3"]
+     new_named_args["K"] = named_args["K1"] + named_args["K2"] + named_args["K3"]
+     new_named_args["A"] = named_args["A11"]
+     new_named_args["B"] = named_args["B11"]
+     new_named_args["C"] = named_args["C11"]
+     return early_config_prune(config, new_named_args, **kwargs)
+
+
+ @triton.autotune(
+     configs=TRITON_CONFIGS,
+     key=["M1", "M2", "M3", "N1", "N2", "N3", "K1", "K2", "K3"],
+     prune_configs_by={
+         "early_config_prune": our_early_config_prune,
+         "perf_model": our_estimate_matmul_time,
+         "top_k": 10,
+     },
+ )
+ @triton.heuristics(
+     {
+         "EVEN_K": lambda args: all(
+             k % (args["BLOCK_K"] * args["SPLIT_K"]) == 0
+             for k in [args["K1"], args["K2"], args["K3"]]
+         ),
+     }
+ )
+ @triton.jit()
+ def _xformers_tiled_matmul_kernel(
+     A11,
+     A12,
+     A13,
+     A21,
+     A22,
+     A23,
+     A31,
+     A32,
+     A33,
+     B11,
+     B12,
+     B13,
+     B21,
+     B22,
+     B23,
+     B31,
+     B32,
+     B33,
+     C11,
+     C12,
+     C13,
+     C21,
+     C22,
+     C23,
+     C31,
+     C32,
+     C33,
+     M1,
+     M2,
+     M3,
+     N1,
+     N2,
+     N3,
+     K1,
+     K2,
+     K3,
+     stride_am1,
+     stride_am2,
+     stride_am3,
+     stride_ak1,
+     stride_ak2,
+     stride_ak3,
+     stride_bk1,
+     stride_bk2,
+     stride_bk3,
+     stride_bn1,
+     stride_bn2,
+     stride_bn3,
+     stride_cm1,
+     stride_cm2,
+     stride_cm3,
+     stride_cn1,
+     stride_cn2,
+     stride_cn3,
+     BLOCK_M: tl.constexpr,  # DO NOT CHANGE NAME: MUST MATCH PERF MODEL
+     BLOCK_N: tl.constexpr,  # DO NOT CHANGE NAME: MUST MATCH PERF MODEL
+     BLOCK_K: tl.constexpr,  # DO NOT CHANGE NAME: MUST MATCH PERF MODEL
+     GROUP_M: tl.constexpr,
+     SPLIT_K: tl.constexpr,  # DO NOT CHANGE NAME: MUST MATCH PERF MODEL
+     EVEN_K: tl.constexpr,
+     ACC_TYPE: tl.constexpr,
+ ):
+     # matrix multiplication
+     pid = tl.program_id(0)
+     pid_k = tl.program_id(1)
+     grid_m1 = tl.cdiv(M1, BLOCK_M)
+     grid_m2 = tl.cdiv(M2, BLOCK_M)
+     grid_m3 = tl.cdiv(M3, BLOCK_M)
+     grid_n1 = tl.cdiv(N1, BLOCK_N)
+     grid_n2 = tl.cdiv(N2, BLOCK_N)
+     grid_n3 = tl.cdiv(N3, BLOCK_N)
+     grid_m = grid_m1 + grid_m2 + grid_m3
+     grid_n = grid_n1 + grid_n2 + grid_n3
+
+     # re-order program ID for better L2 performance
+     width = GROUP_M * grid_n
+     group_id = pid // width
+     group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
+     pid_m = group_id * GROUP_M + (pid % group_size)
+     pid_n = (pid % width) // (group_size)
+
+     # We use tl.where to circumvent a regression in alignment auto-detection:
+     # https://github.com/openai/triton/issues/1784
+
+     A1 = tl.where(pid_m < grid_m1, A11, tl.where(pid_m < grid_m1 + grid_m2, A21, A31))
+     A2 = tl.where(pid_m < grid_m1, A12, tl.where(pid_m < grid_m1 + grid_m2, A22, A32))
+     A3 = tl.where(pid_m < grid_m1, A13, tl.where(pid_m < grid_m1 + grid_m2, A23, A33))
+     B1 = tl.where(pid_n < grid_n1, B11, tl.where(pid_n < grid_n1 + grid_n2, B12, B13))
+     B2 = tl.where(pid_n < grid_n1, B21, tl.where(pid_n < grid_n1 + grid_n2, B22, B23))
+     B3 = tl.where(pid_n < grid_n1, B31, tl.where(pid_n < grid_n1 + grid_n2, B32, B33))
+     C = tl.where(
+         pid_m < grid_m1,
+         tl.where(pid_n < grid_n1, C11, tl.where(pid_n < grid_n1 + grid_n2, C12, C13)),
+         tl.where(
+             pid_m < grid_m1 + grid_m2,
+             tl.where(
+                 pid_n < grid_n1, C21, tl.where(pid_n < grid_n1 + grid_n2, C22, C23)
+             ),
+             tl.where(
+                 pid_n < grid_n1, C31, tl.where(pid_n < grid_n1 + grid_n2, C32, C33)
+             ),
+         ),
+     )
+     M = tl.where(pid_m < grid_m1, M1, tl.where(pid_m < grid_m1 + grid_m2, M2, M3))
+     N = tl.where(pid_n < grid_n1, N1, tl.where(pid_n < grid_n1 + grid_n2, N2, N3))
+     stride_ak = tl.where(
+         pid_m < grid_m1,
+         stride_ak1,
+         tl.where(pid_m < grid_m1 + grid_m2, stride_ak2, stride_ak3),
+     )
+     stride_bk = tl.where(
+         pid_n < grid_n1,
+         stride_bk1,
+         tl.where(pid_n < grid_n1 + grid_n2, stride_bk2, stride_bk3),
+     )
+     stride_cn = tl.where(
+         pid_m < grid_m1,
+         stride_cn1,
+         tl.where(pid_m < grid_m1 + grid_m2, stride_cn2, stride_cn3),
+     )
+     stride_cm = tl.where(
+         pid_n < grid_n1,
+         stride_cm1,
+         tl.where(pid_n < grid_n1 + grid_n2, stride_cm2, stride_cm3),
+     )
+     pid_m = tl.where(
+         pid_m < grid_m1,
+         pid_m,
+         tl.where(pid_m < grid_m1 + grid_m2, pid_m - grid_m1, pid_m - grid_m1 - grid_m2),
+     )
+     pid_n = tl.where(
+         pid_n < grid_n1,
+         pid_n,
+         tl.where(pid_n < grid_n1 + grid_n2, pid_n - grid_n1, pid_n - grid_n1 - grid_n2),
+     )
+
+     # do matrix multiplication
+     rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+     rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+     ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+     rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+     # pointers
+     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+     grid_k1 = tl.cdiv(K1, BLOCK_K)
+     grid_k2 = tl.cdiv(K2, BLOCK_K)
+     grid_k3 = tl.cdiv(K3, BLOCK_K)
+     for tile in range(pid_k, grid_k1 + grid_k2 + grid_k3, SPLIT_K):
+         A = tl.where(tile < grid_k1, A1, tl.where(tile < grid_k1 + grid_k2, A2, A3))
+         B = tl.where(tile < grid_k1, B1, tl.where(tile < grid_k1 + grid_k2, B2, B3))
+         K = tl.where(tile < grid_k1, K1, tl.where(tile < grid_k1 + grid_k2, K2, K3))
+         stride_am = tl.where(
+             tile < grid_k1,
+             stride_am1,
+             tl.where(tile < grid_k1 + grid_k2, stride_am2, stride_am3),
+         )
+         stride_bn = tl.where(
+             tile < grid_k1,
+             stride_bn1,
+             tl.where(tile < grid_k1 + grid_k2, stride_bn2, stride_bn3),
+         )
+         my_tile = tl.where(
+             tile < grid_k1,
+             tile,
+             tl.where(
+                 tile < grid_k1 + grid_k2, tile - grid_k1, tile - grid_k1 - grid_k2
+             ),
+         )
+         rk = my_tile * BLOCK_K + tl.arange(0, BLOCK_K)
+         Ain = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+         Bin = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+         if EVEN_K:
+             a = tl.load(Ain)
+             b = tl.load(Bin)
+         else:
+             a = tl.load(Ain, mask=rk[None, :] < K, other=0.0)
+             b = tl.load(Bin, mask=rk[:, None] < K, other=0.0)
+         acc += tl.dot(a, b, allow_tf32=False)
+     acc = acc.to(C.dtype.element_ty)
+     # rematerialize rm and rn to save registers
+     rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+     rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+     C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn)
+     mask = (rm < M)[:, None] & (rn < N)[None, :]
+     # handles write-back with reduction-splitting
+     if SPLIT_K == 1:
+         tl.store(C, acc, mask=mask)
+     else:
+         tl.atomic_add(C, acc, mask=mask)
+
+
+ def _check_row_or_column(row_or_col_type, row_or_col_idx, tensor_name, dim_name, vals):
+     assert len(vals) > 0
+     for pos, val in enumerate(vals[1:]):
+         assert val == vals[0], (
+             f"the tensors on {row_or_col_type} {row_or_col_idx} of the {tensor_name} "
+             f"must all have the same stride along the {dim_name} dimension, got "
+             f"{vals[0]} at position 0 and {val} at position {pos + 1}"
+         )
+     return vals[0]
+
+
+ def _get_strides(
+     ts: List[List[torch.Tensor]], tensor_name, dim_0_name, dim_1_name
+ ) -> Tuple[List[int], List[int]]:
+     strides_0 = [
+         _check_row_or_column(
+             "column", idx, tensor_name, dim_0_name, [y.stride(0) for y in x]
+         )
+         for idx, x in enumerate(zip(*ts))
+     ]
+     strides_1 = [
+         _check_row_or_column(
+             "row", idx, tensor_name, dim_1_name, [y.stride(1) for y in x]
+         )
+         for idx, x in enumerate(ts)
+     ]
+     assert all(s == 1 for s in strides_0) or all(s == 1 for s in strides_1)
+     while len(strides_0) < 3:
+         strides_0.append(1 if strides_0[0] == 1 else 0)
+     while len(strides_1) < 3:
+         strides_1.append(1 if strides_1[0] == 1 else 0)
+     return strides_0, strides_1
+
+
+ def _launch_triton_matmul(
+     a: List[List[torch.Tensor]],
+     b: List[List[torch.Tensor]],
+     c: List[List[torch.Tensor]],
+     ms: List[int],
+     ns: List[int],
+     ks: List[int],
+ ) -> None:
+     strides_am, strides_ak = _get_strides(a, "first operand", "m", "k")
+     strides_bk, strides_bn = _get_strides(b, "second operand", "k", "n")
+     strides_cm, strides_cn = _get_strides(c, "output", "m", "n")
+
+     # accumulator types
+     ACC_TYPE = (
+         tl.float32
+         if c[0][0].dtype in [torch.float16, torch.bfloat16, torch.float32]
+         else tl.int32
+     )
+
+     # launch kernel
+     def grid(META):
+         return (
+             sum(triton.cdiv(m, META["BLOCK_M"]) for m in ms)
+             * sum(triton.cdiv(n, META["BLOCK_N"]) for n in ns),
+             META["SPLIT_K"],
+         )
+
+     _xformers_tiled_matmul_kernel[grid](
+         *[
+             a[min(i, len(a) - 1)][min(j, len(a[0]) - 1)]
+             for i in range(3)
+             for j in range(3)
+         ],
+         *[
+             b[min(i, len(b) - 1)][min(j, len(b[0]) - 1)]
+             for i in range(3)
+             for j in range(3)
+         ],
+         *[
+             c[min(i, len(c) - 1)][min(j, len(c[0]) - 1)]
+             for i in range(3)
+             for j in range(3)
+         ],
+         *[ms[i] if len(ms) > i else 0 for i in range(3)],
+         *[ns[i] if len(ns) > i else 0 for i in range(3)],
+         *[ks[i] if len(ks) > i else 0 for i in range(3)],
+         *strides_am,
+         *strides_ak,
+         *strides_bk,
+         *strides_bn,
+         *strides_cm,
+         *strides_cn,
+         ACC_TYPE=ACC_TYPE,
+     )
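The kernel walks a 3x3 grid of operand tiles and accumulates across the K tiles, so its result equals concatenating the tiles first and doing one matmul. A 2x2 toy version of that equivalence (shapes are arbitrary assumptions; only the inner dimensions must line up):

import torch

a = [[torch.randn(16, 32), torch.randn(16, 8)],
     [torch.randn(4, 32), torch.randn(4, 8)]]
b = [[torch.randn(32, 10), torch.randn(32, 6)],
     [torch.randn(8, 10), torch.randn(8, 6)]]

A = torch.cat([torch.cat(row, dim=1) for row in a], dim=0)  # (20, 40)
B = torch.cat([torch.cat(row, dim=1) for row in b], dim=0)  # (40, 16)
C = A @ B  # what the tiled kernel writes back block by block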
.venv/lib/python3.11/site-packages/xformers/ops/fmha/__init__.py ADDED
@@ -0,0 +1,893 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ #
+ # This source code is licensed under the BSD license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from typing import Any, List, Optional, Sequence, Tuple, Type, Union, cast
+
+ import torch
+
+ from . import (
+     attn_bias,
+     ck,
+     ck_decoder,
+     ck_splitk,
+     cutlass,
+     flash,
+     flash3,
+     triton_splitk,
+ )
+ from .attn_bias import (
+     VARLEN_BIASES,
+     AttentionBias,
+     BlockDiagonalMask,
+     LowerTriangularMask,
+ )
+ from .common import (
+     AttentionBwOpBase,
+     AttentionFwOpBase,
+     AttentionOp,
+     AttentionOpBase,
+     Context,
+     Gradients,
+     Inputs,
+     bmk2bmhk,
+ )
+ from .dispatch import (
+     _dispatch_bw,
+     _dispatch_fw,
+     _ensure_op_supports_or_raise,
+     _get_use_fa3,
+     _set_use_fa3,
+ )
+
+ MemoryEfficientAttentionCutlassOp = (cutlass.FwOp, cutlass.BwOp)
+ MemoryEfficientAttentionCutlassFwdFlashBwOp = (cutlass.FwOp, flash.BwOp)
+ MemoryEfficientAttentionFlashAttentionOp = (flash.FwOp, flash.BwOp)
+ MemoryEfficientAttentionCkOp = (ck.FwOp, ck.BwOp)
+ MemoryEfficientAttentionCkDecoderOp = (ck_decoder.FwOp, ck.BwOp)
+ MemoryEfficientAttentionSplitKCkOp = (ck_splitk.FwOp, ck.BwOp)
+
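These tuples are ready-made (forward, backward) pairs for the `op=` argument of `memory_efficient_attention`, pinning both implementations instead of relying on automatic dispatch. A hedged usage sketch (the tensor shapes are assumptions, and the chosen ops must support the inputs, otherwise a ValueError is raised at call time):

import torch
import xformers.ops as xops

q, k, v = (
    torch.randn(2, 512, 8, 64, device="cuda", dtype=torch.float16) for _ in range(3)
)
# Force the FlashAttention forward/backward pair:
y = xops.memory_efficient_attention(
    q, k, v, op=xops.MemoryEfficientAttentionFlashAttentionOp
)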
+
+ def _deserialize_bias(attn_bias_ctx, attn_bias_tensor: Optional[torch.Tensor]) -> Any:
+     if attn_bias_tensor is None:
+         return attn_bias_ctx
+     return attn_bias_tensor
+
+
+ # Note: `torch.compile` only allows custom autograd functions
+ # to accept a subset of types. Therefore we serialize `op` objects
+ # to `str` before entering the function, and unserialize them inside.
+ # See also: https://github.com/pytorch/pytorch/issues/118395
+ _OPS_LOOKUP = {
+     flash.FwOp.NAME: flash.FwOp,
+     flash.BwOp.NAME: flash.BwOp,
+ }
+
+
+ def _serialize_op(op):
+     if op is not None and op.NAME in _OPS_LOOKUP:
+         return op.NAME
+     return op
+
+
+ def _unserialize_op(op):
+     if isinstance(op, str):
+         return _OPS_LOOKUP[op]
+     return op
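Only ops registered in `_OPS_LOOKUP` round-trip through strings; everything else passes through unchanged. An illustrative check of that contract (assuming the flash extension is available):

# A registered op crosses the autograd.Function boundary as a
# compile-friendly string and comes back as the same class:
serialized = _serialize_op(flash.FwOp)
assert isinstance(serialized, str)
assert _unserialize_op(serialized) is flash.FwOp
# `None` and unregistered ops are passed through as-is:
assert _serialize_op(None) is None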
+
+
+ class _fMHA(torch.autograd.Function):
+     @staticmethod
+     # type: ignore
+     def forward(ctx, op_fw, op_bw, *args: Any) -> Any:
+         inp = Inputs(*args)
+
+         op_fw = _unserialize_op(op_fw)
+         op_bw = _unserialize_op(op_bw)
+
+         out, op_ctx = _memory_efficient_attention_forward_requires_grad(
+             inp=inp, op=op_fw
+         )
+
+         # Saving attn_bias is a bit complicated, as the
+         # torch part should go in `save_for_backward`
+         if isinstance(inp.attn_bias, torch.Tensor):
+             attn_bias_tensor = inp.attn_bias
+             attn_bias_ctx = None
+         else:
+             attn_bias_tensor = None
+             attn_bias_ctx = inp.attn_bias
+
+         ctx.save_for_backward(
+             inp.query,
+             inp.key,
+             inp.value,
+             op_ctx.out,
+             op_ctx.lse,
+         )
+         ctx.rng_state = op_ctx.rng_state
+         ctx.attn_bias_tensor = attn_bias_tensor
+         if op_ctx.op_bw is not None:
+             if op_bw is not None and op_bw is not op_ctx.op_bw:
+                 raise ValueError(
+                     f"Specified op_bw={op_bw.NAME}, but forward op "
+                     f"can only run with op_bw={op_ctx.op_bw.NAME}. Please set op_bw=None."
+                 )
+             op_bw = op_ctx.op_bw
+         if (
+             op_bw is not None
+             and isinstance(inp.attn_bias, VARLEN_BIASES)
+             and inp.attn_bias.q_seqinfo.seqstart.shape[0] > 2
+             and op_bw.VARLEN_LSE_PACKED != op_fw.VARLEN_LSE_PACKED
+         ):
+             raise ValueError(
+                 f"Specified op_bw={op_bw.NAME} is not compatible with the "
+                 f"op_fw={op_fw.NAME}, because they use different formats of logsumexp. "
+                 f"NOTE: This is new with xFormers 0.0.28"
+             )
+         if op_bw is None and (
+             inp.query.requires_grad or inp.key.requires_grad or inp.value.requires_grad
+         ):
+             varlen_lse_packed = _detect_lse_packed_or_raise(op_ctx.lse, inp)
+             if varlen_lse_packed is not None and op_fw is not None:
+                 assert (
+                     op_fw.VARLEN_LSE_PACKED == varlen_lse_packed
+                 ), f"{op_fw.NAME}: wrong value for `VARLEN_LSE_PACKED`?"
+             # NOTE: We need to check tensor strides to decide which operator we run in the BW pass.
+             # Unfortunately, PyTorch only allows us to call this function during the FW pass, so
+             # we decide on the operator to use now.
+             op_bw = _dispatch_bw(inp, varlen_lse_packed=varlen_lse_packed)
+         ctx.op_fw = op_fw
+         ctx.op_bw = op_bw
+         ctx.p = inp.p
+         # This allows gradients to be created from a single storage,
+         # to avoid a "cat" in the BW pass.
+         # The heuristic is approximate, but:
+         # (1) It's not a big issue to create a shared storage
+         # (2) The heuristic needs to pass `torch.compile`
+         #     (this is also why we run it in the FW pass; the BW pass is stricter)
+         ctx.qkv_share_storage = (
+             inp.query.shape[0] == inp.key.shape[0]
+             and inp.query.shape[-1] == inp.value.shape[-1]
+             and inp.query.stride(-2)
+             == (inp.key.shape[-1] + inp.query.shape[-1] + inp.value.shape[-1])
+         )
+
+         ctx.scale = inp.scale
+         ctx.attn_bias_ctx = attn_bias_ctx
+         ctx.n_args = len(args)
+         return out, op_ctx.lse
+
+     @staticmethod
+     @torch.autograd.function.once_differentiable
+     def backward(ctx, grad, grad_lse):
+         # Re-create context
+         query, key, value, out, lse = ctx.saved_tensors
+         attn_bias_tensor = ctx.attn_bias_tensor
+         rng_state = ctx.rng_state
+         inp = Inputs(
+             query=query,
+             key=key,
+             value=value,
+             attn_bias=_deserialize_bias(ctx.attn_bias_ctx, attn_bias_tensor),
+             p=ctx.p,
+             scale=ctx.scale,
+         )
+         op_ctx = Context(
+             lse=lse,
+             out=out,
+             rng_state=rng_state,
+         )
+         grads = _memory_efficient_attention_backward(
+             ctx=op_ctx,
+             inp=inp,
+             grad=grad,
+             op=ctx.op_bw,
+             _skip_op_checks=True,
+         )
+         return (None, None, grads.dq, grads.dk, grads.dv, grads.db) + (None,) * (
+             ctx.n_args - 2
+         )
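The `qkv_share_storage` heuristic above tests whether q/k/v look like views into one packed buffer by comparing `query.stride(-2)` with the summed last dimensions. A toy CPU illustration of the layout it detects (shapes are assumptions):

import torch

# One packed [B, M, 3*K] buffer, split into q/k/v views along the last dim:
qkv = torch.randn(2, 16, 3 * 8)
q, k, v = qkv.chunk(3, dim=-1)
# Each row of q strides across all three chunks, so the heuristic fires:
assert q.stride(-2) == q.shape[-1] + k.shape[-1] + v.shape[-1]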
+
+
+ def memory_efficient_attention(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
+     p: float = 0.0,
+     scale: Optional[float] = None,
+     *,
+     op: Optional[AttentionOp] = None,
+     output_dtype: Optional[torch.dtype] = None,
+ ) -> torch.Tensor:
+     """Implements the memory-efficient attention mechanism following
+     `"Self-Attention Does Not Need O(n^2) Memory" <http://arxiv.org/abs/2112.05682>`_.
+
+     :Inputs shape:
+
+     - Input tensors must be in format ``[B, M, H, K]``, where B is the batch size, M \
+         the sequence length, H the number of heads, and K the embedding size per head
+
+     - If inputs have dimension 3, it is assumed that the dimensions are ``[B, M, K]`` and ``H=1``
+
+     - Inputs can also be of dimension 5 with GQA - see note below
+
+     - Inputs can be non-contiguous - we only require the last dimension's stride to be 1
+
+
+     :Equivalent pytorch code:
+
+     .. code-block:: python
+
+         scale = 1.0 / query.shape[-1] ** 0.5
+         query = query * scale
+         query = query.transpose(1, 2)
+         key = key.transpose(1, 2)
+         value = value.transpose(1, 2)
+         attn = query @ key.transpose(-2, -1)
+         if attn_bias is not None:
+             attn = attn + attn_bias
+         attn = attn.softmax(-1)
+         attn = F.dropout(attn, p)
+         attn = attn @ value
+         return attn.transpose(1, 2)
+
+     :Examples:
+
+     .. code-block:: python
+
+         import xformers.ops as xops
+
+         # Compute regular attention
+         y = xops.memory_efficient_attention(q, k, v)
+
+         # With a dropout of 0.2
+         y = xops.memory_efficient_attention(q, k, v, p=0.2)
+
+         # Causal attention
+         y = xops.memory_efficient_attention(
+             q, k, v,
+             attn_bias=xops.LowerTriangularMask()
+         )
+
+     :Supported hardware:
+
+         NVIDIA GPUs with compute capability above 6.0 (P100+), datatypes ``f16``, ``bf16`` and ``f32``.
+
+     :EXPERIMENTAL: Using with Multi Query Attention (MQA) and Grouped Query Attention (GQA):
+
+         MQA/GQA is an experimental feature supported only for the forward pass.
+         If you have 16 heads in query, and 2 in key/value, you can provide 5-dim tensors
+         in the ``[B, M, G, H, K]`` format, where ``G`` is the number of head groups (here 2), and
+         ``H`` is the number of heads per group (8 in the example).
+
+         Please note that xFormers will not automatically broadcast the inputs, so you will need
+         to broadcast them manually before calling `memory_efficient_attention`.
+
+     :GQA/MQA example:
+
+     .. code-block:: python
+
+         import torch
+         import xformers.ops as xops
+
+         B, M, K = 3, 32, 128
+         kwargs = dict(device="cuda", dtype=torch.float16)
+         q = torch.randn([B, M, 8, K], **kwargs)
+         k = torch.randn([B, M, 2, K], **kwargs)
+         v = torch.randn([B, M, 2, K], **kwargs)
+         out_gqa = xops.memory_efficient_attention(
+             q.reshape([B, M, 2, 4, K]),
+             k.reshape([B, M, 2, 1, K]).expand([B, M, 2, 4, K]),
+             v.reshape([B, M, 2, 1, K]).expand([B, M, 2, 4, K]),
+         )
+
+     Raises:
+         NotImplementedError: if there is no operator available to compute the MHA
+         ValueError: if inputs are invalid
+
+     :parameter query: Tensor of shape ``[B, Mq, H, K]``
+     :parameter key: Tensor of shape ``[B, Mkv, H, K]``
+     :parameter value: Tensor of shape ``[B, Mkv, H, Kv]``
+     :parameter attn_bias: Bias to apply to the attention matrix - defaults to no masking. \
+         For common biases implemented efficiently in xFormers, see :attr:`xformers.ops.fmha.attn_bias.AttentionBias`. \
+         This can also be a :attr:`torch.Tensor` for an arbitrary mask (slower).
+     :parameter p: Dropout probability. Disabled if set to ``0.0``
+     :parameter scale: Scaling factor for ``Q @ K.transpose()``. If set to ``None``, the default \
+         scale (q.shape[-1]**-0.5) will be used.
+     :parameter op: The operators to use - see :attr:`xformers.ops.AttentionOpBase`. \
+         If set to ``None`` (recommended), xFormers \
+         will dispatch to the best available operator, depending on the inputs \
+         and options.
+     :return: multi-head attention Tensor with shape ``[B, Mq, H, Kv]``
+     """
+     return _memory_efficient_attention(
+         Inputs(
+             query=query,
+             key=key,
+             value=value,
+             p=p,
+             attn_bias=attn_bias,
+             scale=scale,
+             output_dtype=output_dtype,
+         ),
+         op=op,
+     )
+
+
+ torch.library.define(
+     "xformer::memory_efficient_attention_forward",
+     "(Tensor q, Tensor k, Tensor v, Tensor? b = None, float? p = 0.0, float? scale = None) -> Tensor",
+ )
+
+
+ @torch.library.impl("xformer::memory_efficient_attention_forward", "Meta")
+ def memory_efficient_attention_forward_meta(q, k, v):
+     return q.new_empty(q.shape)
+
+
+ # torch.compile has issues when tracing through op dispatch and ensure_op_support,
+ # so provide a wrapper to register it as a custom torch library op.
+ @torch.library.impl("xformer::memory_efficient_attention_forward", "CUDA")
+ def memory_efficient_attention_forward_torch_wrapper(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
+     p: float = 0.0,
+     scale: Optional[float] = None,
+ ) -> torch.Tensor:
+     """
+     This provides a torch-compilable wrapper op for
+     memory_efficient_attention_forward in certain special cases.
+
+     Note that the following are not supported:
+     - `op` input (?)
+     - certain attn_bias types (?)
+     - output_dtype
+     - K != Kv
+     """
+     return memory_efficient_attention_forward(
+         query,
+         key,
+         value,
+         attn_bias,
+         p,
+         scale,
+     )
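Once defined, the op is also reachable through the `torch.ops` namespace, which is what makes it visible to `torch.compile`. A hedged call under the wrapper's stated restrictions (CUDA tensors only, bias as a plain tensor or None; shapes are illustrative):

import torch

q = k = v = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.float16)
out = torch.ops.xformer.memory_efficient_attention_forward(q, k, v)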
+
+
+ def memory_efficient_attention_forward(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
+     p: float = 0.0,
+     scale: Optional[float] = None,
+     *,
+     op: Optional[Type[AttentionFwOpBase]] = None,
+     output_dtype: Optional[torch.dtype] = None,
+ ) -> torch.Tensor:
+     """
+     Calculates the forward pass of :attr:`xformers.ops.memory_efficient_attention`.
+     """
+     return _memory_efficient_attention_forward(
+         Inputs(
+             query=query,
+             key=key,
+             value=value,
+             p=p,
+             attn_bias=attn_bias,
+             scale=scale,
+             output_dtype=output_dtype,
+         ),
+         op=op,
+     )
+
+
+ def memory_efficient_attention_forward_requires_grad(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
+     p: float = 0.0,
+     scale: Optional[float] = None,
+     *,
+     op: Optional[Type[AttentionFwOpBase]] = None,
+     output_dtype: Optional[torch.dtype] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Returns a tuple (output, lse), where `lse` can be used to compute the backward pass later.
+     See :attr:`xformers.ops.memory_efficient_attention` for an explanation of the arguments.
+     See :attr:`xformers.ops.memory_efficient_attention_backward` for running the backward pass.
+     """
+     if p != 0.0:
+         raise NotImplementedError(
+             "dropout is not supported on the non-autograd API."
+             " If you want to use dropout, please call `memory_efficient_attention` directly"
+         )
+     out, ctx = _memory_efficient_attention_forward_requires_grad(
+         Inputs(
+             query=query,
+             key=key,
+             value=value,
+             p=p,
+             attn_bias=attn_bias,
+             scale=scale,
+             output_dtype=output_dtype,
+         ),
+         op=op,
+     )
+     return out, ctx.lse
+
+
+ def memory_efficient_attention_backward(
+     grad: torch.Tensor,
+     output: torch.Tensor,
+     lse: torch.Tensor,
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
+     p: float = 0.0,
+     scale: Optional[float] = None,
+     *,
+     op: Optional[Type[AttentionBwOpBase]] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Computes the gradient of the attention.
+     Returns a tuple (dq, dk, dv).
+     See :attr:`xformers.ops.memory_efficient_attention` for an explanation of the arguments.
+     `lse` is the tensor returned by
+     :attr:`xformers.ops.memory_efficient_attention_forward_requires_grad`.
+     """
+     if p != 0.0:
+         raise NotImplementedError(
+             "dropout is not supported on the non-autograd API."
+             " If you want to use dropout, please call `memory_efficient_attention` directly"
+         )
+     gradients = _memory_efficient_attention_backward(
+         Context(out=output, lse=lse),
+         Inputs(
+             query=query, key=key, value=value, p=p, attn_bias=attn_bias, scale=scale
+         ),
+         grad,
+         op=op,
+     )
+     return (gradients.dq, gradients.dk, gradients.dv)
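Together, the two functions above form the manual, non-autograd API: keep the `lse` returned by the forward pass and feed it back when computing gradients yourself. A hedged end-to-end sketch (shapes and the stand-in upstream gradient are assumptions):

import torch
import xformers.ops.fmha as fmha

q, k, v = (
    torch.randn(2, 64, 4, 32, device="cuda", dtype=torch.float16) for _ in range(3)
)
out, lse = fmha.memory_efficient_attention_forward_requires_grad(q, k, v)
grad_out = torch.randn_like(out)  # stand-in for the real d(loss)/d(out)
dq, dk, dv = fmha.memory_efficient_attention_backward(grad_out, out, lse, q, k, v)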
+
+
+ def _memory_efficient_attention(
+     inp: Inputs, op: Optional[AttentionOp] = None
+ ) -> torch.Tensor:
+     # fast-path that doesn't require computing the logsumexp for backward computation
+     if all(x.requires_grad is False for x in [inp.query, inp.key, inp.value]):
+         return _memory_efficient_attention_forward(
+             inp, op=op[0] if op is not None else None
+         )
+
+     output_shape = inp.normalize_bmhk()
+
+     op_fw = _serialize_op(op[0] if op is not None else None)
+     op_bw = _serialize_op(op[1] if op is not None else None)
+     return _fMHA.apply(
+         op_fw, op_bw, inp.query, inp.key, inp.value, inp.attn_bias, inp.p, inp.scale
+     )[0].reshape(output_shape)
+
+
+ def _memory_efficient_attention_forward(
+     inp: Inputs, op: Optional[Type[AttentionFwOpBase]]
+ ) -> torch.Tensor:
+     inp.validate_inputs()
+     output_shape = inp.normalize_bmhk()
+     if op is None:
+         op = _dispatch_fw(inp, False)
+     else:
+         _ensure_op_supports_or_raise(ValueError, "memory_efficient_attention", op, inp)
+
+     out, *_ = op.apply(inp, needs_gradient=False)
+     return out.reshape(output_shape)
+
+
+ def _memory_efficient_attention_forward_requires_grad(
+     inp: Inputs, op: Optional[Type[AttentionFwOpBase]]
+ ) -> Tuple[torch.Tensor, Context]:
+     inp.validate_inputs()
+     output_shape = inp.normalize_bmhk()
+     if op is None:
+         op = _dispatch_fw(inp, True)
+     else:
+         _ensure_op_supports_or_raise(ValueError, "memory_efficient_attention", op, inp)
+     out = op.apply(inp, needs_gradient=True)
+     assert out[1] is not None
+     return (out[0].reshape(output_shape), out[1])
+
+
+ def _detect_lse_packed_or_raise(lse: torch.Tensor, inp: Inputs) -> Optional[bool]:
+     """
+     Detects the LSE format if we're in a varlen case.
+     Returns `None` if the format is not relevant (e.g. not varlen).
+     Raises an exception if the `lse` has the wrong shape.
+     """
+     shape_mismatch_err = (
+         "Input tensors have incompatible shapes.\n"
+         f"    lse.shape  : {lse.shape}\n"
+         f"    query.shape: {inp.query.shape}\n"
+         f"    attn_bias  : {type(inp.attn_bias)}"
+     )
+     # 1. Check ndim & head dimensions
+     # In any case, LSE should be [*, *GH]
+     if lse.ndim != (inp.query.ndim - 1) or lse.shape[1:-1] != inp.query.shape[2:-1]:
+         raise ValueError(shape_mismatch_err)
+     lse_bm = [lse.shape[0], lse.shape[-1]]
+     lse_packed_shape = [inp.query.shape[0], inp.query.shape[1]]
+     lse_packed = lse_bm[0] == lse_packed_shape[0] and lse_bm >= lse_packed_shape
+     # 2. Check correctness for varlen biases, with query.shape = [1, M, *GH, K]
+     # Either [1, *GH, M] (packed)
+     # Or [num_seq, *GH, Mq] .. with `Mq >= max_q` (padded)
+     if isinstance(inp.attn_bias, VARLEN_BIASES):
+         si = inp.attn_bias.q_seqinfo
+         lse_padded_shape = [si.seqstart.shape[0] - 1, si.max_seqlen]
+         lse_padded = lse_bm[0] == lse_padded_shape[0] and lse_bm >= lse_padded_shape
+         if lse_packed and lse_padded:
+             return None
+         elif lse_packed:
+             return True
+         elif lse_padded:
+             return False
+         raise ValueError(shape_mismatch_err)
+     # 3. For non-varlen, shape must be [B, *GH, M] with query.shape=[B, M, *GH, K]
+     if not lse_packed:
+         raise ValueError(shape_mismatch_err)
+     return None
+
+
+ def _memory_efficient_attention_backward(
+     ctx: Context,
+     inp: Inputs,
+     grad: torch.Tensor,
+     op: Optional[Type[AttentionBwOpBase]],
+     *,
+     _skip_op_checks: bool = False,
+ ) -> Gradients:
+     """Warning: grad/ctx.out is potentially in BMK format"""
+     inp.validate_inputs()
+     if grad.ndim != inp.query.ndim or grad.ndim != ctx.out.ndim:
+         raise ValueError(
+             "All tensors should be either in BMK (ndim=3) or BMHK (ndim=4) format. \n"
+             f"grad.shape : {grad.shape} \n"
+             f"out.shape  : {ctx.out.shape} \n"
+             f"query.shape: {inp.query.shape}"
+         )
+     shape_dq, shape_dk, shape_dv = tuple(
+         x.shape for x in (inp.query, inp.key, inp.value)
+     )
+     inp.normalize_bmhk()
+     varlen_lse_packed = _detect_lse_packed_or_raise(ctx.lse, inp)
+     grad = bmk2bmhk(grad, 1)
+     ctx.out = bmk2bmhk(ctx.out, 1)
+
+     if op is None:
+         op = _dispatch_bw(inp, varlen_lse_packed=varlen_lse_packed)
+     elif not _skip_op_checks:
+         _ensure_op_supports_or_raise(
+             ValueError, "memory_efficient_attention_backward", op, inp
+         )
+         if varlen_lse_packed is not None and varlen_lse_packed != op.VARLEN_LSE_PACKED:
+             raise ValueError(
+                 f"Wrong LSE format for {op.NAME} in the variable-seqlen case. "
+                 f"Double-check that the BW operator {op.NAME} is compatible "
+                 f"with the operator used in the FW pass."
+             )
+
+     grads = op.apply(ctx, inp, grad)
+     grads.dq = grads.dq.reshape(shape_dq)
+     grads.dk = grads.dk.reshape(shape_dk)
+     grads.dv = grads.dv.reshape(shape_dv)
+     return grads
+
+
+ def memory_efficient_attention_partial(
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
+     p: float = 0.0,
+     scale: Optional[float] = None,
+     *,
+     op: Optional[Union[AttentionOp, Type[AttentionFwOpBase]]] = None,
+     output_dtype: Optional[torch.dtype] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Returns a tuple (output, lse), where `output` is the attention in the style of
+     memory_efficient_attention, and `lse` is extra data, a log-sum-exp.
+     The outputs of calls to this with the same query and separate keys and values
+     can be merged with merge_attentions to obtain the attention of the queries
+     against the disjoint union of the keys and values.
+
+     Warning: The backward pass of this function is quite restricted. In particular,
+     we assume that in the forward pass the outputs were only used in merge_attentions
+     calculations, and that LSEs weren't used anywhere except in merge_attentions.
+     """
+     if p != 0.0:
+         raise NotImplementedError("dropout is not supported.")
+     fwop: Optional[Type[AttentionFwOpBase]] = op[0] if isinstance(op, tuple) else op
+     inp = Inputs(
+         query=query,
+         key=key,
+         value=value,
+         p=p,
+         attn_bias=attn_bias,
+         scale=scale,
+         output_dtype=output_dtype,
+         is_partial=True,
+     )
+
+     is_grad = torch.is_grad_enabled() and any(
+         x.requires_grad for x in [query, key, value]
+     )
+
+     if not is_grad:
+         out, ctx = _memory_efficient_attention_forward_requires_grad(
+             inp,
+             op=fwop,
+         )
+         return out, ctx.lse
+
+     if query.ndim == 5:
+         raise ValueError("gradients not supported for 5D tensors")
+     if isinstance(op, tuple):
+         op_fw = _serialize_op(op[0])
+         op_bw = _serialize_op(op[1])
+     elif op is None:
+         op_fw = op_bw = None
+     else:
+         op_fw = _serialize_op(op)
+         op_bw = None
+     return _fMHA.apply(
+         op_fw,
+         op_bw,
+         inp.query,
+         inp.key,
+         inp.value,
+         inp.attn_bias,
+         inp.p,
+         inp.scale,
+         inp.output_dtype,
+         inp.is_partial,
+     )
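A hedged sketch of the intended chunked-K/V workflow: run the same queries against disjoint K/V chunks, then combine the partial results with `merge_attentions` (shapes are assumptions):

import torch
import xformers.ops.fmha as fmha

q = torch.randn(1, 32, 8, 64, device="cuda", dtype=torch.bfloat16)
k1, v1 = (torch.randn(1, 128, 8, 64, device="cuda", dtype=torch.bfloat16) for _ in range(2))
k2, v2 = (torch.randn(1, 96, 8, 64, device="cuda", dtype=torch.bfloat16) for _ in range(2))

out1, lse1 = fmha.memory_efficient_attention_partial(q, k1, v1)
out2, lse2 = fmha.memory_efficient_attention_partial(q, k2, v2)
# Equivalent to attending q against the concatenated (k1|k2, v1|v2):
out, _ = fmha.merge_attentions([out1, out2], [lse1, lse2], write_lse=False)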
+
+
+ def merge_attentions(
+     attn_split: Union[torch.Tensor, Sequence[torch.Tensor]],
+     lse_split: Union[torch.Tensor, Sequence[torch.Tensor]],
+     write_lse: bool = True,
+     output_dtype: Optional[torch.dtype] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+     """
+     Combine attention output computed on different parts of K/V for the same
+     query to get attention on the whole K/V. See https://arxiv.org/abs/2402.05099
+     The result is equal to
+         Out_full = (Out1 * exp(LSE1) + Out2 * exp(LSE2) + ...) / (exp(LSE1) + exp(LSE2) + ...)
+         LSE_full = log(exp(LSE1) + exp(LSE2) + ...)
+
+     Args:
+         attn_split: attention outputs for chunks,
+             either as a list of tensors of shapes [B, M, G, H, Kq] or [B, M, H, Kq]
+             or as a single tensor of shape [num_chunks, B, M, G, H, Kq]
+             or [num_chunks, B, M, H, Kq]
+         lse_split: LSE for chunks,
+             either as a list of tensors of shapes [B, G, H, M] or [B, H, M]
+             or as a single tensor of shape [num_chunks, B, G, H, M] or [num_chunks, B, H, M]
+         write_lse: whether to output LSE
+         output_dtype: dtype of attn_out
+
+     Returns:
+         attn_out: [B, M, G, H, Kq] or [B, M, H, Kq]
+         lse_out: [B, G, H, M] or [B, H, M] if write_lse,
+             or None otherwise
+     """
+
+     attn_is_concat = isinstance(attn_split, torch.Tensor)
+     lse_is_concat = isinstance(lse_split, torch.Tensor)
+
+     attn_requires_grad = (
+         attn_split.requires_grad  # type: ignore
+         if attn_is_concat
+         else any(x.requires_grad for x in attn_split)
+     )
+     lse_requires_grad = (
+         lse_split.requires_grad  # type: ignore
+         if lse_is_concat
+         else any(x.requires_grad for x in lse_split)
+     )
+     requires_grad = torch.is_grad_enabled() and (
+         attn_requires_grad or lse_requires_grad
+     )
+     if requires_grad and not write_lse:
+         raise ValueError("write_lse should be true if inputs require gradients.")
+
+     concat_path = attn_is_concat and lse_is_concat and not requires_grad
+     if concat_path:
+         attn_split = cast(torch.Tensor, attn_split)
+         lse_split = cast(torch.Tensor, lse_split)
+         if attn_split.ndim != lse_split.ndim + 1:
+             raise ValueError(
+                 f"Incompatible input shapes: {attn_split.shape=}, {lse_split.shape=}"
+             )
+
+         is_bmhk = attn_split.ndim == 5
+         if is_bmhk:
+             attn_split = attn_split.unsqueeze(3)
+             lse_split = lse_split.unsqueeze(2)
+
+         num_chunks, B, M, G, H, Kq = attn_split.shape
+         num_chunks1, B1, G1, H1, M1 = lse_split.shape
+         if B != B1 or G != G1 or H != H1 or num_chunks != num_chunks1 or M != M1:
+             raise ValueError(
+                 f"Incompatible input shapes: {attn_split.shape=} {lse_split.shape=} "
+                 f"{B}/{B1}, {G}/{G1}, {H}/{H1}, {num_chunks}/{num_chunks1}, {M}/{M1}"
+             )
+
+         attn_split = attn_split.permute(1, 3, 4, 0, 2, 5)
+         lse_split = lse_split.permute(1, 2, 3, 0, 4)
+
+         device = attn_split.device
+         attn_dtype = attn_split.dtype
+         lse_dtype = lse_split.dtype
+     else:
+         if attn_is_concat:
+             attn_split = attn_split.unbind(0)  # type: ignore
+         if lse_is_concat:
+             lse_split = lse_split.unbind(0)  # type: ignore
+         num_chunks = len(attn_split)
+         if len(lse_split) != num_chunks:
+             raise ValueError(
+                 f"Incompatible number of LSE and attention chunks: {len(attn_split)=}, {len(lse_split)=}"
+             )
+
+         attn_unsqueezed = []
+         lse_unsqueezed = []
+         is_bmhk = False
+         for i in range(num_chunks):
+             if attn_split[i].ndim != lse_split[i].ndim + 1:
+                 raise ValueError(
+                     f"Incompatible input shapes for chunk {i}: {attn_split[i].shape=}, {lse_split[i].shape=}"
+                 )
+
+             is_bmhk = attn_split[i].ndim == 4
+             if is_bmhk:
+                 attn_unsqueezed.append(attn_split[i].unsqueeze(2))
+                 lse_unsqueezed.append(lse_split[i].unsqueeze(1))
+             else:
+                 attn_unsqueezed.append(attn_split[i])
+                 lse_unsqueezed.append(lse_split[i])
+         attn_split, lse_split = attn_unsqueezed, lse_unsqueezed
+
+         B, M, G, H, Kq = attn_split[0].shape
+         B1, G1, H1, M1 = lse_split[0].shape
+         if B != B1 or G != G1 or H != H1 or M != M1:
+             raise ValueError(
+                 f"Incompatible input shapes: {attn_split[0].shape=}, {lse_split[0].shape=} "
+                 f"{B}/{B1}, {G}/{G1}, {H}/{H1}, {M}/{M1}"
+             )
+
+         for i in range(num_chunks):
+             if attn_split[i].shape != (B, M, G, H, Kq):
+                 raise ValueError(
+                     f"Incompatible input shapes for attention chunk {i}: "
+                     f"{attn_split[i].shape=}, {(B, M, G, H, Kq)=}"
+                 )
+             if lse_split[i].shape != (B, G, H, M):
+                 raise ValueError(
+                     f"Incompatible input shapes for LSE chunk {i}: "
+                     f"{lse_split[i].shape=}, {(B, G, H, M)=}"
+                 )
+
+             attn_split[i] = attn_split[i].permute(0, 2, 3, 1, 4)  # to (B, G, H, M, Kq)
+
+         device = attn_split[0].device
+         attn_dtype = attn_split[0].dtype
+         lse_dtype = lse_split[0].dtype
+
+     attn_out = torch.empty(
+         B,
+         M,
+         G,
+         H,
+         Kq,
+         device=device,
+         dtype=output_dtype or attn_dtype,
+         requires_grad=requires_grad,
+     )
+     if write_lse:
+         lse_out = torch.empty(
+             B, G, H, M, device=device, dtype=lse_dtype, requires_grad=requires_grad
+         )
+     else:
+         lse_out = None
+
+     if concat_path:
+         triton_splitk.merge_attentions(attn_out, lse_out, attn_split, lse_split)  # type: ignore
+     else:
+         attn_out, lse_out = _MergeAttentions.apply(attn_out, lse_out, *attn_split, *lse_split)  # type: ignore
+
+     if is_bmhk:
+         attn_out = attn_out[:, :, 0]
+         if lse_out is not None:
+             lse_out = lse_out[:, 0]
+
+     return attn_out, lse_out
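The docstring formula can be spelled out as a naive pure-PyTorch reference, useful for checking correctness against the fused Triton path (a sketch for the no-group [num_chunks, B, M, H, K] layout, not the library kernel):

import torch

def merge_reference(attn_split: torch.Tensor, lse_split: torch.Tensor):
    # attn_split: [num_chunks, B, M, H, K]; lse_split: [num_chunks, B, H, M]
    lse_full = torch.logsumexp(lse_split, dim=0)          # [B, H, M]
    weights = torch.exp(lse_split - lse_full)             # [num_chunks, B, H, M]
    weights = weights.permute(0, 1, 3, 2).unsqueeze(-1)   # [num_chunks, B, M, H, 1]
    attn_full = (attn_split * weights).sum(dim=0)         # [B, M, H, K]
    return attn_full, lse_full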
+
+
+ class _MergeAttentions(torch.autograd.Function):
+     @staticmethod
+     # type: ignore
+     def forward(
+         ctx, attn_out: torch.Tensor, lse_out: torch.Tensor, *inputs: torch.Tensor
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         num_chunks = len(inputs) // 2
+         attn_split, lse_split = inputs[:num_chunks], inputs[num_chunks:]
+
+         triton_splitk.merge_attentions_varargs(attn_out, lse_out, attn_split, lse_split)
+
+         ctx.save_for_backward(
+             attn_out,
+             lse_out,
+             *inputs,
+         )
+         return attn_out, lse_out
+
+     @staticmethod
+     # type: ignore
+     def backward(
+         ctx, grad_attn: torch.Tensor, grad_lse: torch.Tensor
+     ) -> Tuple[Optional[torch.Tensor], ...]:
+         out, lse, *inputs = ctx.saved_tensors
+         num_chunks = len(inputs) // 2
+         attn_split, lse_split = inputs[:num_chunks], inputs[num_chunks:]
+         dattn, dlse = triton_splitk.merge_attentions_varargs_backward(
+             attn_split,
+             lse_split,
+             out,
+             lse,
+             grad_attn,
+             grad_lse,
+         )
+         ret = [None, None] + dattn + dlse
+         return tuple(ret)
+
+
+ ALL_FW_OPS: List[Type[AttentionFwOpBase]] = [
+     cutlass.FwOp if torch.version.cuda else ck.FwOp,
+     flash.FwOp,
+     flash3.FwOp,
+     triton_splitk.FwOp,
+ ]
+
+ ALL_BW_OPS: List[Type[AttentionBwOpBase]] = [
+     cutlass.BwOp if torch.version.cuda else ck.BwOp,
+     flash.BwOp,
+     flash3.BwOp,
+ ]
+
+ __all__ = [
+     "AttentionBias",
+     "AttentionOp",
+     "AttentionOpBase",
+     "LowerTriangularMask",
+     "MemoryEfficientAttentionCutlassFwdFlashBwOp",
+     "MemoryEfficientAttentionCutlassOp",
+     "MemoryEfficientAttentionFlashAttentionOp",
+     "memory_efficient_attention",
+     "MemoryEfficientAttentionCkOp",
+     "MemoryEfficientAttentionCkDecoderOp",
+     "ALL_FW_OPS",
+     "ALL_BW_OPS",
+     "attn_bias",
+     "_get_use_fa3",
+     "_set_use_fa3",
+     "BlockDiagonalMask",
+ ]
.venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (36.5 kB)

.venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/attn_bias.cpython-311.pyc ADDED
Binary file (84.4 kB)

.venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/ck.cpython-311.pyc ADDED
Binary file (19.6 kB)

.venv/lib/python3.11/site-packages/xformers/ops/fmha/__pycache__/ck_decoder.cpython-311.pyc ADDED
Binary file (6.87 kB)