danieldk (HF Staff) committed
Commit 8e88928 · verified · 1 Parent(s): d033399

Build uploaded using `kernels`.

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50):
  1. build/torch210-cxx11-cu126-x86_64-linux/__init__.py +0 -14
  2. build/torch210-cxx11-cu126-x86_64-linux/_mamba_ssm_b2a7fd5.abi3.so +0 -3
  3. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +0 -9
  4. build/torch210-cxx11-cu126-x86_64-linux/distributed/__init__.py +0 -0
  5. build/torch210-cxx11-cu126-x86_64-linux/distributed/distributed_utils.py +0 -144
  6. build/torch210-cxx11-cu126-x86_64-linux/distributed/tensor_parallel.py +0 -296
  7. build/torch210-cxx11-cu126-x86_64-linux/mamba_ssm/__init__.py +0 -26
  8. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +0 -1
  9. build/torch210-cxx11-cu126-x86_64-linux/models/__init__.py +0 -0
  10. build/torch210-cxx11-cu126-x86_64-linux/models/config_mamba.py +0 -18
  11. build/torch210-cxx11-cu126-x86_64-linux/models/mixer_seq_simple.py +0 -309
  12. build/torch210-cxx11-cu126-x86_64-linux/modules/__init__.py +0 -0
  13. build/torch210-cxx11-cu126-x86_64-linux/modules/block.py +0 -107
  14. build/torch210-cxx11-cu126-x86_64-linux/modules/mamba2.py +0 -502
  15. build/torch210-cxx11-cu126-x86_64-linux/modules/mamba2_simple.py +0 -229
  16. build/torch210-cxx11-cu126-x86_64-linux/modules/mamba_simple.py +0 -339
  17. build/torch210-cxx11-cu126-x86_64-linux/modules/mha.py +0 -294
  18. build/torch210-cxx11-cu126-x86_64-linux/modules/mlp.py +0 -34
  19. build/torch210-cxx11-cu126-x86_64-linux/modules/ssd_minimal.py +0 -111
  20. build/torch210-cxx11-cu126-x86_64-linux/ops/__init__.py +0 -0
  21. build/torch210-cxx11-cu126-x86_64-linux/ops/selective_scan_interface.py +0 -446
  22. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/__init__.py +0 -0
  23. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/k_activations.py +0 -169
  24. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/layer_norm.py +0 -1113
  25. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/layernorm_gated.py +0 -437
  26. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/selective_state_update.py +0 -285
  27. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/softplus.py +0 -15
  28. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_bmm.py +0 -262
  29. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_chunk_scan.py +0 -0
  30. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_chunk_state.py +0 -997
  31. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_combined.py +0 -998
  32. build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_state_passing.py +0 -348
  33. build/torch210-cxx11-cu126-x86_64-linux/utils/__init__.py +0 -0
  34. build/torch210-cxx11-cu126-x86_64-linux/utils/generation.py +0 -390
  35. build/torch210-cxx11-cu126-x86_64-linux/utils/hf.py +0 -23
  36. build/torch210-cxx11-cu126-x86_64-linux/utils/torch.py +0 -21
  37. build/torch210-cxx11-cu128-x86_64-linux/__init__.py +0 -14
  38. build/torch210-cxx11-cu128-x86_64-linux/_mamba_ssm_b2a7fd5.abi3.so +0 -3
  39. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +0 -9
  40. build/torch210-cxx11-cu128-x86_64-linux/distributed/__init__.py +0 -0
  41. build/torch210-cxx11-cu128-x86_64-linux/distributed/distributed_utils.py +0 -144
  42. build/torch210-cxx11-cu128-x86_64-linux/distributed/tensor_parallel.py +0 -296
  43. build/torch210-cxx11-cu128-x86_64-linux/mamba_ssm/__init__.py +0 -26
  44. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +0 -1
  45. build/torch210-cxx11-cu128-x86_64-linux/models/__init__.py +0 -0
  46. build/torch210-cxx11-cu128-x86_64-linux/models/config_mamba.py +0 -18
  47. build/torch210-cxx11-cu128-x86_64-linux/models/mixer_seq_simple.py +0 -309
  48. build/torch210-cxx11-cu128-x86_64-linux/modules/__init__.py +0 -0
  49. build/torch210-cxx11-cu128-x86_64-linux/modules/block.py +0 -107
  50. build/torch210-cxx11-cu128-x86_64-linux/modules/mamba2.py +0 -502
build/torch210-cxx11-cu126-x86_64-linux/__init__.py DELETED
@@ -1,14 +0,0 @@
__version__ = "2.2.4"

from .ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
from .modules.mamba_simple import Mamba
from .modules.mamba2 import Mamba2
from .models.mixer_seq_simple import MambaLMHeadModel

__all__ = [
    "selective_scan_fn",
    "mamba_inner_fn",
    "Mamba",
    "Mamba2",
    "MambaLMHeadModel",
]
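For reference, a minimal usage sketch of the public API re-exported above; it assumes the built wheel is importable as `mamba_ssm` and that a CUDA device is available (the kernels are GPU-only):

# Sketch, not part of the commit: assumes `mamba_ssm` is importable and CUDA is present.
import torch
from mamba_ssm import Mamba

layer = Mamba(d_model=256, layer_idx=0).to("cuda")
x = torch.randn(2, 64, 256, device="cuda")  # (batch, seqlen, d_model)
y = layer(x)                                # same shape as x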
build/torch210-cxx11-cu126-x86_64-linux/_mamba_ssm_b2a7fd5.abi3.so DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19b5ffd35a9fd55231325ac14270580c019395c0acb3e4e251518042b50b1aed
size 444257200
build/torch210-cxx11-cu126-x86_64-linux/_ops.py DELETED
@@ -1,9 +0,0 @@
import torch

from . import _mamba_ssm_b2a7fd5

ops = torch.ops._mamba_ssm_b2a7fd5


def add_op_namespace_prefix(op_name: str):
    """
    Prefix op by namespace.
    """
    return f"_mamba_ssm_b2a7fd5::{op_name}"
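A short sketch of how `_ops.py` is consumed elsewhere in the package; `selective_scan_fwd` below is a hypothetical op name standing in for whatever ops the shared object actually registers:

# Sketch: `selective_scan_fwd` is a hypothetical op name, used only to illustrate.
full_name = add_op_namespace_prefix("selective_scan_fwd")
assert full_name == "_mamba_ssm_b2a7fd5::selective_scan_fwd"
# Registered kernels are then reachable as attributes of `ops`,
# e.g. ops.selective_scan_fwd(...).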
build/torch210-cxx11-cu126-x86_64-linux/distributed/__init__.py DELETED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/distributed/distributed_utils.py DELETED
@@ -1,144 +0,0 @@
from typing import Optional

import torch
from torch import Tensor
from torch.distributed import ProcessGroup

# `all_gather_into_tensor` and `reduce_scatter_tensor` are new placeholders for
# `_all_gather_base` and `_reduce_scatter_base`. They require the most recent
# version of PyTorch. The following 4 lines are for backward compatibility with
# older PyTorch.
if "all_gather_into_tensor" not in dir(torch.distributed):
    torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base
if "reduce_scatter_tensor" not in dir(torch.distributed):
    torch.distributed.reduce_scatter_tensor = torch.distributed._reduce_scatter_base


# Raw operation, does not support autograd, but does support async
def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
    world_size = torch.distributed.get_world_size(process_group)
    output = torch.empty(
        world_size * input_.shape[0], *input_.shape[1:], dtype=input_.dtype, device=input_.device
    )
    handle = torch.distributed.all_gather_into_tensor(
        output, input_.contiguous(), group=process_group, async_op=async_op
    )
    return output, handle


# Raw operation, does not support autograd, but does support async
def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
    world_size = torch.distributed.get_world_size(process_group)
    assert input_.shape[0] % world_size == 0
    output = torch.empty(
        input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device
    )
    handle = torch.distributed.reduce_scatter_tensor(
        output, input_.contiguous(), group=process_group, async_op=async_op
    )
    return output, handle


# Raw operation, does not support autograd, but does support async
def all_reduce_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
    input_ = input_.contiguous()
    handle = torch.distributed.all_reduce(input_, group=process_group, async_op=async_op)
    return input_, handle


class AllGatherFunc(torch.autograd.Function):
    """Gather the input from the sequence parallel region and concatenate."""

    @staticmethod
    def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
        ctx.process_group = process_group
        output, _ = all_gather_raw(input_, process_group)
        return output

    @staticmethod
    def backward(ctx, grad_output: Tensor):
        grad_input, _ = reduce_scatter_raw(grad_output, ctx.process_group)
        return grad_input, None


# Supports autograd, but does not support async
all_gather = AllGatherFunc.apply


class ReduceScatterFunc(torch.autograd.Function):
    """Reduce-scatter the input across the sequence parallel region."""

    @staticmethod
    def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
        ctx.process_group = process_group
        output, _ = reduce_scatter_raw(input_, process_group)
        return output

    @staticmethod
    def backward(ctx, grad_output: Tensor):
        grad_input, _ = all_gather_raw(grad_output, ctx.process_group)
        return grad_input, None


# Supports autograd, but does not support async
reduce_scatter = ReduceScatterFunc.apply


class AllReduceFunc(torch.autograd.Function):
    """All-reduce the input across the process group."""

    @staticmethod
    def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
        ctx.process_group = process_group
        output, _ = all_reduce_raw(input_, process_group)
        return output

    @staticmethod
    def backward(ctx, grad_output: Tensor):
        return grad_output, None


# Supports autograd, but does not support async
all_reduce = AllReduceFunc.apply


def sync_shared_params(model: torch.nn.Module, process_group: ProcessGroup):
    # We want to iterate over parameters with _shared_params=True in the same order,
    # as different ranks might have different numbers of parameters (e.g., only rank 0 has bias).
    params_shared = {
        name: p for name, p in model.named_parameters() if getattr(p, "_shared_params", False)
    }
    for _, p in sorted(params_shared.items()):
        with torch.no_grad():
            # Broadcast needs src to be the global rank, not the group rank
            torch.distributed.broadcast(
                p, src=torch.distributed.get_global_rank(process_group, 0), group=process_group
            )


# Ref: https://github.com/NVIDIA/Megatron-LM/blob/52e636888cccc41e931251c417a7181fc36de926/megatron/optimizer/optimizer.py#L256
def allreduce_sequence_parallel_grad(model: torch.nn.Module, process_group: ProcessGroup):
    # We want to iterate over parameters with _sequence_parallel=True in the same order,
    # as different ranks might have different numbers of parameters (e.g., only rank 0 has bias).
    params_seqparallel = {
        name: p for name, p in model.named_parameters() if getattr(p, "_sequence_parallel", False)
    }
    grads = [p.grad for _, p in sorted(params_seqparallel.items())]
    if grads:
        with torch.no_grad():
            coalesced = torch._utils._flatten_dense_tensors(grads)
            torch.distributed.all_reduce(coalesced, group=process_group)
            for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)


def get_dim_for_local_rank(dim: int, world_size: int, local_rank: int, multiple_of: int = 1) -> int:
    """Get the dim for the local rank derived from splitting dim on world_size processes.

    The split may not be even across the world_size processes.
    """
    multiple = dim // multiple_of
    div = multiple // world_size
    mod = multiple % world_size
    local_multiple = div + int(local_rank < mod)
    return local_multiple * multiple_of
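The uneven-split rule in `get_dim_for_local_rank` gives each of the first `mod` ranks one extra chunk of size `multiple_of`. A self-contained sketch of that arithmetic (no process group needed):

# Sketch: dim=10 split over 3 ranks in units of multiple_of=2 gives 5 chunks,
# so ranks receive 2, 2, 1 chunks, i.e. local dims 4, 4, 2.
sizes = [get_dim_for_local_rank(10, world_size=3, local_rank=r, multiple_of=2) for r in range(3)]
assert sizes == [4, 4, 2] and sum(sizes) == 10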
build/torch210-cxx11-cu126-x86_64-linux/distributed/tensor_parallel.py DELETED
@@ -1,296 +0,0 @@
# Copyright (c) 2024, Tri Dao.
# The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.distributed import ProcessGroup
from ..utils.torch import custom_bwd, custom_fwd

from einops import rearrange

from ..distributed.distributed_utils import (
    all_gather_raw,
    all_reduce,
    all_reduce_raw,
    reduce_scatter,
    reduce_scatter_raw,
)


class ParallelLinearFunc(torch.autograd.Function):
    @staticmethod
    @custom_fwd
    def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
        """
        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
        """
        ctx.compute_weight_gradient = weight.requires_grad
        ctx.process_group = process_group
        ctx.sequence_parallel = sequence_parallel

        if torch.is_autocast_enabled():
            x = x.to(dtype=torch.get_autocast_gpu_dtype())
        x = x.contiguous()
        if process_group is not None and sequence_parallel:
            # We want to kick off the all_gather early, before weight dtype conversion
            total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
        else:
            total_x = x

        if torch.is_autocast_enabled():
            weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
            bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None
        weight = weight.contiguous()
        if process_group is not None and sequence_parallel:
            handle_x.wait()
        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
        batch_dim = batch_shape.numel()
        # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
        output = F.linear(total_x, weight, bias)
        if ctx.compute_weight_gradient:
            ctx.save_for_backward(x, weight)
        else:
            ctx.save_for_backward(weight)
        return output

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output):
        grad_output = grad_output.contiguous()
        process_group = ctx.process_group
        sequence_parallel = ctx.sequence_parallel
        if ctx.compute_weight_gradient:
            x, weight = ctx.saved_tensors
            if process_group is not None and sequence_parallel:
                total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
            else:
                total_x = x
        else:
            (weight,) = ctx.saved_tensors
            total_x = None
        batch_shape = grad_output.shape[:-1]
        batch_dim = batch_shape.numel()
        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
        if ctx.needs_input_grad[0]:
            grad_input = F.linear(grad_output, weight.t())
            grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
            if process_group is not None:
                reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
                grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
        else:
            grad_input = None
        if ctx.needs_input_grad[1]:
            assert ctx.compute_weight_gradient
            if process_group is not None and sequence_parallel:
                handle_x.wait()
            grad_weight = torch.einsum(
                "bo,bi->oi", grad_output, total_x.reshape(batch_dim, total_x.shape[-1])
            )
        else:
            grad_weight = None
        grad_bias = grad_output.sum(dim=0) if ctx.needs_input_grad[2] else None
        if process_group is not None and ctx.needs_input_grad[0]:
            handle_grad_input.wait()
        return grad_input, grad_weight, grad_bias, None, None


def parallel_linear_func(
    x: Tensor,
    weight: Tensor,
    bias: Optional[Tensor] = None,
    process_group: Optional[ProcessGroup] = None,
    sequence_parallel: bool = True,
):
    return ParallelLinearFunc.apply(x, weight, bias, process_group, sequence_parallel)


class ColumnParallelLinear(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        process_group: ProcessGroup,
        bias: bool = True,
        sequence_parallel=True,
        multiple_of=1,
        device=None,
        dtype=None,
    ) -> None:
        world_size = torch.distributed.get_world_size(process_group)
        if out_features % multiple_of:
            raise ValueError(f"out_features ({out_features}) must be a multiple of {multiple_of}")
        multiple = out_features // multiple_of
        # We want to split @multiple across world_size, but it could be an uneven split
        div = multiple // world_size
        mod = multiple % world_size
        # The first @mod ranks get @div + 1 copies, the rest get @div copies
        local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
        super().__init__(
            in_features, local_multiple * multiple_of, bias=bias, device=device, dtype=dtype
        )
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel

    def forward(self, x):
        # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
        # we do an all_gather of x before doing the matmul.
        # If not, then the input is already gathered.
        return parallel_linear_func(
            x,
            self.weight,
            self.bias,
            process_group=self.process_group,
            sequence_parallel=self.sequence_parallel,
        )


class RowParallelLinear(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        process_group: ProcessGroup,
        bias: bool = True,
        sequence_parallel=True,
        multiple_of=1,
        device=None,
        dtype=None,
    ) -> None:
        world_size = torch.distributed.get_world_size(process_group)
        rank = torch.distributed.get_rank(process_group)
        if in_features % multiple_of:
            raise ValueError(f"in_features ({in_features}) must be a multiple of {multiple_of}")
        multiple = in_features // multiple_of
        # We want to split @multiple across world_size, but it could be an uneven split
        div = multiple // world_size
        mod = multiple % world_size
        # The first @mod ranks get @div + 1 copies, the rest get @div copies
        local_multiple = div + int(rank < mod)
        # Only rank 0 will have bias
        super().__init__(
            local_multiple * multiple_of,
            out_features,
            bias=bias and rank == 0,
            device=device,
            dtype=dtype,
        )
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel

    def forward(self, x):
        """
        We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
        a reduce_scatter of the result.
        """
        out = parallel_linear_func(x, self.weight, self.bias)
        reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
        return reduce_fn(out, self.process_group)


class VocabParallelEmbedding(nn.Embedding):
    def __init__(self, num_embeddings, *args, process_group=None, padding_idx=None, **kwargs):
        self.process_group = process_group
        if process_group is not None:
            world_size = torch.distributed.get_world_size(process_group)
            if num_embeddings % world_size != 0:
                raise ValueError(
                    f"num_embeddings ({num_embeddings}) must be divisible by "
                    f"world_size ({world_size})"
                )
            if world_size > 1 and padding_idx is not None:
                raise RuntimeError("ParallelEmbedding does not support padding_idx")
        else:
            world_size = 1
        super().__init__(num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs)

    def forward(self, input: Tensor) -> Tensor:
        if self.process_group is None:
            return super().forward(input)
        else:
            rank = torch.distributed.get_rank(self.process_group)
            vocab_size = self.num_embeddings
            vocab_start_index, vocab_end_index = rank * vocab_size, (rank + 1) * vocab_size
            # Create a mask of valid vocab ids (1 means it needs to be masked).
            input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
            input = input - vocab_start_index
            input[input_ids_mask] = 0
            embeddings = super().forward(input)
            embeddings[input_ids_mask] = 0.0
            return embeddings


class ColumnParallelEmbedding(nn.Embedding):
    def __init__(self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs):
        self.process_group = process_group
        if process_group is not None:
            world_size = torch.distributed.get_world_size(process_group)
            if embedding_dim % world_size != 0:
                raise ValueError(
                    f"embedding_dim ({embedding_dim}) must be divisible by "
                    f"world_size ({world_size})"
                )
        else:
            world_size = 1
        super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs)


class ParallelEmbeddings(nn.Module):
    def __init__(
        self,
        embed_dim,
        vocab_size,
        max_position_embeddings,
        process_group,
        padding_idx=None,
        sequence_parallel=True,
        device=None,
        dtype=None,
    ):
        """
        If max_position_embeddings <= 0, there's no position embeddings
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel
        self.word_embeddings = VocabParallelEmbedding(
            vocab_size,
            embed_dim,
            padding_idx=padding_idx,
            process_group=process_group,
            **factory_kwargs,
        )
        self.max_position_embeddings = max_position_embeddings
        if self.max_position_embeddings > 0:
            self.position_embeddings = ColumnParallelEmbedding(
                max_position_embeddings, embed_dim, process_group=process_group, **factory_kwargs
            )

    def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False):
        """
        input_ids: (batch, seqlen)
        position_ids: (batch, seqlen)
        """
        batch_size, seqlen = input_ids.shape
        world_size = torch.distributed.get_world_size(self.process_group)
        embeddings = self.word_embeddings(input_ids)
        if self.max_position_embeddings > 0:
            if position_ids is None:
                position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
            position_embeddings = self.position_embeddings(position_ids)
            if world_size <= 1:
                embeddings = embeddings + position_embeddings
            else:
                partition_dim = self.position_embeddings.embedding_dim
                rank = torch.distributed.get_rank(self.process_group)
                embeddings[
                    ..., rank * partition_dim : (rank + 1) * partition_dim
                ] += position_embeddings
        if combine_batch_seqlen_dim:
            embeddings = rearrange(embeddings, "b s d -> (b s) d")
        reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
        return embeddings if world_size <= 1 else reduce_fn(embeddings, self.process_group)
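In a tensor-parallel MLP, the column-parallel layer's local output dimension must equal the row-parallel layer's local input dimension. A shape-only sketch for the even-split case, using plain `nn.Linear` stand-ins rather than a real process group:

# Sketch with plain nn.Linear; real usage passes a torch.distributed ProcessGroup
# to ColumnParallelLinear / RowParallelLinear instead.
import torch.nn as nn

d_model, d_hidden, world_size = 1024, 4096, 4
local_hidden = d_hidden // world_size          # each rank's shard of the hidden dim

fc1_local = nn.Linear(d_model, local_hidden)   # column-parallel shard
fc2_local = nn.Linear(local_hidden, d_model)   # row-parallel shard
# fc2's partial outputs are summed across ranks via all_reduce, or
# reduce_scatter'd along the sequence dimension when sequence_parallel=True.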
build/torch210-cxx11-cu126-x86_64-linux/mamba_ssm/__init__.py DELETED
@@ -1,26 +0,0 @@
import ctypes
import sys

import importlib.util
from pathlib import Path
from types import ModuleType


def _import_from_path(file_path: Path) -> ModuleType:
    # We cannot use the module name as-is: after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path, using the hex-encoded hash of the path to keep it
    # unique.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module


globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
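This shim re-exports the package's top-level `__init__.py` under a path-dependent module name, so multiple builds can coexist in `sys.modules`. A direct-call sketch; the path below is hypothetical:

# Sketch: /tmp/example_pkg/__init__.py is a hypothetical path, for illustration only.
mod = _import_from_path(Path("/tmp/example_pkg/__init__.py"))
print(mod.__name__)  # a hex digest derived from the absolute path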
build/torch210-cxx11-cu126-x86_64-linux/metadata.json DELETED
@@ -1 +0,0 @@
{"python-depends":[]}
build/torch210-cxx11-cu126-x86_64-linux/models/__init__.py DELETED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/models/config_mamba.py DELETED
@@ -1,18 +0,0 @@
from dataclasses import dataclass, field


@dataclass
class MambaConfig:
    d_model: int = 2560
    d_intermediate: int = 0
    n_layer: int = 64
    vocab_size: int = 50277
    ssm_cfg: dict = field(default_factory=dict)
    attn_layer_idx: list = field(default_factory=list)
    attn_cfg: dict = field(default_factory=dict)
    rms_norm: bool = True
    residual_in_fp32: bool = True
    fused_add_norm: bool = True
    pad_vocab_size_multiple: int = 8
    tie_embeddings: bool = True
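Because `MambaConfig` is a plain dataclass, configurations round-trip losslessly through dicts, which is how `MambaLMHeadModel.from_pretrained` below consumes `config.json`. A small sketch:

# Sketch: dataclass configs round-trip through plain dicts.
from dataclasses import asdict

cfg = MambaConfig(d_model=768, n_layer=24, vocab_size=50277)
restored = MambaConfig(**asdict(cfg))
assert restored == cfg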
build/torch210-cxx11-cu126-x86_64-linux/models/mixer_seq_simple.py DELETED
@@ -1,309 +0,0 @@
# Copyright (c) 2023, Albert Gu, Tri Dao.

import math
from functools import partial
import json
import os
import copy

from collections import namedtuple

import torch
import torch.nn as nn

from .config_mamba import MambaConfig
from ..modules.mamba_simple import Mamba
from ..modules.mamba2 import Mamba2
from ..modules.mha import MHA
from ..modules.mlp import GatedMLP
from ..modules.block import Block
from ..utils.generation import GenerationMixin
from ..utils.hf import load_config_hf, load_state_dict_hf

try:
    from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
except ImportError:
    RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None


def create_block(
    d_model,
    d_intermediate,
    ssm_cfg=None,
    attn_layer_idx=None,
    attn_cfg=None,
    norm_epsilon=1e-5,
    rms_norm=False,
    residual_in_fp32=False,
    fused_add_norm=False,
    layer_idx=None,
    device=None,
    dtype=None,
):
    if ssm_cfg is None:
        ssm_cfg = {}
    if attn_layer_idx is None:
        attn_layer_idx = []
    if attn_cfg is None:
        attn_cfg = {}
    factory_kwargs = {"device": device, "dtype": dtype}
    if layer_idx not in attn_layer_idx:
        # Create a copy of the config to modify
        ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
        ssm_layer = ssm_cfg.pop("layer", "Mamba1")
        if ssm_layer not in ["Mamba1", "Mamba2"]:
            raise ValueError(f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2")
        mixer_cls = partial(
            Mamba2 if ssm_layer == "Mamba2" else Mamba,
            layer_idx=layer_idx,
            **ssm_cfg,
            **factory_kwargs,
        )
    else:
        mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
    norm_cls = partial(
        nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
    )
    if d_intermediate == 0:
        mlp_cls = nn.Identity
    else:
        mlp_cls = partial(
            GatedMLP, hidden_features=d_intermediate, out_features=d_model, **factory_kwargs
        )
    block = Block(
        d_model,
        mixer_cls,
        mlp_cls,
        norm_cls=norm_cls,
        fused_add_norm=fused_add_norm,
        residual_in_fp32=residual_in_fp32,
    )
    block.layer_idx = layer_idx
    return block


# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
def _init_weights(
    module,
    n_layer,
    initializer_range=0.02,  # Now only used for embedding layer.
    rescale_prenorm_residual=True,
    n_residuals_per_layer=1,  # Change to 2 if we have MLP
):
    if isinstance(module, nn.Linear):
        if module.bias is not None:
            if not getattr(module.bias, "_no_reinit", False):
                nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, std=initializer_range)

    if rescale_prenorm_residual:
        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
        # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
        # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
        # > -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in module.named_parameters():
            if name in ["out_proj.weight", "fc2.weight"]:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
                # We need to reinit p since this code could be called multiple times
                # Having just p *= scale would repeatedly scale it down
                nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                with torch.no_grad():
                    p /= math.sqrt(n_residuals_per_layer * n_layer)


class MixerModel(nn.Module):
    def __init__(
        self,
        d_model: int,
        n_layer: int,
        d_intermediate: int,
        vocab_size: int,
        ssm_cfg=None,
        attn_layer_idx=None,
        attn_cfg=None,
        norm_epsilon: float = 1e-5,
        rms_norm: bool = False,
        initializer_cfg=None,
        fused_add_norm=False,
        residual_in_fp32=False,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.residual_in_fp32 = residual_in_fp32

        self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)

        # We change the order of residual and layer norm:
        # Instead of LN -> Attn / MLP -> Add, we do:
        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
        # the main branch (output of MLP / Mixer). The model definition is unchanged.
        # This is for performance reason: we can fuse add + layer_norm.
        self.fused_add_norm = fused_add_norm
        if self.fused_add_norm:
            if layer_norm_fn is None or rms_norm_fn is None:
                raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")

        self.layers = nn.ModuleList(
            [
                create_block(
                    d_model,
                    d_intermediate=d_intermediate,
                    ssm_cfg=ssm_cfg,
                    attn_layer_idx=attn_layer_idx,
                    attn_cfg=attn_cfg,
                    norm_epsilon=norm_epsilon,
                    rms_norm=rms_norm,
                    residual_in_fp32=residual_in_fp32,
                    fused_add_norm=fused_add_norm,
                    layer_idx=i,
                    **factory_kwargs,
                )
                for i in range(n_layer)
            ]
        )

        self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
            d_model, eps=norm_epsilon, **factory_kwargs
        )

        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
                n_residuals_per_layer=1 if d_intermediate == 0 else 2,  # 2 if we have MLP
            )
        )

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return {
            i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
            for i, layer in enumerate(self.layers)
        }

    def forward(self, input_ids, inference_params=None, **mixer_kwargs):
        hidden_states = self.embedding(input_ids)
        residual = None
        for layer in self.layers:
            hidden_states, residual = layer(
                hidden_states, residual, inference_params=inference_params, **mixer_kwargs
            )
        if not self.fused_add_norm:
            residual = (hidden_states + residual) if residual is not None else hidden_states
            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            # Set prenorm=False here since we don't need the residual
            hidden_states = layer_norm_fn(
                hidden_states,
                self.norm_f.weight,
                self.norm_f.bias,
                eps=self.norm_f.eps,
                residual=residual,
                prenorm=False,
                residual_in_fp32=self.residual_in_fp32,
                is_rms_norm=isinstance(self.norm_f, RMSNorm),
            )
        return hidden_states


class MambaLMHeadModel(nn.Module, GenerationMixin):

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
    ) -> None:
        self.config = config
        d_model = config.d_model
        n_layer = config.n_layer
        d_intermediate = config.d_intermediate
        vocab_size = config.vocab_size
        ssm_cfg = config.ssm_cfg
        attn_layer_idx = config.attn_layer_idx
        attn_cfg = config.attn_cfg
        rms_norm = config.rms_norm
        residual_in_fp32 = config.residual_in_fp32
        fused_add_norm = config.fused_add_norm
        pad_vocab_size_multiple = config.pad_vocab_size_multiple
        factory_kwargs = {"device": device, "dtype": dtype}

        super().__init__()
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple)
        self.backbone = MixerModel(
            d_model=d_model,
            n_layer=n_layer,
            d_intermediate=d_intermediate,
            vocab_size=vocab_size,
            ssm_cfg=ssm_cfg,
            attn_layer_idx=attn_layer_idx,
            attn_cfg=attn_cfg,
            rms_norm=rms_norm,
            initializer_cfg=initializer_cfg,
            fused_add_norm=fused_add_norm,
            residual_in_fp32=residual_in_fp32,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        if self.config.tie_embeddings:
            self.lm_head.weight = self.backbone.embedding.weight

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.backbone.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)

    def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0, **mixer_kwargs):
        """
        "position_ids" is just to be compatible with Transformer generation. We don't use it.
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """
        hidden_states = self.backbone(input_ids, inference_params=inference_params, **mixer_kwargs)
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)
        CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
        return CausalLMOutput(logits=lm_logits)

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)
        model = cls(config, device=device, dtype=dtype, **kwargs)
        model.load_state_dict(load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype))
        return model

    def save_pretrained(self, save_directory):
        """
        Minimal implementation of save_pretrained for MambaLMHeadModel.
        Save the model and its configuration file to a directory.
        """
        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.__dict__, f, indent=4)
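A minimal end-to-end sketch of the `from_pretrained` / `forward` / `save_pretrained` API defined above; it assumes a CUDA device and the compiled kernels, and uses `state-spaces/mamba-130m` purely as an example checkpoint:

# Sketch: assumes CUDA and the compiled kernels are available.
import torch

model = MambaLMHeadModel.from_pretrained("state-spaces/mamba-130m", device="cuda", dtype=torch.float16)
input_ids = torch.randint(0, model.config.vocab_size, (1, 32), device="cuda")
logits = model(input_ids, num_last_tokens=1).logits  # (1, 1, padded_vocab_size)
model.save_pretrained("/tmp/mamba-checkpoint")       # writes pytorch_model.bin + config.json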
build/torch210-cxx11-cu126-x86_64-linux/modules/__init__.py DELETED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/modules/block.py DELETED
@@ -1,107 +0,0 @@
# Copyright (c) 2024, Tri Dao, Albert Gu.
from typing import Optional

import torch
from torch import nn, Tensor

from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn


class Block(nn.Module):
    def __init__(
        self,
        dim,
        mixer_cls,
        mlp_cls,
        norm_cls=nn.LayerNorm,
        fused_add_norm=False,
        residual_in_fp32=False,
    ):
        """
        Simple block wrapping a mixer class with LayerNorm/RMSNorm and a residual connection.

        This Block has a slightly different structure compared to a regular
        prenorm Transformer block.
        The standard block is: LN -> MHA/MLP -> Add.
        [Ref: https://arxiv.org/abs/2002.04745]
        Here we have: Add -> LN -> Mixer, returning both
        the hidden_states (output of the mixer) and the residual.
        This is purely for performance reasons, as we can fuse add and LayerNorm.
        The residual needs to be provided (except for the very first block).
        """
        super().__init__()
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.norm = norm_cls(dim)
        self.mixer = mixer_cls(dim)
        if mlp_cls is not nn.Identity:
            self.norm2 = norm_cls(dim)
            self.mlp = mlp_cls(dim)
        else:
            self.mlp = None
        if self.fused_add_norm:
            assert RMSNorm is not None, "RMSNorm import fails"
            assert isinstance(
                self.norm, (nn.LayerNorm, RMSNorm)
            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"

    def forward(
        self,
        hidden_states: Tensor,
        residual: Optional[Tensor] = None,
        inference_params=None,
        **mixer_kwargs,
    ):
        r"""Pass the input through the encoder layer.

        Args:
            hidden_states: the sequence to the encoder layer (required).
            residual: hidden_states = Mixer(LN(residual))
        """
        if not self.fused_add_norm:
            residual = (hidden_states + residual) if residual is not None else hidden_states
            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
            if self.residual_in_fp32:
                residual = residual.to(torch.float32)
        else:
            hidden_states, residual = layer_norm_fn(
                hidden_states,
                self.norm.weight,
                self.norm.bias,
                residual=residual,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm.eps,
                is_rms_norm=isinstance(self.norm, RMSNorm),
            )
        hidden_states = self.mixer(hidden_states, inference_params=inference_params, **mixer_kwargs)

        if self.mlp is not None:
            if not self.fused_add_norm:
                residual = hidden_states + residual
                hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
                if self.residual_in_fp32:
                    residual = residual.to(torch.float32)
            else:
                hidden_states, residual = layer_norm_fn(
                    hidden_states,
                    self.norm2.weight,
                    self.norm2.bias,
                    residual=residual,
                    prenorm=True,
                    residual_in_fp32=self.residual_in_fp32,
                    eps=self.norm2.eps,
                    is_rms_norm=isinstance(self.norm2, RMSNorm),
                )
            hidden_states = self.mlp(hidden_states)

        return hidden_states, residual

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
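The unfused branch of `Block.forward` follows a simple reference dataflow; the sketch below uses a plain `nn.Linear` as a stand-in mixer to make the Add -> LN -> Mixer ordering concrete (the first block in a stack is called with residual=None):

# Sketch: nn.Linear stands in for the Mamba / MHA mixer.
import torch
import torch.nn as nn

norm, mixer = nn.LayerNorm(16), nn.Linear(16, 16)
hidden, residual = torch.randn(2, 8, 16), None

residual = hidden + residual if residual is not None else hidden  # Add
hidden = mixer(norm(residual))                                    # LN -> Mixer
# The next block receives both (hidden, residual).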
build/torch210-cxx11-cu126-x86_64-linux/modules/mamba2.py DELETED
@@ -1,502 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- import math
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
-
9
- from einops import rearrange, repeat
10
-
11
- try:
12
- from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
13
- except ImportError:
14
- causal_conv1d_fn, causal_conv1d_update = None, None
15
-
16
- try:
17
- from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states
18
- except ImportError:
19
- causal_conv1d_varlen_states = None
20
-
21
- try:
22
- from ..ops.triton.selective_state_update import selective_state_update
23
- except ImportError:
24
- selective_state_update = None
25
-
26
- from ..ops.triton.layernorm_gated import RMSNorm as RMSNormGated
27
-
28
- from ..distributed.tensor_parallel import ColumnParallelLinear, RowParallelLinear
29
- from ..distributed.distributed_utils import all_reduce, reduce_scatter
30
-
31
- from ..ops.triton.ssd_combined import mamba_chunk_scan_combined
32
- from ..ops.triton.ssd_combined import mamba_split_conv1d_scan_combined
33
-
34
- from huggingface_hub import PyTorchModelHubMixin
35
-
36
-
37
- class Mamba2(nn.Module, PyTorchModelHubMixin):
38
- def __init__(
39
- self,
40
- d_model,
41
- d_state=128,
42
- d_conv=4,
43
- conv_init=None,
44
- expand=2,
45
- headdim=64,
46
- d_ssm=None, # If not None, we only apply SSM on this many dimensions, the rest uses gated MLP
47
- ngroups=1,
48
- A_init_range=(1, 16),
49
- D_has_hdim=False,
50
- rmsnorm=True,
51
- norm_before_gate=False,
52
- dt_min=0.001,
53
- dt_max=0.1,
54
- dt_init_floor=1e-4,
55
- dt_limit=(0.0, float("inf")),
56
- bias=False,
57
- conv_bias=True,
58
- # Fused kernel and sharding options
59
- chunk_size=256,
60
- use_mem_eff_path=True,
61
- layer_idx=None, # Absorb kwarg for general module
62
- process_group=None,
63
- sequence_parallel=True,
64
- device=None,
65
- dtype=None,
66
- ):
67
- factory_kwargs = {"device": device, "dtype": dtype}
68
- super().__init__()
69
- self.d_model = d_model
70
- self.d_state = d_state
71
- self.d_conv = d_conv
72
- self.conv_init = conv_init
73
- self.expand = expand
74
- self.process_group = process_group
75
- self.sequence_parallel = sequence_parallel
76
- self.world_size = 1 if process_group is None else process_group.size()
77
- self.local_rank = 0 if process_group is None else process_group.rank()
78
- self.d_inner = (self.expand * self.d_model) // self.world_size
79
- assert self.d_inner * self.world_size == self.expand * self.d_model
80
- self.headdim = headdim
81
- self.d_ssm = self.d_inner if d_ssm is None else d_ssm // self.world_size
82
- assert ngroups % self.world_size == 0
83
- self.ngroups = ngroups // self.world_size
84
- assert self.d_ssm % self.headdim == 0
85
- self.nheads = self.d_ssm // self.headdim
86
- self.D_has_hdim = D_has_hdim
87
- self.rmsnorm = rmsnorm
88
- self.norm_before_gate = norm_before_gate
89
- self.dt_limit = dt_limit
90
- self.activation = "silu"
91
- self.chunk_size = chunk_size
92
- self.use_mem_eff_path = use_mem_eff_path
93
- self.layer_idx = layer_idx
94
-
95
- # Order: [z, x, B, C, dt]
96
- d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
97
- if self.process_group is None:
98
- self.in_proj = nn.Linear(
99
- self.d_model, d_in_proj, bias=bias, **factory_kwargs
100
- )
101
- else:
102
- self.in_proj = ColumnParallelLinear(
103
- self.d_model,
104
- d_in_proj * self.world_size,
105
- bias=bias,
106
- process_group=self.process_group,
107
- sequence_parallel=self.sequence_parallel,
108
- **factory_kwargs,
109
- )
110
-
111
- conv_dim = self.d_ssm + 2 * self.ngroups * self.d_state
112
- self.conv1d = nn.Conv1d(
113
- in_channels=conv_dim,
114
- out_channels=conv_dim,
115
- bias=conv_bias,
116
- kernel_size=d_conv,
117
- groups=conv_dim,
118
- padding=d_conv - 1,
119
- **factory_kwargs,
120
- )
121
- if self.conv_init is not None:
122
- nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
123
-
124
- self.act = nn.SiLU()
125
-
126
- # Initialize log dt bias
127
- dt = torch.exp(
128
- torch.rand(self.nheads, **factory_kwargs)
129
- * (math.log(dt_max) - math.log(dt_min))
130
- + math.log(dt_min)
131
- )
132
- dt = torch.clamp(dt, min=dt_init_floor)
133
- # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
134
- inv_dt = dt + torch.log(-torch.expm1(-dt))
135
- self.dt_bias = nn.Parameter(inv_dt)
136
- # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
137
- # name.endswith("bias") in param_grouping.py
138
- self.dt_bias._no_weight_decay = True
139
-
140
- assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
141
- A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(
142
- *A_init_range
143
- )
144
- A_log = torch.log(A).to(dtype=dtype)
145
- self.A_log = nn.Parameter(A_log)
146
- self.A_log._no_weight_decay = True
147
-
148
- # D "skip" parameter
149
- self.D = nn.Parameter(
150
- torch.ones(self.d_ssm if self.D_has_hdim else self.nheads, device=device)
151
- )
152
- self.D._no_weight_decay = True
153
-
154
- if self.rmsnorm:
155
- assert RMSNormGated is not None
156
- self.norm = RMSNormGated(
157
- self.d_ssm,
158
- eps=1e-5,
159
- norm_before_gate=self.norm_before_gate,
160
- group_size=self.d_ssm // ngroups,
161
- **factory_kwargs,
162
- )
163
-
164
- if self.process_group is None:
165
- self.out_proj = nn.Linear(
166
- self.d_inner, self.d_model, bias=bias, **factory_kwargs
167
- )
168
- else:
169
- self.out_proj = RowParallelLinear(
170
- self.d_inner * self.world_size,
171
- self.d_model,
172
- bias=bias,
173
- process_group=self.process_group,
174
- sequence_parallel=self.sequence_parallel,
175
- **factory_kwargs,
176
- )
177
-
178
- def forward(
179
- self, u, seqlen=None, seq_idx=None, cu_seqlens=None, inference_params=None
180
- ):
181
- """
182
- u: (batch, seqlen, hidden_dim) if seqlen=None.
183
- If seqlen is not None, u is (batch * seqlen, hidden_dim). This is so that when we
184
- split u during sequence parallel, we split the batch * seqlen dimension
185
- (in case batch is small).
186
- Returns: same shape as u
187
- """
188
- seqlen_og = seqlen
189
- if seqlen is None:
190
- batch, seqlen, dim = u.shape
191
- else:
192
- batch_seqlen, dim = u.shape
193
- batch = batch_seqlen // seqlen
194
-
195
- conv_state, ssm_state = None, None
196
- if inference_params is not None:
197
- inference_batch = (
198
- cu_seqlens.shape[0] - 1 if cu_seqlens is not None else batch
199
- )
200
- conv_state, ssm_state = self._get_states_from_cache(
201
- inference_params, inference_batch
202
- )
203
- if inference_params.seqlen_offset > 0:
204
- # The states are updated inplace
205
- out, _, _ = self.step(u, conv_state, ssm_state)
206
- return out
207
-
208
- zxbcdt = self.in_proj(u) # (B, L, d_in_proj) or (B * L, d_in_proj)
209
- if seqlen_og is not None:
210
- zxbcdt = rearrange(zxbcdt, "(b l) d -> b l d", l=seqlen)
211
- # If the model is loaded in fp16, without the .float() here, A might be -inf
212
- A = -torch.exp(self.A_log.float()) # (nheads) or (d_inner, d_state)
213
- dt_limit_kwargs = (
214
- {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
215
- )
216
- if self.use_mem_eff_path and inference_params is None:
217
- out = mamba_split_conv1d_scan_combined(
218
- zxbcdt,
219
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
220
- self.conv1d.bias,
221
- self.dt_bias,
222
- A,
223
- D=(
224
- rearrange(self.D, "(h p) -> h p", p=self.headdim)
225
- if self.D_has_hdim
226
- else self.D
227
- ),
228
- chunk_size=self.chunk_size,
229
- seq_idx=seq_idx,
230
- activation=self.activation,
231
- rmsnorm_weight=self.norm.weight if self.rmsnorm else None,
232
- rmsnorm_eps=self.norm.eps if self.rmsnorm else 1e-6,
233
- outproj_weight=self.out_proj.weight,
234
- outproj_bias=self.out_proj.bias,
235
- headdim=None if self.D_has_hdim else self.headdim,
236
- ngroups=self.ngroups,
237
- norm_before_gate=self.norm_before_gate,
238
- **dt_limit_kwargs,
239
- )
240
- if seqlen_og is not None:
241
- out = rearrange(out, "b l d -> (b l) d")
242
- if self.process_group is not None:
243
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
244
- out = reduce_fn(out, self.process_group)
245
- else:
246
- d_mlp = (
247
- zxbcdt.shape[-1]
248
- - 2 * self.d_ssm
249
- - 2 * self.ngroups * self.d_state
250
- - self.nheads
251
- ) // 2
252
- z0, x0, z, xBC, dt = torch.split(
253
- zxbcdt,
254
- [
255
- d_mlp,
256
- d_mlp,
257
- self.d_ssm,
258
- self.d_ssm + 2 * self.ngroups * self.d_state,
259
- self.nheads,
260
- ],
261
- dim=-1,
262
- )
263
- if conv_state is not None:
264
- if cu_seqlens is None:
265
- # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
266
- # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
267
- xBC_t = rearrange(xBC, "b l d -> b d l")
268
- conv_state.copy_(
269
- F.pad(xBC_t, (self.d_conv - xBC_t.shape[-1], 0))
270
- ) # Update state (B D W)
271
- else:
272
- assert (
273
- causal_conv1d_varlen_states is not None
274
- ), "varlen inference requires causal_conv1d package"
275
- assert (
276
- batch == 1
277
- ), "varlen inference only supports batch dimension 1"
278
- conv_varlen_states = causal_conv1d_varlen_states(
279
- xBC.squeeze(0), cu_seqlens, state_len=conv_state.shape[-1]
280
- )
281
- conv_state.copy_(conv_varlen_states)
282
- assert self.activation in ["silu", "swish"]
283
- if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
284
- assert (
285
- seq_idx is None
286
- ), "varlen conv1d requires the causal_conv1d package"
287
- xBC = self.act(
288
- self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)[
289
- :, : -(self.d_conv - 1)
290
- ]
291
- ) # (B, L, self.d_ssm + 2 * ngroups * d_state)
292
- else:
293
- xBC = causal_conv1d_fn(
294
- xBC.transpose(1, 2),
295
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
296
- bias=self.conv1d.bias,
297
- activation=self.activation,
298
- seq_idx=seq_idx,
299
- ).transpose(1, 2)
300
- x, B, C = torch.split(
301
- xBC,
302
- [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state],
303
- dim=-1,
304
- )
305
- y = mamba_chunk_scan_combined(
306
- rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
307
- dt,
308
- A,
309
- rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
310
- rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
311
- chunk_size=self.chunk_size,
312
- D=(
313
- rearrange(self.D, "(h p) -> h p", p=self.headdim)
314
- if self.D_has_hdim
315
- else self.D
316
- ),
317
- z=(
318
- rearrange(z, "b l (h p) -> b l h p", p=self.headdim)
319
- if not self.rmsnorm
320
- else None
321
- ),
322
- dt_bias=self.dt_bias,
323
- dt_softplus=True,
324
- seq_idx=seq_idx,
325
- cu_seqlens=cu_seqlens,
326
- **dt_limit_kwargs,
327
- return_final_states=ssm_state is not None,
328
- return_varlen_states=cu_seqlens is not None
329
- and inference_params is not None,
330
- )
331
- if ssm_state is not None:
332
- y, last_state, *rest = y
333
- if cu_seqlens is None:
334
- ssm_state.copy_(last_state)
335
- else:
336
- varlen_states = rest[0]
337
- ssm_state.copy_(varlen_states)
338
- y = rearrange(y, "b l h p -> b l (h p)")
339
- if self.rmsnorm:
340
- y = self.norm(y, z)
341
- if d_mlp > 0:
342
- y = torch.cat([F.silu(z0) * x0, y], dim=-1)
343
- if seqlen_og is not None:
344
- y = rearrange(y, "b l d -> (b l) d")
345
- out = self.out_proj(y)
346
- return out
347
-
348
- def step(self, hidden_states, conv_state, ssm_state):
349
- dtype = hidden_states.dtype
350
- assert (
351
- hidden_states.shape[1] == 1
352
- ), "Only support decoding with 1 token at a time for now"
353
- zxbcdt = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
354
- d_mlp = (
355
- zxbcdt.shape[-1]
356
- - 2 * self.d_ssm
357
- - 2 * self.ngroups * self.d_state
358
- - self.nheads
359
- ) // 2
360
- z0, x0, z, xBC, dt = torch.split(
361
- zxbcdt,
362
- [
363
- d_mlp,
364
- d_mlp,
365
- self.d_ssm,
366
- self.d_ssm + 2 * self.ngroups * self.d_state,
367
- self.nheads,
368
- ],
369
- dim=-1,
370
- )
371
-
372
- # Conv step
373
- if causal_conv1d_update is None:
374
- conv_state.copy_(
375
- torch.roll(conv_state, shifts=-1, dims=-1)
376
- ) # Update state (B D W)
377
- conv_state[:, :, -1] = xBC
378
- xBC = torch.sum(
379
- conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1
380
- ) # (B D)
381
- if self.conv1d.bias is not None:
382
- xBC = xBC + self.conv1d.bias
383
- xBC = self.act(xBC).to(dtype=dtype)
384
- else:
385
- xBC = causal_conv1d_update(
386
- xBC,
387
- conv_state,
388
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
389
- self.conv1d.bias,
390
- self.activation,
391
- )
392
-
393
- x, B, C = torch.split(
394
- xBC,
395
- [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state],
396
- dim=-1,
397
- )
398
- A = -torch.exp(self.A_log.float()) # (nheads,)
399
-
400
- # SSM step
401
- if selective_state_update is None:
402
- assert (
403
- self.ngroups == 1
404
- ), "Only support ngroups=1 for this inference code path"
405
- # Discretize A and B
406
- dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads)
407
- dA = torch.exp(dt * A) # (batch, nheads)
408
- x = rearrange(x, "b (h p) -> b h p", p=self.headdim)
409
- dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)
410
- ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx)
411
- y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C)
412
- y = y + rearrange(self.D.to(dtype), "h -> h 1") * x
413
- y = rearrange(y, "b h p -> b (h p)")
414
- if not self.rmsnorm:
415
- y = y * self.act(z) # (B D)
416
- else:
417
- A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(
418
- dtype=torch.float32
419
- )
420
- dt = repeat(dt, "b h -> b h p", p=self.headdim)
421
- dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim)
422
- D = repeat(self.D, "h -> h p", p=self.headdim)
423
- B = rearrange(B, "b (g n) -> b g n", g=self.ngroups)
424
- C = rearrange(C, "b (g n) -> b g n", g=self.ngroups)
425
- x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim)
426
- if not self.rmsnorm:
427
- z = rearrange(z, "b (h p) -> b h p", p=self.headdim)
428
- y = selective_state_update(
429
- ssm_state,
430
- x_reshaped,
431
- dt,
432
- A,
433
- B,
434
- C,
435
- D,
436
- z=z if not self.rmsnorm else None,
437
- dt_bias=dt_bias,
438
- dt_softplus=True,
439
- )
440
- y = rearrange(y, "b h p -> b (h p)")
441
- if self.rmsnorm:
442
- y = self.norm(y, z)
443
- if d_mlp > 0:
444
- y = torch.cat([F.silu(z0) * x0, y], dim=-1)
445
- out = self.out_proj(y)
446
- return out.unsqueeze(1), conv_state, ssm_state
447
-
448
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
449
- device = self.out_proj.weight.device
450
- conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
451
- conv_state = torch.zeros(
452
- batch_size,
453
- self.d_conv,
454
- self.conv1d.weight.shape[0],
455
- device=device,
456
- dtype=conv_dtype,
457
- ).transpose(1, 2)
458
- ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype
459
- ssm_state = torch.zeros(
460
- batch_size,
461
- self.nheads,
462
- self.headdim,
463
- self.d_state,
464
- device=device,
465
- dtype=ssm_dtype,
466
- )
467
- return conv_state, ssm_state
468
-
469
- def _get_states_from_cache(
470
- self, inference_params, batch_size, initialize_states=False
471
- ):
472
- assert self.layer_idx is not None
473
- if self.layer_idx not in inference_params.key_value_memory_dict:
474
- batch_shape = (batch_size,)
475
- conv_state = torch.zeros(
476
- batch_size,
477
- self.d_conv,
478
- self.conv1d.weight.shape[0],
479
- device=self.conv1d.weight.device,
480
- dtype=self.conv1d.weight.dtype,
481
- ).transpose(1, 2)
482
- ssm_state = torch.zeros(
483
- batch_size,
484
- self.nheads,
485
- self.headdim,
486
- self.d_state,
487
- device=self.in_proj.weight.device,
488
- dtype=self.in_proj.weight.dtype,
489
- )
490
- inference_params.key_value_memory_dict[self.layer_idx] = (
491
- conv_state,
492
- ssm_state,
493
- )
494
- else:
495
- conv_state, ssm_state = inference_params.key_value_memory_dict[
496
- self.layer_idx
497
- ]
498
- # TODO: What if batch size changes between generation, and we reuse the same states?
499
- if initialize_states:
500
- conv_state.zero_()
501
- ssm_state.zero_()
502
- return conv_state, ssm_state
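
A note on the fallback branch in the deleted `step` method above: when the fused `selective_state_update` Triton kernel is not importable, the single-token SSM update is done in plain PyTorch. The sketch below is only an illustration of that recurrence (it assumes `ngroups=1` and the shapes given in the comments; the function name and test values are made up for the example, it is not the kernel itself):

```python
import torch
import torch.nn.functional as F

def ssm_decode_step(ssm_state, x, dt, A, B, C, D, dt_bias):
    # ssm_state: (batch, nheads, headdim, d_state), updated in place
    # x: (batch, nheads, headdim); dt: (batch, nheads)
    # A, D, dt_bias: (nheads,); B, C: (batch, d_state)  [ngroups=1]
    dt = F.softplus(dt + dt_bias)                      # discretization step size
    dA = torch.exp(dt * A)                             # per-head decay
    dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)    # discretized input term
    ssm_state.copy_(ssm_state * dA[..., None, None] + dBx)
    y = torch.einsum("bhpn,bn->bhp", ssm_state, C)
    return y + D[:, None] * x                          # D "skip" connection

batch, nheads, headdim, d_state = 2, 4, 8, 16
state = torch.zeros(batch, nheads, headdim, d_state)
y = ssm_decode_step(
    state,
    torch.randn(batch, nheads, headdim),
    torch.randn(batch, nheads),
    -torch.rand(nheads),
    torch.randn(batch, d_state),
    torch.randn(batch, d_state),
    torch.ones(nheads),
    torch.zeros(nheads),
)
```
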
build/torch210-cxx11-cu126-x86_64-linux/modules/mamba2_simple.py DELETED
@@ -1,229 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- import math
4
- import torch
5
- import torch.nn as nn
6
- import torch.nn.functional as F
7
-
8
- from einops import rearrange, repeat
9
-
10
- try:
11
- from causal_conv1d import causal_conv1d_fn
12
- except ImportError:
13
- causal_conv1d_fn = None
14
-
15
- try:
16
- from ..ops.triton.layernorm_gated import RMSNorm as RMSNormGated, LayerNorm
17
- except ImportError:
18
- RMSNormGated, LayerNorm = None, None
19
-
20
- from ..ops.triton.ssd_combined import mamba_chunk_scan_combined
21
- from ..ops.triton.ssd_combined import mamba_split_conv1d_scan_combined
22
-
23
-
24
- class Mamba2Simple(nn.Module):
25
- def __init__(
26
- self,
27
- d_model,
28
- d_state=64,
29
- d_conv=4,
30
- conv_init=None,
31
- expand=2,
32
- headdim=128,
33
- ngroups=1,
34
- A_init_range=(1, 16),
35
- dt_min=0.001,
36
- dt_max=0.1,
37
- dt_init_floor=1e-4,
38
- dt_limit=(0.0, float("inf")),
39
- learnable_init_states=False,
40
- activation="swish",
41
- bias=False,
42
- conv_bias=True,
43
- # Fused kernel and sharding options
44
- chunk_size=256,
45
- use_mem_eff_path=True,
46
- layer_idx=None, # Absorb kwarg for general module
47
- device=None,
48
- dtype=None,
49
- ):
50
- factory_kwargs = {"device": device, "dtype": dtype}
51
- super().__init__()
52
- self.d_model = d_model
53
- self.d_state = d_state
54
- self.d_conv = d_conv
55
- self.conv_init = conv_init
56
- self.expand = expand
57
- self.d_inner = self.expand * self.d_model
58
- self.headdim = headdim
59
- self.ngroups = ngroups
60
- assert self.d_inner % self.headdim == 0
61
- self.nheads = self.d_inner // self.headdim
62
- self.dt_limit = dt_limit
63
- self.learnable_init_states = learnable_init_states
64
- self.activation = activation
65
- self.chunk_size = chunk_size
66
- self.use_mem_eff_path = use_mem_eff_path
67
- self.layer_idx = layer_idx
68
-
69
- # Order: [z, x, B, C, dt]
70
- d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
71
- self.in_proj = nn.Linear(self.d_model, d_in_proj, bias=bias, **factory_kwargs)
72
-
73
- conv_dim = self.d_inner + 2 * self.ngroups * self.d_state
74
- self.conv1d = nn.Conv1d(
75
- in_channels=conv_dim,
76
- out_channels=conv_dim,
77
- bias=conv_bias,
78
- kernel_size=d_conv,
79
- groups=conv_dim,
80
- padding=d_conv - 1,
81
- **factory_kwargs,
82
- )
83
- if self.conv_init is not None:
84
- nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
85
- # self.conv1d.weight._no_weight_decay = True
86
-
87
- if self.learnable_init_states:
88
- self.init_states = nn.Parameter(
89
- torch.zeros(self.nheads, self.headdim, self.d_state, **factory_kwargs)
90
- )
91
- self.init_states._no_weight_decay = True
92
-
93
- self.act = nn.SiLU()
94
-
95
- # Initialize log dt bias
96
- dt = torch.exp(
97
- torch.rand(self.nheads, **factory_kwargs)
98
- * (math.log(dt_max) - math.log(dt_min))
99
- + math.log(dt_min)
100
- )
101
- dt = torch.clamp(dt, min=dt_init_floor)
102
- # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
103
- inv_dt = dt + torch.log(-torch.expm1(-dt))
104
- self.dt_bias = nn.Parameter(inv_dt)
105
- # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
106
- # name.endswith("bias") in param_grouping.py
107
- self.dt_bias._no_weight_decay = True
108
-
109
- # A parameter
110
- assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
111
- A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(
112
- *A_init_range
113
- )
114
- A_log = torch.log(A).to(dtype=dtype)
115
- self.A_log = nn.Parameter(A_log)
116
- # self.register_buffer("A_log", torch.zeros(self.nheads, dtype=torch.float32, device=device), persistent=True)
117
- self.A_log._no_weight_decay = True
118
-
119
- # D "skip" parameter
120
- self.D = nn.Parameter(torch.ones(self.nheads, device=device))
121
- self.D._no_weight_decay = True
122
-
123
- # Extra normalization layer right before output projection
124
- assert RMSNormGated is not None
125
- self.norm = RMSNormGated(
126
- self.d_inner, eps=1e-5, norm_before_gate=False, **factory_kwargs
127
- )
128
-
129
- self.out_proj = nn.Linear(
130
- self.d_inner, self.d_model, bias=bias, **factory_kwargs
131
- )
132
-
133
- def forward(self, u, seq_idx=None):
134
- """
135
- u: (B, L, D)
136
- Returns: same shape as u
137
- """
138
- batch, seqlen, dim = u.shape
139
-
140
- zxbcdt = self.in_proj(u) # (B, L, d_in_proj)
141
- A = -torch.exp(self.A_log) # (nheads) or (d_inner, d_state)
142
- initial_states = (
143
- repeat(self.init_states, "... -> b ...", b=batch)
144
- if self.learnable_init_states
145
- else None
146
- )
147
- dt_limit_kwargs = (
148
- {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
149
- )
150
-
151
- if self.use_mem_eff_path:
152
- # Fully fused path
153
- out = mamba_split_conv1d_scan_combined(
154
- zxbcdt,
155
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
156
- self.conv1d.bias,
157
- self.dt_bias,
158
- A,
159
- D=self.D,
160
- chunk_size=self.chunk_size,
161
- seq_idx=seq_idx,
162
- activation=self.activation,
163
- rmsnorm_weight=self.norm.weight,
164
- rmsnorm_eps=self.norm.eps,
165
- outproj_weight=self.out_proj.weight,
166
- outproj_bias=self.out_proj.bias,
167
- headdim=self.headdim,
168
- ngroups=self.ngroups,
169
- norm_before_gate=False,
170
- initial_states=initial_states,
171
- **dt_limit_kwargs,
172
- )
173
- else:
174
- z, xBC, dt = torch.split(
175
- zxbcdt,
176
- [
177
- self.d_inner,
178
- self.d_inner + 2 * self.ngroups * self.d_state,
179
- self.nheads,
180
- ],
181
- dim=-1,
182
- )
183
- dt = F.softplus(dt + self.dt_bias) # (B, L, nheads)
184
- assert self.activation in ["silu", "swish"]
185
-
186
- # 1D Convolution
187
- if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
188
- xBC = self.act(
189
- self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)
190
- ) # (B, L, self.d_inner + 2 * ngroups * d_state)
191
- xBC = xBC[:, :seqlen, :]
192
- else:
193
- xBC = causal_conv1d_fn(
194
- x=xBC.transpose(1, 2),
195
- weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
196
- bias=self.conv1d.bias,
197
- activation=self.activation,
198
- ).transpose(1, 2)
199
-
200
- # Split into 3 main branches: X, B, C
201
- # These correspond to V, K, Q respectively in the SSM/attention duality
202
- x, B, C = torch.split(
203
- xBC,
204
- [
205
- self.d_inner,
206
- self.ngroups * self.d_state,
207
- self.ngroups * self.d_state,
208
- ],
209
- dim=-1,
210
- )
211
- y = mamba_chunk_scan_combined(
212
- rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
213
- dt,
214
- A,
215
- rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
216
- rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
217
- chunk_size=self.chunk_size,
218
- D=self.D,
219
- z=None,
220
- seq_idx=seq_idx,
221
- initial_states=initial_states,
222
- **dt_limit_kwargs,
223
- )
224
- y = rearrange(y, "b l h p -> b l (h p)")
225
-
226
- # Multiply "gate" branch and apply extra normalization layer
227
- y = self.norm(y, z)
228
- out = self.out_proj(y)
229
- return out
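
The `in_proj` of `Mamba2Simple` above packs `[z, x, B, C, dt]` into one projection of width `2 * d_inner + 2 * ngroups * d_state + nheads`. A small sketch of that bookkeeping (illustrative dimensions, plain PyTorch, no GPU required) shows how the non-fused path splits it back apart:

```python
import torch

d_model, expand, d_state, headdim, ngroups = 256, 2, 64, 64, 1
d_inner = expand * d_model                                  # 512
nheads = d_inner // headdim                                 # 8
d_in_proj = 2 * d_inner + 2 * ngroups * d_state + nheads    # 1160

zxbcdt = torch.randn(2, 16, d_in_proj)  # (batch, seqlen, d_in_proj)
z, xBC, dt = torch.split(
    zxbcdt,
    [d_inner, d_inner + 2 * ngroups * d_state, nheads],
    dim=-1,
)
print(z.shape, xBC.shape, dt.shape)  # gate, conv input (x | B | C), per-head dt
```
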
build/torch210-cxx11-cu126-x86_64-linux/modules/mamba_simple.py DELETED
@@ -1,339 +0,0 @@
1
- # Copyright (c) 2023, Tri Dao, Albert Gu.
2
-
3
- import math
4
- from typing import Optional
5
-
6
- import torch
7
- import torch.nn as nn
8
- import torch.nn.functional as F
9
- from torch import Tensor
10
-
11
- from einops import rearrange, repeat
12
-
13
- from ..ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
14
-
15
- try:
16
- from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
17
- except ImportError:
18
- causal_conv1d_fn, causal_conv1d_update = None, None
19
-
20
- try:
21
- from ..ops.triton.selective_state_update import selective_state_update
22
- except ImportError:
23
- selective_state_update = None
24
-
25
- try:
26
- from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
27
- except ImportError:
28
- RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
29
-
30
-
31
- class Mamba(nn.Module):
32
- def __init__(
33
- self,
34
- d_model,
35
- d_state=16,
36
- d_conv=4,
37
- expand=2,
38
- dt_rank="auto",
39
- dt_min=0.001,
40
- dt_max=0.1,
41
- dt_init="random",
42
- dt_scale=1.0,
43
- dt_init_floor=1e-4,
44
- conv_bias=True,
45
- bias=False,
46
- use_fast_path=True, # Fused kernel options
47
- layer_idx=None,
48
- device=None,
49
- dtype=None,
50
- ):
51
- factory_kwargs = {"device": device, "dtype": dtype}
52
- super().__init__()
53
- self.d_model = d_model
54
- self.d_state = d_state
55
- self.d_conv = d_conv
56
- self.expand = expand
57
- self.d_inner = int(self.expand * self.d_model)
58
- self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
59
- self.use_fast_path = use_fast_path
60
- self.layer_idx = layer_idx
61
-
62
- self.in_proj = nn.Linear(
63
- self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs
64
- )
65
-
66
- self.conv1d = nn.Conv1d(
67
- in_channels=self.d_inner,
68
- out_channels=self.d_inner,
69
- bias=conv_bias,
70
- kernel_size=d_conv,
71
- groups=self.d_inner,
72
- padding=d_conv - 1,
73
- **factory_kwargs,
74
- )
75
-
76
- self.activation = "silu"
77
- self.act = nn.SiLU()
78
-
79
- self.x_proj = nn.Linear(
80
- self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
81
- )
82
- self.dt_proj = nn.Linear(
83
- self.dt_rank, self.d_inner, bias=True, **factory_kwargs
84
- )
85
-
86
- # Initialize special dt projection to preserve variance at initialization
87
- dt_init_std = self.dt_rank**-0.5 * dt_scale
88
- if dt_init == "constant":
89
- nn.init.constant_(self.dt_proj.weight, dt_init_std)
90
- elif dt_init == "random":
91
- nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
92
- else:
93
- raise NotImplementedError
94
-
95
- # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
96
- dt = torch.exp(
97
- torch.rand(self.d_inner, **factory_kwargs)
98
- * (math.log(dt_max) - math.log(dt_min))
99
- + math.log(dt_min)
100
- ).clamp(min=dt_init_floor)
101
- # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
102
- inv_dt = dt + torch.log(-torch.expm1(-dt))
103
- with torch.no_grad():
104
- self.dt_proj.bias.copy_(inv_dt)
105
- # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
106
- self.dt_proj.bias._no_reinit = True
107
-
108
- # S4D real initialization
109
- A = repeat(
110
- torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
111
- "n -> d n",
112
- d=self.d_inner,
113
- ).contiguous()
114
- A_log = torch.log(A) # Keep A_log in fp32
115
- self.A_log = nn.Parameter(A_log)
116
- self.A_log._no_weight_decay = True
117
-
118
- # D "skip" parameter
119
- self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
120
- self.D._no_weight_decay = True
121
-
122
- self.out_proj = nn.Linear(
123
- self.d_inner, self.d_model, bias=bias, **factory_kwargs
124
- )
125
-
126
- def forward(self, hidden_states, inference_params=None):
127
- """
128
- hidden_states: (B, L, D)
129
- Returns: same shape as hidden_states
130
- """
131
- batch, seqlen, dim = hidden_states.shape
132
-
133
- conv_state, ssm_state = None, None
134
- if inference_params is not None:
135
- conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
136
- if inference_params.seqlen_offset > 0:
137
- # The states are updated inplace
138
- out, _, _ = self.step(hidden_states, conv_state, ssm_state)
139
- return out
140
-
141
- # We do matmul and transpose BLH -> HBL at the same time
142
- xz = rearrange(
143
- self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
144
- "d (b l) -> b d l",
145
- l=seqlen,
146
- )
147
- if self.in_proj.bias is not None:
148
- xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")
149
-
150
- A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
151
- # In the backward pass we write dx and dz next to each other to avoid torch.cat
152
- if (
153
- self.use_fast_path
154
- and causal_conv1d_fn is not None
155
- and inference_params is None
156
- ): # Doesn't support outputting the states
157
- out = mamba_inner_fn(
158
- xz,
159
- self.conv1d.weight,
160
- self.conv1d.bias,
161
- self.x_proj.weight,
162
- self.dt_proj.weight,
163
- self.out_proj.weight,
164
- self.out_proj.bias,
165
- A,
166
- None, # input-dependent B
167
- None, # input-dependent C
168
- self.D.float(),
169
- delta_bias=self.dt_proj.bias.float(),
170
- delta_softplus=True,
171
- )
172
- else:
173
- x, z = xz.chunk(2, dim=1)
174
- # Compute short convolution
175
- if conv_state is not None:
176
- # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
177
- # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
178
- conv_state.copy_(
179
- F.pad(x, (self.d_conv - x.shape[-1], 0))
180
- ) # Update state (B D W)
181
- if causal_conv1d_fn is None:
182
- x = self.act(self.conv1d(x)[..., :seqlen])
183
- else:
184
- assert self.activation in ["silu", "swish"]
185
- x = causal_conv1d_fn(
186
- x=x,
187
- weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
188
- bias=self.conv1d.bias,
189
- activation=self.activation,
190
- )
191
-
192
- # We're careful here about the layout, to avoid extra transposes.
193
- # We want dt to have d as the slowest moving dimension
194
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
195
- x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d)
196
- dt, B, C = torch.split(
197
- x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1
198
- )
199
- dt = self.dt_proj.weight @ dt.t()
200
- dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
201
- B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
202
- C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
203
- assert self.activation in ["silu", "swish"]
204
- y = selective_scan_fn(
205
- x,
206
- dt,
207
- A,
208
- B,
209
- C,
210
- self.D.float(),
211
- z=z,
212
- delta_bias=self.dt_proj.bias.float(),
213
- delta_softplus=True,
214
- return_last_state=ssm_state is not None,
215
- )
216
- if ssm_state is not None:
217
- y, last_state = y
218
- ssm_state.copy_(last_state)
219
- y = rearrange(y, "b d l -> b l d")
220
- out = self.out_proj(y)
221
- return out
222
-
223
- def step(self, hidden_states, conv_state, ssm_state):
224
- dtype = hidden_states.dtype
225
- assert (
226
- hidden_states.shape[1] == 1
227
- ), "Only support decoding with 1 token at a time for now"
228
- xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
229
- x, z = xz.chunk(2, dim=-1) # (B D)
230
-
231
- # Conv step
232
- if causal_conv1d_update is None:
233
- conv_state.copy_(
234
- torch.roll(conv_state, shifts=-1, dims=-1)
235
- ) # Update state (B D W)
236
- conv_state[:, :, -1] = x
237
- x = torch.sum(
238
- conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1
239
- ) # (B D)
240
- if self.conv1d.bias is not None:
241
- x = x + self.conv1d.bias
242
- x = self.act(x).to(dtype=dtype)
243
- else:
244
- x = causal_conv1d_update(
245
- x,
246
- conv_state,
247
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
248
- self.conv1d.bias,
249
- self.activation,
250
- )
251
-
252
- x_db = self.x_proj(x) # (B dt_rank+2*d_state)
253
- dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
254
- # Don't add dt_bias here
255
- dt = F.linear(dt, self.dt_proj.weight) # (B d_inner)
256
- A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
257
-
258
- # SSM step
259
- if selective_state_update is None:
260
- # Discretize A and B
261
- dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
262
- dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A))
263
- dB = torch.einsum("bd,bn->bdn", dt, B)
264
- ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB)
265
- y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C)
266
- y = y + self.D.to(dtype) * x
267
- y = y * self.act(z) # (B D)
268
- else:
269
- y = selective_state_update(
270
- ssm_state,
271
- x,
272
- dt,
273
- A,
274
- B,
275
- C,
276
- self.D,
277
- z=z,
278
- dt_bias=self.dt_proj.bias,
279
- dt_softplus=True,
280
- )
281
-
282
- out = self.out_proj(y)
283
- return out.unsqueeze(1), conv_state, ssm_state
284
-
285
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
286
- device = self.out_proj.weight.device
287
- conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
288
- conv_state = torch.zeros(
289
- batch_size,
290
- self.d_model * self.expand,
291
- self.d_conv,
292
- device=device,
293
- dtype=conv_dtype,
294
- )
295
- ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
296
- # ssm_dtype = torch.float32
297
- ssm_state = torch.zeros(
298
- batch_size,
299
- self.d_model * self.expand,
300
- self.d_state,
301
- device=device,
302
- dtype=ssm_dtype,
303
- )
304
- return conv_state, ssm_state
305
-
306
- def _get_states_from_cache(
307
- self, inference_params, batch_size, initialize_states=False
308
- ):
309
- assert self.layer_idx is not None
310
- if self.layer_idx not in inference_params.key_value_memory_dict:
311
- batch_shape = (batch_size,)
312
- conv_state = torch.zeros(
313
- batch_size,
314
- self.d_model * self.expand,
315
- self.d_conv,
316
- device=self.conv1d.weight.device,
317
- dtype=self.conv1d.weight.dtype,
318
- )
319
- ssm_state = torch.zeros(
320
- batch_size,
321
- self.d_model * self.expand,
322
- self.d_state,
323
- device=self.dt_proj.weight.device,
324
- dtype=self.dt_proj.weight.dtype,
325
- # dtype=torch.float32,
326
- )
327
- inference_params.key_value_memory_dict[self.layer_idx] = (
328
- conv_state,
329
- ssm_state,
330
- )
331
- else:
332
- conv_state, ssm_state = inference_params.key_value_memory_dict[
333
- self.layer_idx
334
- ]
335
- # TODO: What if batch size changes between generation, and we reuse the same states?
336
- if initialize_states:
337
- conv_state.zero_()
338
- ssm_state.zero_()
339
- return conv_state, ssm_state
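
The `dt` initialization above samples the step size log-uniformly in `[dt_min, dt_max]` and stores its inverse-softplus in `dt_proj.bias`, so that `F.softplus(dt_proj.bias)` recovers the sampled values. A short, self-contained check of that identity (illustrative sizes):

```python
import math
import torch
import torch.nn.functional as F

dt_min, dt_max, d_inner = 1e-3, 0.1, 512
dt = torch.exp(
    torch.rand(d_inner) * (math.log(dt_max) - math.log(dt_min)) + math.log(dt_min)
).clamp(min=1e-4)
inv_dt = dt + torch.log(-torch.expm1(-dt))   # inverse of softplus
assert torch.allclose(F.softplus(inv_dt), dt, atol=1e-6)
```
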
build/torch210-cxx11-cu126-x86_64-linux/modules/mha.py DELETED
@@ -1,294 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- import math
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from einops import rearrange
9
-
10
- try:
11
- from flash_attn import flash_attn_with_kvcache
12
- except ImportError:
13
- flash_attn_with_kvcache = None
14
-
15
- try:
16
- from flash_attn.layers.rotary import RotaryEmbedding
17
- except ImportError:
18
- RotaryEmbedding = None
19
-
20
- try:
21
- from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
22
- except ImportError:
23
- causal_conv1d_fn, causal_conv1d_update = None, None
24
-
25
-
26
- def _update_kv_cache(kv, inference_params, layer_idx):
27
- """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
28
- # Pre-allocate memory for key-values for inference.
29
- num_heads, head_dim = kv.shape[-2:]
30
- assert layer_idx in inference_params.key_value_memory_dict
31
- kv_cache, _ = inference_params.key_value_memory_dict[layer_idx]
32
- # Adjust key and value for inference
33
- batch_start = inference_params.batch_size_offset
34
- batch_end = batch_start + kv.shape[0]
35
- sequence_start = inference_params.seqlen_offset
36
- sequence_end = sequence_start + kv.shape[1]
37
- assert batch_end <= kv_cache.shape[0]
38
- assert sequence_end <= kv_cache.shape[1]
39
- assert kv_cache is not None
40
- kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv
41
- return kv_cache[batch_start:batch_end, :sequence_end, ...]
42
-
43
-
44
- class MHA(nn.Module):
45
- """Multi-head self-attention and cross-attention"""
46
-
47
- def __init__(
48
- self,
49
- embed_dim,
50
- num_heads,
51
- num_heads_kv=None,
52
- head_dim=None, # If None, use embed_dim // num_heads
53
- mlp_dim=0,
54
- qkv_proj_bias=True,
55
- out_proj_bias=True,
56
- softmax_scale=None,
57
- causal=False,
58
- layer_idx=None,
59
- d_conv=0,
60
- rotary_emb_dim=0,
61
- rotary_emb_base=10000.0,
62
- rotary_emb_interleaved=False,
63
- device=None,
64
- dtype=None,
65
- ) -> None:
66
- """
67
- num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
68
- return_residual: whether to return the input x along with the output. This is for
69
- performance reason: for post-norm architecture, returning the input allows us
70
- to fuse the backward of nn.Linear with the residual connection.
71
- """
72
- factory_kwargs = {"device": device, "dtype": dtype}
73
- super().__init__()
74
- self.embed_dim = embed_dim
75
- self.layer_idx = layer_idx
76
- self.d_conv = d_conv
77
- self.rotary_emb_dim = rotary_emb_dim
78
- self.softmax_scale = softmax_scale
79
- self.causal = causal
80
-
81
- self.num_heads = num_heads
82
- self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
83
- assert (
84
- self.num_heads % self.num_heads_kv == 0
85
- ), "num_heads must be divisible by num_heads_kv"
86
- if head_dim is None:
87
- assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
88
- self.head_dim = head_dim if head_dim is not None else self.embed_dim // num_heads
89
- self.mlp_dim = math.ceil(mlp_dim / 256) * 256
90
- qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
91
- out_dim = self.head_dim * self.num_heads
92
-
93
- if self.rotary_emb_dim > 0:
94
- assert RotaryEmbedding is not None, "rotary requires flash_attn to be installed"
95
- self.rotary_emb = RotaryEmbedding(
96
- self.rotary_emb_dim,
97
- base=rotary_emb_base,
98
- interleaved=rotary_emb_interleaved,
99
- device=device,
100
- )
101
-
102
- self.in_proj = nn.Linear(embed_dim, qkv_dim + self.mlp_dim, bias=qkv_proj_bias, **factory_kwargs)
103
- if self.d_conv > 0:
104
- self.conv1d = nn.Conv1d(
105
- qkv_dim, qkv_dim, kernel_size=self.d_conv, padding=self.d_conv - 1, groups=qkv_dim,
106
- **factory_kwargs
107
- )
108
- self.out_proj = nn.Linear(out_dim + self.mlp_dim // 2, embed_dim, bias=out_proj_bias, **factory_kwargs)
109
-
110
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
111
- dtype = self.out_proj.weight.dtype if dtype is None else dtype
112
- device = self.out_proj.weight.device
113
- if self.d_conv > 0:
114
- conv_state = torch.zeros(
115
- batch_size, self.conv1d.weight.shape[0], self.d_conv, device=device, dtype=dtype
116
- )
117
- else:
118
- conv_state = None
119
- kv_cache = torch.empty(
120
- batch_size, max_seqlen, 2, self.num_heads_kv, self.head_dim, dtype=dtype, device=device,
121
- )
122
- return kv_cache, conv_state
123
-
124
- def _update_kv_cache(self, kv, inference_params):
125
- """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
126
- assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
127
- return _update_kv_cache(kv, inference_params, self.layer_idx)
128
-
129
- def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):
130
- """
131
- Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention.
132
- q: (batch_size, seqlen_q, nheads, head_dim)
133
- kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
134
- """
135
- assert inference_params is not None and inference_params.seqlen_offset > 0
136
- if self.rotary_emb_dim > 0:
137
- self.rotary_emb._update_cos_sin_cache(
138
- inference_params.max_seqlen, device=q.device, dtype=q.dtype
139
- )
140
- rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached
141
- else:
142
- rotary_cos, rotary_sin = None, None
143
- batch = q.shape[0]
144
- kv_cache, _ = inference_params.key_value_memory_dict[self.layer_idx]
145
- kv_cache = kv_cache[:batch]
146
- cache_seqlens = (
147
- inference_params.lengths_per_sample[:batch]
148
- if inference_params.lengths_per_sample is not None
149
- else inference_params.seqlen_offset
150
- )
151
- assert flash_attn_with_kvcache is not None, "flash_attn must be installed"
152
- context = flash_attn_with_kvcache(
153
- q,
154
- kv_cache[:, :, 0],
155
- kv_cache[:, :, 1],
156
- kv[:, :, 0],
157
- kv[:, :, 1],
158
- rotary_cos=rotary_cos,
159
- rotary_sin=rotary_sin,
160
- cache_seqlens=cache_seqlens,
161
- softmax_scale=self.softmax_scale,
162
- causal=self.causal,
163
- rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False,
164
- )
165
- return context
166
-
167
- def _update_kvcache_attention(self, q, kv, inference_params):
168
- """Write kv to inference_params, then do attention"""
169
- if (
170
- inference_params.seqlen_offset == 0
171
- or flash_attn_with_kvcache is None
172
- ):
173
- # TODO: this only uses seqlen_offset and not lengths_per_sample.
174
- kv = self._update_kv_cache(kv, inference_params)
175
- k, v = kv.unbind(dim=-3)
176
- k = torch.repeat_interleave(k, dim=2, repeats=self.num_heads // self.num_heads_kv)
177
- v = torch.repeat_interleave(v, dim=2, repeats=self.num_heads // self.num_heads_kv)
178
- return F.scaled_dot_product_attention(
179
- q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=self.causal, scale=self.softmax_scale
180
- ).transpose(1, 2)
181
- else:
182
- batch = q.shape[0]
183
- kv_cache, _ = inference_params.key_value_memory_dict[self.layer_idx]
184
- kv_cache = kv_cache[:batch]
185
- cache_seqlens = (
186
- inference_params.lengths_per_sample[:batch]
187
- if inference_params.lengths_per_sample is not None
188
- else inference_params.seqlen_offset
189
- )
190
- return flash_attn_with_kvcache(
191
- q,
192
- kv_cache[:, :, 0],
193
- kv_cache[:, :, 1],
194
- kv[:, :, 0],
195
- kv[:, :, 1],
196
- cache_seqlens=cache_seqlens,
197
- softmax_scale=self.softmax_scale,
198
- causal=self.causal,
199
- )
200
-
201
- def forward(self, x, inference_params=None):
202
- """
203
- Arguments:
204
- x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
205
- cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
206
- is the sum of the sequence lengths in the batch.
207
- inference_params: for generation. Adapted from Megatron-LM (and Apex)
208
- https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
209
- """
210
- if inference_params is not None and self.layer_idx not in inference_params.key_value_memory_dict:
211
- inference_params.key_value_memory_dict[self.layer_idx] = self.allocate_inference_cache(
212
- x.shape[0], inference_params.max_seqlen, dtype=x.dtype
213
- )
214
- seqlen_offset = (
215
- 0
216
- if inference_params is None
217
- else (
218
- inference_params.lengths_per_sample
219
- if inference_params.lengths_per_sample is not None
220
- else inference_params.seqlen_offset
221
- )
222
- )
223
- rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
224
- qkv = self.in_proj(x)
225
- if self.mlp_dim > 0:
226
- qkv, x_mlp = qkv.split([qkv.shape[-1] - self.mlp_dim, self.mlp_dim], dim=-1)
227
- x_mlp_up, x_mlp_gate = x_mlp.chunk(2, dim=-1)
228
- x_mlp = x_mlp_up * F.silu(x_mlp_gate)
229
- if self.d_conv > 0:
230
- # The inference code for conv1d is pretty messy, should clean it up
231
- if (inference_params is None or inference_params.seqlen_offset == 0):
232
- if causal_conv1d_fn is None:
233
- qkv = rearrange(
234
- self.conv1d(rearrange(qkv, "b s d -> b d s"))[..., :-(self.d_conv - 1)], "b d s -> b s d"
235
- ).contiguous()
236
- else:
237
- qkv = causal_conv1d_fn(
238
- qkv.transpose(1, 2),
239
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
240
- self.conv1d.bias
241
- ).transpose(1, 2)
242
- if inference_params is not None:
243
- _, conv_state = inference_params.key_value_memory_dict[self.layer_idx]
244
- # If we just take qkv[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
245
- # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
246
- qkv_t = rearrange(qkv, "b l d -> b d l")
247
- conv_state.copy_(F.pad(qkv_t, (self.d_conv - qkv_t.shape[-1], 0))) # Update state (B D W)
248
- else:
249
- _, conv_state = inference_params.key_value_memory_dict[self.layer_idx]
250
- assert qkv.shape[1] == 1, "Only support decoding with 1 token at a time for now"
251
- qkv = qkv.squeeze(1)
252
- # Conv step
253
- if causal_conv1d_update is None:
254
- conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
255
- conv_state[:, :, -1] = qkv
256
- qkv = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
257
- if self.conv1d.bias is not None:
258
- qkv = qkv + self.conv1d.bias
259
- else:
260
- qkv = causal_conv1d_update(
261
- qkv,
262
- conv_state,
263
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
264
- self.conv1d.bias
265
- )
266
- qkv = qkv.unsqueeze(1)
267
- q, kv = qkv.split([self.num_heads * self.head_dim, self.num_heads_kv * 2 * self.head_dim], dim=-1)
268
- q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
269
- kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)
270
- if (
271
- inference_params is None
272
- or inference_params.seqlen_offset == 0
273
- or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
274
- ):
275
- if self.rotary_emb_dim > 0:
276
- q, kv = self.rotary_emb(
277
- q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
278
- )
279
- if inference_params is None:
280
- k, v = kv.unbind(dim=-3)
281
- k = torch.repeat_interleave(k, dim=2, repeats=self.num_heads // self.num_heads_kv)
282
- v = torch.repeat_interleave(v, dim=2, repeats=self.num_heads // self.num_heads_kv)
283
- context = F.scaled_dot_product_attention(
284
- q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=self.causal, scale=self.softmax_scale
285
- ).transpose(1, 2)
286
- else:
287
- context = self._update_kvcache_attention(q, kv, inference_params)
288
- else:
289
- context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
290
- context = rearrange(context, "... h d -> ... (h d)")
291
- if self.mlp_dim > 0:
292
- context = torch.cat([context, x_mlp], dim=-1)
293
- out = self.out_proj(context)
294
- return out
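
When `flash_attn_with_kvcache` is unavailable, or no decoding cache is in use, the MHA module above falls back to PyTorch SDPA, repeating the KV heads so that MQA/GQA layouts match the query heads. A minimal sketch of that fallback (illustrative shapes, causal attention, default softmax scale):

```python
import torch
import torch.nn.functional as F

batch, seqlen, num_heads, num_heads_kv, head_dim = 2, 16, 8, 2, 64
q = torch.randn(batch, seqlen, num_heads, head_dim)
kv = torch.randn(batch, seqlen, 2, num_heads_kv, head_dim)

k, v = kv.unbind(dim=-3)
k = torch.repeat_interleave(k, dim=2, repeats=num_heads // num_heads_kv)
v = torch.repeat_interleave(v, dim=2, repeats=num_heads // num_heads_kv)
out = F.scaled_dot_product_attention(
    q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=True
).transpose(1, 2)   # (batch, seqlen, num_heads, head_dim)
```
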
build/torch210-cxx11-cu126-x86_64-linux/modules/mlp.py DELETED
@@ -1,34 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
- from torch import nn
3
- from torch.nn import functional as F
4
-
5
-
6
- class GatedMLP(nn.Module):
7
- def __init__(
8
- self,
9
- in_features,
10
- hidden_features=None,
11
- out_features=None,
12
- activation=F.silu,
13
- bias=False,
14
- multiple_of=128,
15
- device=None,
16
- dtype=None,
17
- ):
18
- factory_kwargs = {"device": device, "dtype": dtype}
19
- super().__init__()
20
- out_features = out_features if out_features is not None else in_features
21
- hidden_features = (
22
- hidden_features if hidden_features is not None else int(8 * in_features / 3)
23
- )
24
- hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
25
- self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias, **factory_kwargs)
26
- self.activation = activation
27
- self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, **factory_kwargs)
28
-
29
- def forward(self, x):
30
- y = self.fc1(x)
31
- y, gate = y.chunk(2, dim=-1)
32
- y = y * self.activation(gate)
33
- y = self.fc2(y)
34
- return y
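
`GatedMLP` above defaults the hidden width to `8/3 * in_features` rounded up to a multiple of `multiple_of`, and `fc1` produces the value and gate halves that are combined with SiLU. A small sketch of the same computation without the module wrapper (illustrative sizes):

```python
import torch
import torch.nn.functional as F

in_features, multiple_of = 768, 128
hidden = int(8 * in_features / 3)
hidden = (hidden + multiple_of - 1) // multiple_of * multiple_of  # 2048 here

fc1 = torch.nn.Linear(in_features, 2 * hidden, bias=False)
fc2 = torch.nn.Linear(hidden, in_features, bias=False)
x = torch.randn(2, 16, in_features)
y, gate = fc1(x).chunk(2, dim=-1)
out = fc2(y * F.silu(gate))   # (2, 16, 768)
```
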
build/torch210-cxx11-cu126-x86_64-linux/modules/ssd_minimal.py DELETED
@@ -1,111 +0,0 @@
1
- # Copyright (c) 2024, Albert Gu and Tri Dao.
2
- """Minimal implementation of SSD.
3
-
4
- This is the same as Listing 1 from the paper.
5
- """
6
-
7
- import torch
8
- import torch.nn.functional as F
9
- from einops import rearrange, repeat
10
-
11
- from ..ops.triton.ssd_combined import mamba_chunk_scan_combined
12
-
13
-
14
- def segsum_unstable(x):
15
- """Naive segment sum calculation."""
16
- T = x.size(-1)
17
- x_cumsum = torch.cumsum(x, dim=-1)
18
- x_segsum = x_cumsum[..., :, None] - x_cumsum[..., None, :]
19
- mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=0)
20
- x_segsum = x_segsum.masked_fill(~mask, -torch.inf)
21
- return x_segsum
22
-
23
-
24
- def segsum(x):
25
- """More stable segment sum calculation."""
26
- T = x.size(-1)
27
- x = repeat(x, "... d -> ... d e", e=T)
28
- mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=-1)
29
- x = x.masked_fill(~mask, 0)
30
- x_segsum = torch.cumsum(x, dim=-2)
31
- mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=0)
32
- x_segsum = x_segsum.masked_fill(~mask, -torch.inf)
33
- return x_segsum
34
-
35
-
36
- def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
37
- """
38
- Arguments:
39
- X: (batch, length, n_heads, d_head)
40
- A: (batch, length, n_heads)
41
- B: (batch, length, n_heads, d_state)
42
- C: (batch, length, n_heads, d_state)
43
- Return:
44
- Y: (batch, length, n_heads, d_head)
45
- """
46
- assert X.dtype == A.dtype == B.dtype == C.dtype
47
- assert X.shape[1] % block_len == 0
48
-
49
- # Rearrange into blocks/chunks
50
- X, A, B, C = [
51
- rearrange(x, "b (c l) ... -> b c l ...", l=block_len) for x in (X, A, B, C)
52
- ]
53
-
54
- A = rearrange(A, "b c l h -> b h c l")
55
- A_cumsum = torch.cumsum(A, dim=-1)
56
-
57
- # 1. Compute the output for each intra-chunk (diagonal blocks)
58
- L = torch.exp(segsum(A))
59
- Y_diag = torch.einsum("bclhn,bcshn,bhcls,bcshp->bclhp", C, B, L, X)
60
-
61
- # 2. Compute the state for each intra-chunk
62
- # (right term of low-rank factorization of off-diagonal blocks; B terms)
63
- decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
64
- states = torch.einsum("bclhn,bhcl,bclhp->bchpn", B, decay_states, X)
65
-
66
- # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
67
- # (middle term of factorization of off-diag blocks; A terms)
68
- if initial_states is None:
69
- initial_states = torch.zeros_like(states[:, :1])
70
- states = torch.cat([initial_states, states], dim=1)
71
- decay_chunk = torch.exp(segsum(F.pad(A_cumsum[:, :, :, -1], (1, 0))))
72
- new_states = torch.einsum("bhzc,bchpn->bzhpn", decay_chunk, states)
73
- states, final_state = new_states[:, :-1], new_states[:, -1]
74
-
75
- # 4. Compute state -> output conversion per chunk
76
- # (left term of low-rank factorization of off-diagonal blocks; C terms)
77
- state_decay_out = torch.exp(A_cumsum)
78
- Y_off = torch.einsum("bclhn,bchpn,bhcl->bclhp", C, states, state_decay_out)
79
-
80
- # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
81
- Y = rearrange(Y_diag + Y_off, "b c l h p -> b (c l) h p")
82
- return Y, final_state
83
-
84
-
85
- # Simple test
86
- def test_correctness():
87
- torch.manual_seed(42)
88
-
89
- ## Dimensions
90
- # Denoted (B, T, Q, D, P) in the paper
91
- batch, seqlen, chunk_size, dim, headdim = 1, 2048, 64, 2048, 64
92
- nheads = dim // headdim # (H) in the paper
93
- ngroups = 1 # (G) in the paper
94
- dstate = 64 # (N) in the paper
95
- dtype = torch.float32
96
- device = "cuda"
97
-
98
- x = torch.randn(batch, seqlen, nheads, headdim, dtype=dtype, device=device)
99
- dt = F.softplus(
100
- torch.randn(batch, seqlen, nheads, dtype=torch.float32, device=device) - 4
101
- ).requires_grad_()
102
- A = (
103
- -torch.exp(torch.rand(nheads, dtype=torch.float32, device=device))
104
- ).requires_grad_()
105
- B = torch.randn(batch, seqlen, ngroups, dstate, dtype=dtype, device=device)
106
- C = torch.randn(batch, seqlen, ngroups, dstate, dtype=dtype, device=device)
107
- D = torch.randn(nheads, dtype=dtype, device=device)
108
-
109
- # Comparing fused version and minimal version
110
- y = mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=None)
111
- y_min, _ = ssd_minimal_discrete(x * dt.unsqueeze(-1), A * dt, B, C, chunk_size)
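
The segment sum is the piece of `ssd_minimal_discrete` above that builds the lower-triangular decay matrix `L = exp(segsum(A))` used in step 1. The sketch below mirrors the stabilized `segsum` helper with a plain `expand` in place of `einops.repeat` and checks its semantics on a tiny input: `segsum(a)[i, j]` sums `a[j+1..i]` on the lower triangle and is `-inf` above the diagonal (values are illustrative):

```python
import torch

def segsum(x):
    T = x.size(-1)
    xm = x[..., None].expand(*x.shape, T)          # entry [d, e] = x[..., d]
    mask = torch.tril(torch.ones(T, T, dtype=torch.bool), diagonal=-1)
    xm = xm.masked_fill(~mask, 0)                  # keep strictly-lower entries
    out = torch.cumsum(xm, dim=-2)                 # out[i, j] = sum(x[j+1..i])
    mask = torch.tril(torch.ones(T, T, dtype=torch.bool), diagonal=0)
    return out.masked_fill(~mask, float("-inf"))

a = torch.tensor([0.1, 0.2, 0.3, 0.4])
S = segsum(a)
assert torch.isclose(S[2, 0], a[1] + a[2])         # sum over positions 1..2
assert torch.isinf(S[0, 1]) and S[0, 1] < 0        # upper triangle masked out
```
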
build/torch210-cxx11-cu126-x86_64-linux/ops/__init__.py DELETED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/ops/selective_scan_interface.py DELETED
@@ -1,446 +0,0 @@
1
- # Copyright (c) 2023, Tri Dao, Albert Gu.
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- from ..utils.torch import custom_fwd, custom_bwd
6
-
7
- from einops import rearrange, repeat
8
-
9
- try:
10
- from causal_conv1d import causal_conv1d_fn
11
- from causal_conv1d.causal_conv1d_interface import causal_conv1d_cuda
12
- except ImportError:
13
- causal_conv1d_fn = None
14
- causal_conv1d_cuda = None
15
-
16
- from .triton.layer_norm import _layer_norm_fwd
17
-
18
- from .._ops import ops
19
-
20
-
21
- class SelectiveScanFn(torch.autograd.Function):
22
-
23
- @staticmethod
24
- def forward(ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
25
- return_last_state=False):
26
- if u.stride(-1) != 1:
27
- u = u.contiguous()
28
- if delta.stride(-1) != 1:
29
- delta = delta.contiguous()
30
- if D is not None:
31
- D = D.contiguous()
32
- if B.stride(-1) != 1:
33
- B = B.contiguous()
34
- if C.stride(-1) != 1:
35
- C = C.contiguous()
36
- if z is not None and z.stride(-1) != 1:
37
- z = z.contiguous()
38
- if B.dim() == 3:
39
- B = rearrange(B, "b dstate l -> b 1 dstate l")
40
- ctx.squeeze_B = True
41
- if C.dim() == 3:
42
- C = rearrange(C, "b dstate l -> b 1 dstate l")
43
- ctx.squeeze_C = True
44
- out, x, *rest = ops.selective_scan_fwd(
45
- u, delta, A, B, C, D, z, delta_bias, delta_softplus
46
- )
47
- ctx.delta_softplus = delta_softplus
48
- ctx.has_z = z is not None
49
- last_state = x[:, :, -1, 1::2] # (batch, dim, dstate)
50
- if not ctx.has_z:
51
- ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
52
- return out if not return_last_state else (out, last_state)
53
- else:
54
- ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
55
- out_z = rest[0]
56
- return out_z if not return_last_state else (out_z, last_state)
57
-
58
- @staticmethod
59
- def backward(ctx, dout, *args):
60
- if not ctx.has_z:
61
- u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
62
- z = None
63
- out = None
64
- else:
65
- u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
66
- if dout.stride(-1) != 1:
67
- dout = dout.contiguous()
68
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
69
- # backward of selective_scan_cuda with the backward of chunk).
70
- # Here we just pass in None and dz will be allocated in the C++ code.
71
- du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = ops.selective_scan_bwd(
72
- u,
73
- delta,
74
- A,
75
- B,
76
- C,
77
- D,
78
- z,
79
- delta_bias,
80
- dout,
81
- x,
82
- out,
83
- None,
84
- ctx.delta_softplus,
85
- False, # option to recompute out_z, not used here
86
- )
87
- dz = rest[0] if ctx.has_z else None
88
- dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
89
- dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
90
- return (du, ddelta, dA, dB, dC,
91
- dD if D is not None else None,
92
- dz,
93
- ddelta_bias if delta_bias is not None else None,
94
- None,
95
- None)
96
-
97
-
98
- def rms_norm_forward(
99
- x,
100
- weight,
101
- bias,
102
- eps=1e-6,
103
- is_rms_norm=True,
104
- ):
105
- # x (b l) d
106
- if x.stride(-1) != 1:
107
- x = x.contiguous()
108
- weight = weight.contiguous()
109
- if bias is not None:
110
- bias = bias.contiguous()
111
- y = _layer_norm_fwd(
112
- x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm
113
- )[0]
114
- # y (b l) d
115
- return y
116
-
117
-
118
- def selective_scan_fn(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
119
- return_last_state=False):
120
- """if return_last_state is True, returns (out, last_state)
121
- last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
122
- not considered in the backward pass.
123
- """
124
- return SelectiveScanFn.apply(u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state)
125
-
126
-
127
- def selective_scan_ref(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
128
- return_last_state=False):
129
- """
130
- u: r(B D L)
131
- delta: r(B D L)
132
- A: c(D N) or r(D N)
133
- B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
134
- C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
135
- D: r(D)
136
- z: r(B D L)
137
- delta_bias: r(D), fp32
138
-
139
- out: r(B D L)
140
- last_state (optional): r(B D dstate) or c(B D dstate)
141
- """
142
- dtype_in = u.dtype
143
- u = u.float()
144
- delta = delta.float()
145
- if delta_bias is not None:
146
- delta = delta + delta_bias[..., None].float()
147
- if delta_softplus:
148
- delta = F.softplus(delta)
149
- batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
150
- is_variable_B = B.dim() >= 3
151
- is_variable_C = C.dim() >= 3
152
- if A.is_complex():
153
- if is_variable_B:
154
- B = torch.view_as_complex(rearrange(B.float(), "... (L two) -> ... L two", two=2))
155
- if is_variable_C:
156
- C = torch.view_as_complex(rearrange(C.float(), "... (L two) -> ... L two", two=2))
157
- else:
158
- B = B.float()
159
- C = C.float()
160
- x = A.new_zeros((batch, dim, dstate))
161
- ys = []
162
- deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
163
- if not is_variable_B:
164
- deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
165
- else:
166
- if B.dim() == 3:
167
- deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
168
- else:
169
- B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
170
- deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
171
- if is_variable_C and C.dim() == 4:
172
- C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
173
- last_state = None
174
- for i in range(u.shape[2]):
175
- x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
176
- if not is_variable_C:
177
- y = torch.einsum('bdn,dn->bd', x, C)
178
- else:
179
- if C.dim() == 3:
180
- y = torch.einsum('bdn,bn->bd', x, C[:, :, i])
181
- else:
182
- y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
183
- if i == u.shape[2] - 1:
184
- last_state = x
185
- if y.is_complex():
186
- y = y.real * 2
187
- ys.append(y)
188
- y = torch.stack(ys, dim=2) # (batch dim L)
189
- out = y if D is None else y + u * rearrange(D, "d -> d 1")
190
- if z is not None:
191
- out = out * F.silu(z)
192
- out = out.to(dtype=dtype_in)
193
- return out if not return_last_state else (out, last_state)
194
-
195
-
196
- class MambaInnerFn(torch.autograd.Function):
197
-
198
- @staticmethod
199
- @custom_fwd
200
- def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
201
- out_proj_weight, out_proj_bias,
202
- A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
203
- C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1, b_rms_weight=None, c_rms_weight= None, dt_rms_weight= None, b_c_dt_rms_eps=1e-6):
204
- """
205
- xz: (batch, dim, seqlen)
206
- """
207
- assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
208
- assert checkpoint_lvl in [0, 1]
209
- L = xz.shape[-1]
210
- delta_rank = delta_proj_weight.shape[1]
211
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
212
- if torch.is_autocast_enabled():
213
- x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
214
- delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
215
- out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
216
- out_proj_bias = (out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
217
- if out_proj_bias is not None else None)
218
- if xz.stride(-1) != 1:
219
- xz = xz.contiguous()
220
- conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
221
- x, z = xz.chunk(2, dim=1)
222
- conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
223
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
224
- x, conv1d_weight, conv1d_bias, None, None, None, True
225
- )
226
- # We're being very careful here about the layout, to avoid extra transposes.
227
- # We want delta to have d as the slowest moving dimension
228
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
229
- x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
230
- delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L)
231
- ctx.is_variable_B = B is None
232
- ctx.is_variable_C = C is None
233
- ctx.B_proj_bias_is_None = B_proj_bias is None
234
- ctx.C_proj_bias_is_None = C_proj_bias is None
235
- if B is None: # variable B
236
- B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate)
237
- if B_proj_bias is not None:
238
- B = B + B_proj_bias.to(dtype=B.dtype)
239
- if not A.is_complex():
240
- # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
241
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
242
- else:
243
- B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
244
- else:
245
- if B.stride(-1) != 1:
246
- B = B.contiguous()
247
- if C is None: # variable C
248
- C = x_dbl[:, -d_state:] # (bl dstate)
249
- if C_proj_bias is not None:
250
- C = C + C_proj_bias.to(dtype=C.dtype)
251
- if not A.is_complex():
252
- # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
253
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
254
- else:
255
- C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
256
- else:
257
- if C.stride(-1) != 1:
258
- C = C.contiguous()
259
- if D is not None:
260
- D = D.contiguous()
261
-
262
- if b_rms_weight is not None:
263
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
264
- B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps)
265
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
266
- if c_rms_weight is not None:
267
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
268
- C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps)
269
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
270
- if dt_rms_weight is not None:
271
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
272
- delta = rms_norm_forward(delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps)
273
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
274
-
275
- out, scan_intermediates, out_z = ops.selective_scan_fwd(
276
- conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
277
- )
278
- ctx.delta_softplus = delta_softplus
279
- ctx.out_proj_bias_is_None = out_proj_bias is None
280
- ctx.checkpoint_lvl = checkpoint_lvl
281
- ctx.b_rms_weight = b_rms_weight
282
- ctx.c_rms_weight = c_rms_weight
283
- ctx.dt_rms_weight = dt_rms_weight
284
- ctx.b_c_dt_rms_eps = b_c_dt_rms_eps
285
- if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass
286
- conv1d_out, delta = None, None
287
- ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight,
288
- delta_proj_weight, out_proj_weight, conv1d_out, delta,
289
- A, B, C, D, delta_bias, scan_intermediates, b_rms_weight, c_rms_weight, dt_rms_weight, out)
290
- return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias)
291
-
292
- @staticmethod
293
- @custom_bwd
294
- def backward(ctx, dout):
295
- # dout: (batch, seqlen, dim)
296
- assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
297
- (xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight, out_proj_weight,
298
- conv1d_out, delta, A, B, C, D, delta_bias, scan_intermediates, b_rms_weight, c_rms_weight, dt_rms_weight, out) = ctx.saved_tensors
299
- L = xz.shape[-1]
300
- delta_rank = delta_proj_weight.shape[1]
301
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
302
- x, z = xz.chunk(2, dim=1)
303
- if dout.stride(-1) != 1:
304
- dout = dout.contiguous()
305
- if ctx.checkpoint_lvl == 1:
306
- conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
307
- x, conv1d_weight, conv1d_bias, None, None, None, True
308
- )
309
- delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(),
310
- "d (b l) -> b d l", l = L)
311
- if dt_rms_weight is not None:
312
- delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
313
- delta = rms_norm_forward(delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps)
314
- delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
315
- if b_rms_weight is not None:
316
- # Recompute & RMSNorm B
317
- B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
318
- B = rms_norm_forward(
319
- B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps
320
- )
321
- B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
322
- if c_rms_weight is not None:
323
- # Recompute & RMSNorm C
324
- C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
325
- C = rms_norm_forward(
326
- C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps
327
- )
328
- C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
329
-
330
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
331
- # backward of selective_scan_cuda with the backward of chunk).
332
- dxz = torch.empty_like(xz) # (batch, dim, seqlen)
333
- dx, dz = dxz.chunk(2, dim=1)
334
- dout = rearrange(dout, "b l e -> e (b l)")
335
- dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
336
- dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = (
337
- ops.selective_scan_bwd(
338
- conv1d_out,
339
- delta,
340
- A,
341
- B,
342
- C,
343
- D,
344
- z,
345
- delta_bias,
346
- dout_y,
347
- scan_intermediates,
348
- out,
349
- dz,
350
- ctx.delta_softplus,
351
- True, # option to recompute out_z
352
- )
353
- )
354
- dout_proj_weight = torch.einsum(
355
- "eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)")
356
- )
357
- dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
358
- dD = dD if D is not None else None
359
- dx_dbl = torch.empty_like(x_dbl)
360
- dB_proj_bias = None
361
- if ctx.is_variable_B:
362
- if not A.is_complex():
363
- dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
364
- else:
365
- dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
366
- dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
367
- dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d)
368
- dB = None
369
- dC_proj_bias = None
370
- if ctx.is_variable_C:
371
- if not A.is_complex():
372
- dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
373
- else:
374
- dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
375
- dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
376
- dx_dbl[:, -d_state:] = dC # (bl d)
377
- dC = None
378
- ddelta = rearrange(ddelta, "b d l -> d (b l)")
379
- ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
380
- dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
381
- dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
382
- dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d"))
383
- dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out)
384
- dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1])
385
- # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
386
- # backward of conv1d with the backward of chunk).
387
- dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
388
- x, conv1d_weight, conv1d_bias, dconv1d_out, None, None, None, dx, False, True
389
- )
390
- dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
391
- dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
392
- return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight,
393
- dout_proj_weight, dout_proj_bias,
394
- dA, dB, dC, dD,
395
- ddelta_bias if delta_bias is not None else None,
396
- # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps
397
- dB_proj_bias, dC_proj_bias, None, None, None, None, None, None)
398
-
399
-
400
- def mamba_inner_fn(
401
- xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
402
- out_proj_weight, out_proj_bias,
403
- A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
404
- C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1, b_rms_weight= None, c_rms_weight= None, dt_rms_weight= None, b_c_dt_rms_eps=1e-6
405
- ):
406
- return MambaInnerFn.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
407
- out_proj_weight, out_proj_bias,
408
- A, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps)
409
-
410
-
411
- def mamba_inner_ref(
412
- xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
413
- out_proj_weight, out_proj_bias,
414
- A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
415
- C_proj_bias=None, delta_softplus=True
416
- ):
417
- assert causal_conv1d_fn is not None, "causal_conv1d_fn is not available. Please install causal-conv1d."
418
- L = xz.shape[-1]
419
- delta_rank = delta_proj_weight.shape[1]
420
- d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
421
- x, z = xz.chunk(2, dim=1)
422
- x = causal_conv1d_fn(x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, activation="silu")
423
- # We're being very careful here about the layout, to avoid extra transposes.
424
- # We want delta to have d as the slowest moving dimension
425
- # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
426
- x_dbl = F.linear(rearrange(x, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
427
- delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
428
- delta = rearrange(delta, "d (b l) -> b d l", l=L)
429
- if B is None: # variable B
430
- B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl d)
431
- if B_proj_bias is not None:
432
- B = B + B_proj_bias.to(dtype=B.dtype)
433
- if not A.is_complex():
434
- B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
435
- else:
436
- B = rearrange(B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
437
- if C is None: # variable C
438
- C = x_dbl[:, -d_state:] # (bl d)
439
- if C_proj_bias is not None:
440
- C = C + C_proj_bias.to(dtype=C.dtype)
441
- if not A.is_complex():
442
- C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
443
- else:
444
- C = rearrange(C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
445
- y = selective_scan_fn(x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True)
446
- return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
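A minimal usage sketch for the deleted mamba_inner_fn above. The tensor shapes, the import path, and the fp16/fp32 split are assumptions based on a typical Mamba-1 configuration, not taken from this diff, and running it needs a CUDA device with the compiled selective-scan and causal-conv1d kernels:

    import torch
    # import path as in the upstream mamba_ssm package (assumption for this build)
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn

    batch, seqlen = 2, 128
    d_model, d_inner, d_state, dt_rank, width = 1024, 2048, 16, 64, 4
    dev, dtype = "cuda", torch.float16

    xz = torch.randn(batch, 2 * d_inner, seqlen, device=dev, dtype=dtype, requires_grad=True)
    conv1d_weight = torch.randn(d_inner, 1, width, device=dev, dtype=dtype, requires_grad=True)
    conv1d_bias = torch.randn(d_inner, device=dev, dtype=dtype, requires_grad=True)
    x_proj_weight = torch.randn(dt_rank + 2 * d_state, d_inner, device=dev, dtype=dtype, requires_grad=True)
    delta_proj_weight = torch.randn(d_inner, dt_rank, device=dev, dtype=dtype, requires_grad=True)
    out_proj_weight = torch.randn(d_model, d_inner, device=dev, dtype=dtype, requires_grad=True)
    A = -torch.rand(d_inner, d_state, device=dev, dtype=torch.float32)  # A stays in fp32
    D = torch.ones(d_inner, device=dev, dtype=torch.float32)

    # B and C are left as None so they are computed from x_proj_weight (the "variable B/C" path)
    out = mamba_inner_fn(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
                         out_proj_weight, None, A, None, None, D, delta_softplus=True)
    out.sum().backward()  # exercises the MambaInnerFn backward shown in this file

mamba_inner_ref computes the same result without the fused kernel and is useful as a numerical reference when debugging.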
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/__init__.py DELETED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/k_activations.py DELETED
@@ -1,169 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- import torch
4
-
5
- import triton
6
- import triton.language as tl
7
-
8
-
9
- @triton.autotune(
10
- configs=[
11
- triton.Config({'BLOCK_N': 32}),
12
- triton.Config({'BLOCK_N': 64}),
13
- triton.Config({'BLOCK_N': 128}),
14
- triton.Config({'BLOCK_N': 256}),
15
- triton.Config({'BLOCK_N': 512}),
16
- triton.Config({'BLOCK_N': 1024}),
17
- ],
18
- key=['ncols'],
19
- )
20
- @triton.jit
21
- def _swiglu_fwd_kernel(
22
- X,
23
- Y,
24
- OUT,
25
- stride_x_row, # how much to increase the pointer when moving by 1 row
26
- stride_y_row,
27
- stride_out_row,
28
- ncols,
29
- BLOCK_N: tl.constexpr,
30
- ):
31
- # Map the program id to the row of X and Y it should compute.
32
- row = tl.program_id(0)
33
- start_col = tl.program_id(1) * BLOCK_N
34
- X += row * stride_x_row
35
- Y += row * stride_y_row
36
- OUT += row * stride_out_row
37
- cols = start_col + tl.arange(0, BLOCK_N)
38
- x = tl.load(X + cols, mask=cols < ncols, other=0.).to(tl.float32)
39
- y = tl.load(Y + cols, mask=cols < ncols, other=0.).to(tl.float32)
40
- out = x * tl.sigmoid(x) * y
41
- tl.store(OUT + cols, out, mask=cols < ncols)
42
-
43
-
44
- def _swiglu_fwd(xy, out=None):
45
- if xy.stride(-1) != 1:
46
- xy = xy.contiguous()
47
- batch_shape = xy.shape[:-1]
48
- xy = xy.reshape(-1, xy.shape[-1])
49
- x, y = xy.chunk(2, dim=-1)
50
- if out is None:
51
- out = torch.empty_like(x)
52
- else:
53
- out = out.reshape(-1, out.shape[-1])
54
- assert out.shape == x.shape
55
- assert out.stride(-1) == 1
56
- M, N = x.shape
57
- grid = lambda META: (M, triton.cdiv(N, META['BLOCK_N']))
58
- with torch.cuda.device(x.device.index):
59
- _swiglu_fwd_kernel[grid](x, y, out, x.stride(0), y.stride(0), out.stride(0), N)
60
- return out.reshape(*batch_shape, out.shape[-1])
61
-
62
-
63
- @triton.autotune(
64
- configs=[
65
- triton.Config({'BLOCK_N': 32}),
66
- triton.Config({'BLOCK_N': 64}),
67
- triton.Config({'BLOCK_N': 128}),
68
- triton.Config({'BLOCK_N': 256}),
69
- triton.Config({'BLOCK_N': 512}),
70
- triton.Config({'BLOCK_N': 1024}),
71
- ],
72
- key=['ncols'],
73
- )
74
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["OUT"] is not None})
75
- @triton.jit
76
- def _swiglu_bwd_kernel(
77
- X,
78
- Y,
79
- DOUT,
80
- OUT,
81
- DX,
82
- DY,
83
- stride_x_row, # how much to increase the pointer when moving by 1 row
84
- stride_y_row,
85
- stride_dout_row,
86
- stride_out_row,
87
- stride_dx_row,
88
- stride_dy_row,
89
- ncols,
90
- BLOCK_N: tl.constexpr,
91
- RECOMPUTE_OUTPUT: tl.constexpr,
92
- ):
93
- # Map the program id to the row of X and Y it should compute.
94
- row = tl.program_id(0)
95
- start_col = tl.program_id(1) * BLOCK_N
96
- X += row * stride_x_row
97
- Y += row * stride_y_row
98
- DOUT += row * stride_dout_row
99
- if RECOMPUTE_OUTPUT:
100
- OUT += row * stride_out_row
101
- DX += row * stride_dx_row
102
- DY += row * stride_dy_row
103
- cols = start_col + tl.arange(0, BLOCK_N)
104
- x = tl.load(X + cols, mask=cols < ncols, other=0.).to(tl.float32)
105
- y = tl.load(Y + cols, mask=cols < ncols, other=0.).to(tl.float32)
106
- dout = tl.load(DOUT + cols, mask=cols < ncols, other=0.).to(tl.float32)
107
- x_sigmoid = tl.sigmoid(x)
108
- dx = x_sigmoid * (1 + x * (1 - x_sigmoid)) * y * dout
109
- dy = x * x_sigmoid * dout
110
- tl.store(DX + cols, dx, mask=cols < ncols)
111
- tl.store(DY + cols, dy, mask=cols < ncols)
112
- if RECOMPUTE_OUTPUT:
113
- out = x * x_sigmoid * y
114
- tl.store(OUT + cols, out, mask=cols < ncols)
115
-
116
-
117
- def _swiglu_bwd(xy, dout, dxy=None, recompute_output=False, out=None):
118
- if xy.stride(-1) != 1:
119
- xy = xy.contiguous()
120
- if dout.stride(-1) != 1:
121
- dout = dout.contiguous()
122
- batch_shape = xy.shape[:-1]
123
- xy = xy.reshape(-1, xy.shape[-1])
124
- x, y = xy.chunk(2, dim=-1)
125
- dout = dout.reshape(-1, dout.shape[-1])
126
- assert dout.shape == x.shape
127
- if dxy is None:
128
- dxy = torch.empty_like(xy)
129
- else:
130
- dxy = dxy.reshape(-1, dxy.shape[-1])
131
- assert dxy.shape == xy.shape
132
- dx, dy = dxy.chunk(2, dim=-1)
133
- assert dx.stride(-1) == 1
134
- assert dy.stride(-1) == 1
135
- if recompute_output:
136
- if out is None:
137
- out = torch.empty_like(x)
138
- else:
139
- out = out.reshape(-1, out.shape[-1])
140
- assert out.shape == x.shape
141
- assert out.stride(-1) == 1
142
- M, N = x.shape
143
- grid = lambda META: (M, triton.cdiv(N, META['BLOCK_N']))
144
- with torch.cuda.device(x.device.index):
145
- _swiglu_bwd_kernel[grid](x, y, dout, out if recompute_output else None, dx, dy,
146
- x.stride(0), y.stride(0), dout.stride(0),
147
- out.stride(0) if recompute_output else 0,
148
- dx.stride(0), dy.stride(0),
149
- N)
150
- if not recompute_output:
151
- return dxy.reshape(*batch_shape, dxy.shape[-1])
152
- else:
153
- return dxy.reshape(*batch_shape, dxy.shape[-1]), out.reshape(*batch_shape, out.shape[-1])
154
-
155
-
156
- class SwiGLU(torch.autograd.Function):
157
-
158
- @staticmethod
159
- def forward(ctx, xy):
160
- ctx.save_for_backward(xy)
161
- return _swiglu_fwd(xy)
162
-
163
- @staticmethod
164
- def backward(ctx, dout):
165
- xy, = ctx.saved_tensors
166
- return _swiglu_bwd(xy, dout)
167
-
168
-
169
- swiglu = SwiGLU.apply
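The deleted k_activations.py exposes swiglu as a fused SwiGLU over a tensor whose last dimension packs x and y side by side. A small sketch (the import path is an assumption for this build; it needs a CUDA device with Triton installed):

    import torch
    import torch.nn.functional as F
    from mamba_ssm.ops.triton.k_activations import swiglu  # assumed import path

    xy = torch.randn(4, 256, device="cuda", dtype=torch.float16, requires_grad=True)  # last dim = 2 * d
    out = swiglu(xy)            # shape (4, 128): silu(x) * y with x, y = xy.chunk(2, dim=-1)
    out.sum().backward()        # runs the fused _swiglu_bwd kernel above

    # reference check against plain PyTorch
    x, y = xy.detach().float().chunk(2, dim=-1)
    ref = F.silu(x) * y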
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/layer_norm.py DELETED
@@ -1,1113 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # Implement dropout + residual + layer_norm / rms_norm.
3
-
4
- # Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
5
- # For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
6
- # This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
7
- # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
-
9
- import math
10
- import warnings
11
-
12
- import torch
13
- import torch.nn.functional as F
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
-
20
- def layer_norm_ref(
21
- x,
22
- weight,
23
- bias,
24
- residual=None,
25
- x1=None,
26
- weight1=None,
27
- bias1=None,
28
- eps=1e-6,
29
- dropout_p=0.0,
30
- rowscale=None,
31
- prenorm=False,
32
- dropout_mask=None,
33
- dropout_mask1=None,
34
- upcast=False,
35
- ):
36
- dtype = x.dtype
37
- if upcast:
38
- x = x.float()
39
- weight = weight.float()
40
- bias = bias.float() if bias is not None else None
41
- residual = residual.float() if residual is not None else residual
42
- x1 = x1.float() if x1 is not None else None
43
- weight1 = weight1.float() if weight1 is not None else None
44
- bias1 = bias1.float() if bias1 is not None else None
45
- if x1 is not None:
46
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
47
- if rowscale is not None:
48
- x = x * rowscale[..., None]
49
- if dropout_p > 0.0:
50
- if dropout_mask is not None:
51
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
52
- else:
53
- x = F.dropout(x, p=dropout_p)
54
- if x1 is not None:
55
- if dropout_mask1 is not None:
56
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
57
- else:
58
- x1 = F.dropout(x1, p=dropout_p)
59
- if x1 is not None:
60
- x = x + x1
61
- if residual is not None:
62
- x = (x + residual).to(x.dtype)
63
- out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to(
64
- dtype
65
- )
66
- if weight1 is None:
67
- return out if not prenorm else (out, x)
68
- else:
69
- out1 = F.layer_norm(
70
- x.to(weight1.dtype), x.shape[-1:], weight=weight1, bias=bias1, eps=eps
71
- ).to(dtype)
72
- return (out, out1) if not prenorm else (out, out1, x)
73
-
74
-
75
- def rms_norm_ref(
76
- x,
77
- weight,
78
- bias,
79
- residual=None,
80
- x1=None,
81
- weight1=None,
82
- bias1=None,
83
- eps=1e-6,
84
- dropout_p=0.0,
85
- rowscale=None,
86
- prenorm=False,
87
- dropout_mask=None,
88
- dropout_mask1=None,
89
- upcast=False,
90
- ):
91
- dtype = x.dtype
92
- if upcast:
93
- x = x.float()
94
- weight = weight.float()
95
- bias = bias.float() if bias is not None else None
96
- residual = residual.float() if residual is not None else residual
97
- x1 = x1.float() if x1 is not None else None
98
- weight1 = weight1.float() if weight1 is not None else None
99
- bias1 = bias1.float() if bias1 is not None else None
100
- if x1 is not None:
101
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
102
- if rowscale is not None:
103
- x = x * rowscale[..., None]
104
- if dropout_p > 0.0:
105
- if dropout_mask is not None:
106
- x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p)
107
- else:
108
- x = F.dropout(x, p=dropout_p)
109
- if x1 is not None:
110
- if dropout_mask1 is not None:
111
- x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p)
112
- else:
113
- x1 = F.dropout(x1, p=dropout_p)
114
- if x1 is not None:
115
- x = x + x1
116
- if residual is not None:
117
- x = (x + residual).to(x.dtype)
118
- rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
119
- out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(dtype)
120
- if weight1 is None:
121
- return out if not prenorm else (out, x)
122
- else:
123
- out1 = ((x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)).to(
124
- dtype
125
- )
126
- return (out, out1) if not prenorm else (out, out1, x)
127
-
128
- def config_prune(configs):
129
-
130
- if torch.version.hip:
131
- try:
132
- # set warp size based on the GCN architecture
133
- gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
134
- if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
135
- # radeon
136
- warp_size = 32
137
- else:
138
- # instinct
139
- warp_size = 64
140
- except AttributeError as e:
141
- # fall back to crude method to set warp size
142
- device_name = torch.cuda.get_device_properties(0).name
143
- if 'instinct' in device_name.lower():
144
- warp_size = 64
145
- else:
146
- warp_size = 32
147
- warnings.warn(f"{e}, warp size set to {warp_size} based on device name: {device_name}", UserWarning)
148
-
149
- else:
150
- # cuda
151
- warp_size = 32
152
-
153
- max_block_sz = 1024
154
- max_num_warps = max_block_sz // warp_size
155
- pruned_configs = [config for config in configs if config.num_warps <= max_num_warps]
156
- return pruned_configs
157
-
158
- configs_autotune = [
159
- triton.Config({}, num_warps=1),
160
- triton.Config({}, num_warps=2),
161
- triton.Config({}, num_warps=4),
162
- triton.Config({}, num_warps=8),
163
- triton.Config({}, num_warps=16),
164
- triton.Config({}, num_warps=32),
165
- ]
166
-
167
- pruned_configs_autotune = config_prune(configs_autotune)
168
-
169
- @triton.autotune(
170
- configs = pruned_configs_autotune,
171
- key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
172
- )
173
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
174
- # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
175
- @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
176
- @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
177
- @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
178
- @triton.jit
179
- def _layer_norm_fwd_1pass_kernel(
180
- X, # pointer to the input
181
- Y, # pointer to the output
182
- W, # pointer to the weights
183
- B, # pointer to the biases
184
- RESIDUAL, # pointer to the residual
185
- X1,
186
- W1,
187
- B1,
188
- Y1,
189
- RESIDUAL_OUT, # pointer to the residual
190
- ROWSCALE,
191
- SEEDS, # Dropout seeds for each row
192
- DROPOUT_MASK,
193
- Mean, # pointer to the mean
194
- Rstd, # pointer to the 1/std
195
- stride_x_row, # how much to increase the pointer when moving by 1 row
196
- stride_y_row,
197
- stride_res_row,
198
- stride_res_out_row,
199
- stride_x1_row,
200
- stride_y1_row,
201
- M, # number of rows in X
202
- N, # number of columns in X
203
- eps, # epsilon to avoid division by zero
204
- dropout_p, # Dropout probability
205
- IS_RMS_NORM: tl.constexpr,
206
- BLOCK_N: tl.constexpr,
207
- HAS_RESIDUAL: tl.constexpr,
208
- STORE_RESIDUAL_OUT: tl.constexpr,
209
- HAS_BIAS: tl.constexpr,
210
- HAS_DROPOUT: tl.constexpr,
211
- STORE_DROPOUT_MASK: tl.constexpr,
212
- HAS_ROWSCALE: tl.constexpr,
213
- HAS_X1: tl.constexpr,
214
- HAS_W1: tl.constexpr,
215
- HAS_B1: tl.constexpr,
216
- ):
217
- # Map the program id to the row of X and Y it should compute.
218
- row = tl.program_id(0)
219
- X += row * stride_x_row
220
- Y += row * stride_y_row
221
- if HAS_RESIDUAL:
222
- RESIDUAL += row * stride_res_row
223
- if STORE_RESIDUAL_OUT:
224
- RESIDUAL_OUT += row * stride_res_out_row
225
- if HAS_X1:
226
- X1 += row * stride_x1_row
227
- if HAS_W1:
228
- Y1 += row * stride_y1_row
229
- # Compute mean and variance
230
- cols = tl.arange(0, BLOCK_N)
231
- x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
232
- if HAS_ROWSCALE:
233
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
234
- x *= rowscale
235
- if HAS_DROPOUT:
236
- # Compute dropout mask
237
- # 7 rounds is good enough, and reduces register pressure
238
- keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
239
- x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
240
- if STORE_DROPOUT_MASK:
241
- tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
242
- if HAS_X1:
243
- x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
244
- if HAS_ROWSCALE:
245
- rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
246
- x1 *= rowscale
247
- if HAS_DROPOUT:
248
- # Compute dropout mask
249
- # 7 rounds is good enough, and reduces register pressure
250
- keep_mask = (
251
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
252
- )
253
- x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
254
- if STORE_DROPOUT_MASK:
255
- tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
256
- x += x1
257
- if HAS_RESIDUAL:
258
- residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
259
- x += residual
260
- if STORE_RESIDUAL_OUT:
261
- tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
262
- if not IS_RMS_NORM:
263
- mean = tl.sum(x, axis=0) / N
264
- tl.store(Mean + row, mean)
265
- xbar = tl.where(cols < N, x - mean, 0.0)
266
- var = tl.sum(xbar * xbar, axis=0) / N
267
- else:
268
- xbar = tl.where(cols < N, x, 0.0)
269
- var = tl.sum(xbar * xbar, axis=0) / N
270
- rstd = 1 / tl.sqrt(var + eps)
271
- tl.store(Rstd + row, rstd)
272
- # Normalize and apply linear transformation
273
- mask = cols < N
274
- w = tl.load(W + cols, mask=mask).to(tl.float32)
275
- if HAS_BIAS:
276
- b = tl.load(B + cols, mask=mask).to(tl.float32)
277
- x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
278
- y = x_hat * w + b if HAS_BIAS else x_hat * w
279
- # Write output
280
- tl.store(Y + cols, y, mask=mask)
281
- if HAS_W1:
282
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
283
- if HAS_B1:
284
- b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
285
- y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
286
- tl.store(Y1 + cols, y1, mask=mask)
287
-
288
-
289
- def _layer_norm_fwd(
290
- x,
291
- weight,
292
- bias,
293
- eps,
294
- residual=None,
295
- x1=None,
296
- weight1=None,
297
- bias1=None,
298
- dropout_p=0.0,
299
- rowscale=None,
300
- out_dtype=None,
301
- residual_dtype=None,
302
- is_rms_norm=False,
303
- return_dropout_mask=False,
304
- ):
305
- if residual is not None:
306
- residual_dtype = residual.dtype
307
- M, N = x.shape
308
- assert x.stride(-1) == 1
309
- if residual is not None:
310
- assert residual.stride(-1) == 1
311
- assert residual.shape == (M, N)
312
- assert weight.shape == (N,)
313
- assert weight.stride(-1) == 1
314
- if bias is not None:
315
- assert bias.stride(-1) == 1
316
- assert bias.shape == (N,)
317
- if x1 is not None:
318
- assert x1.shape == x.shape
319
- assert rowscale is None
320
- assert x1.stride(-1) == 1
321
- if weight1 is not None:
322
- assert weight1.shape == (N,)
323
- assert weight1.stride(-1) == 1
324
- if bias1 is not None:
325
- assert bias1.shape == (N,)
326
- assert bias1.stride(-1) == 1
327
- if rowscale is not None:
328
- assert rowscale.is_contiguous()
329
- assert rowscale.shape == (M,)
330
- # allocate output
331
- y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
332
- assert y.stride(-1) == 1
333
- if weight1 is not None:
334
- y1 = torch.empty_like(y)
335
- assert y1.stride(-1) == 1
336
- else:
337
- y1 = None
338
- if (
339
- residual is not None
340
- or (residual_dtype is not None and residual_dtype != x.dtype)
341
- or dropout_p > 0.0
342
- or rowscale is not None
343
- or x1 is not None
344
- ):
345
- residual_out = torch.empty(
346
- M, N, device=x.device, dtype=residual_dtype if residual_dtype is not None else x.dtype
347
- )
348
- assert residual_out.stride(-1) == 1
349
- else:
350
- residual_out = None
351
- mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None
352
- rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
353
- if dropout_p > 0.0:
354
- seeds = torch.randint(
355
- 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64
356
- )
357
- else:
358
- seeds = None
359
- if return_dropout_mask and dropout_p > 0.0:
360
- dropout_mask = torch.empty(M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool)
361
- else:
362
- dropout_mask = None
363
- # Less than 64KB per feature: enqueue fused kernel
364
- MAX_FUSED_SIZE = 65536 // x.element_size()
365
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
366
- if N > BLOCK_N:
367
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
368
- with torch.cuda.device(x.device.index):
369
- _layer_norm_fwd_1pass_kernel[(M,)](
370
- x,
371
- y,
372
- weight,
373
- bias,
374
- residual,
375
- x1,
376
- weight1,
377
- bias1,
378
- y1,
379
- residual_out,
380
- rowscale,
381
- seeds,
382
- dropout_mask,
383
- mean,
384
- rstd,
385
- x.stride(0),
386
- y.stride(0),
387
- residual.stride(0) if residual is not None else 0,
388
- residual_out.stride(0) if residual_out is not None else 0,
389
- x1.stride(0) if x1 is not None else 0,
390
- y1.stride(0) if y1 is not None else 0,
391
- M,
392
- N,
393
- eps,
394
- dropout_p,
395
- is_rms_norm,
396
- BLOCK_N,
397
- residual is not None,
398
- residual_out is not None,
399
- bias is not None,
400
- dropout_p > 0.0,
401
- dropout_mask is not None,
402
- rowscale is not None,
403
- )
404
- # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
405
- if dropout_mask is not None and x1 is not None:
406
- dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
407
- else:
408
- dropout_mask1 = None
409
- return (
410
- y,
411
- y1,
412
- mean,
413
- rstd,
414
- residual_out if residual_out is not None else x,
415
- seeds,
416
- dropout_mask,
417
- dropout_mask1,
418
- )
419
-
420
-
421
- @triton.autotune(
422
- configs=pruned_configs_autotune,
423
- key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"],
424
- )
425
- # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
426
- # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
427
- # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
428
- @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
429
- @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
430
- @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
431
- @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
432
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
433
- @triton.jit
434
- def _layer_norm_bwd_kernel(
435
- X, # pointer to the input
436
- W, # pointer to the weights
437
- B, # pointer to the biases
438
- Y, # pointer to the output to be recomputed
439
- DY, # pointer to the output gradient
440
- DX, # pointer to the input gradient
441
- DW, # pointer to the partial sum of weights gradient
442
- DB, # pointer to the partial sum of biases gradient
443
- DRESIDUAL,
444
- W1,
445
- DY1,
446
- DX1,
447
- DW1,
448
- DB1,
449
- DRESIDUAL_IN,
450
- ROWSCALE,
451
- SEEDS,
452
- Mean, # pointer to the mean
453
- Rstd, # pointer to the 1/std
454
- stride_x_row, # how much to increase the pointer when moving by 1 row
455
- stride_y_row,
456
- stride_dy_row,
457
- stride_dx_row,
458
- stride_dres_row,
459
- stride_dy1_row,
460
- stride_dx1_row,
461
- stride_dres_in_row,
462
- M, # number of rows in X
463
- N, # number of columns in X
464
- eps, # epsilon to avoid division by zero
465
- dropout_p,
466
- rows_per_program,
467
- IS_RMS_NORM: tl.constexpr,
468
- BLOCK_N: tl.constexpr,
469
- HAS_DRESIDUAL: tl.constexpr,
470
- STORE_DRESIDUAL: tl.constexpr,
471
- HAS_BIAS: tl.constexpr,
472
- HAS_DROPOUT: tl.constexpr,
473
- HAS_ROWSCALE: tl.constexpr,
474
- HAS_DY1: tl.constexpr,
475
- HAS_DX1: tl.constexpr,
476
- HAS_B1: tl.constexpr,
477
- RECOMPUTE_OUTPUT: tl.constexpr,
478
- ):
479
- # Map the program id to the elements of X, DX, and DY it should compute.
480
- row_block_id = tl.program_id(0)
481
- row_start = row_block_id * rows_per_program
482
- # Do not early exit if row_start >= M, because we need to write DW and DB
483
- cols = tl.arange(0, BLOCK_N)
484
- mask = cols < N
485
- X += row_start * stride_x_row
486
- if HAS_DRESIDUAL:
487
- DRESIDUAL += row_start * stride_dres_row
488
- if STORE_DRESIDUAL:
489
- DRESIDUAL_IN += row_start * stride_dres_in_row
490
- DY += row_start * stride_dy_row
491
- DX += row_start * stride_dx_row
492
- if HAS_DY1:
493
- DY1 += row_start * stride_dy1_row
494
- if HAS_DX1:
495
- DX1 += row_start * stride_dx1_row
496
- if RECOMPUTE_OUTPUT:
497
- Y += row_start * stride_y_row
498
- w = tl.load(W + cols, mask=mask).to(tl.float32)
499
- if RECOMPUTE_OUTPUT and HAS_BIAS:
500
- b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
501
- if HAS_DY1:
502
- w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
503
- dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
504
- if HAS_BIAS:
505
- db = tl.zeros((BLOCK_N,), dtype=tl.float32)
506
- if HAS_DY1:
507
- dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
508
- if HAS_B1:
509
- db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
510
- row_end = min((row_block_id + 1) * rows_per_program, M)
511
- for row in range(row_start, row_end):
512
- # Load data to SRAM
513
- x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
514
- dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
515
- if HAS_DY1:
516
- dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
517
- if not IS_RMS_NORM:
518
- mean = tl.load(Mean + row)
519
- rstd = tl.load(Rstd + row)
520
- # Compute dx
521
- xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
522
- xhat = tl.where(mask, xhat, 0.0)
523
- if RECOMPUTE_OUTPUT:
524
- y = xhat * w + b if HAS_BIAS else xhat * w
525
- tl.store(Y + cols, y, mask=mask)
526
- wdy = w * dy
527
- dw += dy * xhat
528
- if HAS_BIAS:
529
- db += dy
530
- if HAS_DY1:
531
- wdy += w1 * dy1
532
- dw1 += dy1 * xhat
533
- if HAS_B1:
534
- db1 += dy1
535
- if not IS_RMS_NORM:
536
- c1 = tl.sum(xhat * wdy, axis=0) / N
537
- c2 = tl.sum(wdy, axis=0) / N
538
- dx = (wdy - (xhat * c1 + c2)) * rstd
539
- else:
540
- c1 = tl.sum(xhat * wdy, axis=0) / N
541
- dx = (wdy - xhat * c1) * rstd
542
- if HAS_DRESIDUAL:
543
- dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
544
- dx += dres
545
- # Write dx
546
- if STORE_DRESIDUAL:
547
- tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
548
- if HAS_DX1:
549
- if HAS_DROPOUT:
550
- keep_mask = (
551
- tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
552
- )
553
- dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
554
- else:
555
- dx1 = dx
556
- tl.store(DX1 + cols, dx1, mask=mask)
557
- if HAS_DROPOUT:
558
- keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
559
- dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
560
- if HAS_ROWSCALE:
561
- rowscale = tl.load(ROWSCALE + row).to(tl.float32)
562
- dx *= rowscale
563
- tl.store(DX + cols, dx, mask=mask)
564
-
565
- X += stride_x_row
566
- if HAS_DRESIDUAL:
567
- DRESIDUAL += stride_dres_row
568
- if STORE_DRESIDUAL:
569
- DRESIDUAL_IN += stride_dres_in_row
570
- if RECOMPUTE_OUTPUT:
571
- Y += stride_y_row
572
- DY += stride_dy_row
573
- DX += stride_dx_row
574
- if HAS_DY1:
575
- DY1 += stride_dy1_row
576
- if HAS_DX1:
577
- DX1 += stride_dx1_row
578
- tl.store(DW + row_block_id * N + cols, dw, mask=mask)
579
- if HAS_BIAS:
580
- tl.store(DB + row_block_id * N + cols, db, mask=mask)
581
- if HAS_DY1:
582
- tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
583
- if HAS_B1:
584
- tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)
585
-
586
-
587
- def _layer_norm_bwd(
588
- dy,
589
- x,
590
- weight,
591
- bias,
592
- eps,
593
- mean,
594
- rstd,
595
- dresidual=None,
596
- dy1=None,
597
- weight1=None,
598
- bias1=None,
599
- seeds=None,
600
- dropout_p=0.0,
601
- rowscale=None,
602
- has_residual=False,
603
- has_x1=False,
604
- is_rms_norm=False,
605
- x_dtype=None,
606
- recompute_output=False,
607
- ):
608
- M, N = x.shape
609
- assert x.stride(-1) == 1
610
- assert dy.stride(-1) == 1
611
- assert dy.shape == (M, N)
612
- if dresidual is not None:
613
- assert dresidual.stride(-1) == 1
614
- assert dresidual.shape == (M, N)
615
- assert weight.shape == (N,)
616
- assert weight.stride(-1) == 1
617
- if bias is not None:
618
- assert bias.stride(-1) == 1
619
- assert bias.shape == (N,)
620
- if dy1 is not None:
621
- assert weight1 is not None
622
- assert dy1.shape == dy.shape
623
- assert dy1.stride(-1) == 1
624
- if weight1 is not None:
625
- assert weight1.shape == (N,)
626
- assert weight1.stride(-1) == 1
627
- if bias1 is not None:
628
- assert bias1.shape == (N,)
629
- assert bias1.stride(-1) == 1
630
- if seeds is not None:
631
- assert seeds.is_contiguous()
632
- assert seeds.shape == (M if not has_x1 else M * 2,)
633
- if rowscale is not None:
634
- assert rowscale.is_contiguous()
635
- assert rowscale.shape == (M,)
636
- # allocate output
637
- dx = (
638
- torch.empty_like(x)
639
- if x_dtype is None
640
- else torch.empty(M, N, dtype=x_dtype, device=x.device)
641
- )
642
- dresidual_in = (
643
- torch.empty_like(x)
644
- if has_residual
645
- and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
646
- else None
647
- )
648
- dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
649
- y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None
650
- if recompute_output:
651
- assert weight1 is None, "recompute_output is not supported with parallel LayerNorm"
652
-
653
- # Less than 64KB per feature: enqueue fused kernel
654
- MAX_FUSED_SIZE = 65536 // x.element_size()
655
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
656
- if N > BLOCK_N:
657
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
658
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
659
- _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
660
- _db = (
661
- torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
662
- if bias is not None
663
- else None
664
- )
665
- _dw1 = torch.empty_like(_dw) if weight1 is not None else None
666
- _db1 = torch.empty_like(_db) if bias1 is not None else None
667
- rows_per_program = math.ceil(M / sm_count)
668
- grid = (sm_count,)
669
- with torch.cuda.device(x.device.index):
670
- _layer_norm_bwd_kernel[grid](
671
- x,
672
- weight,
673
- bias,
674
- y,
675
- dy,
676
- dx,
677
- _dw,
678
- _db,
679
- dresidual,
680
- weight1,
681
- dy1,
682
- dx1,
683
- _dw1,
684
- _db1,
685
- dresidual_in,
686
- rowscale,
687
- seeds,
688
- mean,
689
- rstd,
690
- x.stride(0),
691
- 0 if not recompute_output else y.stride(0),
692
- dy.stride(0),
693
- dx.stride(0),
694
- dresidual.stride(0) if dresidual is not None else 0,
695
- dy1.stride(0) if dy1 is not None else 0,
696
- dx1.stride(0) if dx1 is not None else 0,
697
- dresidual_in.stride(0) if dresidual_in is not None else 0,
698
- M,
699
- N,
700
- eps,
701
- dropout_p,
702
- rows_per_program,
703
- is_rms_norm,
704
- BLOCK_N,
705
- dresidual is not None,
706
- dresidual_in is not None,
707
- bias is not None,
708
- dropout_p > 0.0,
709
- )
710
- dw = _dw.sum(0).to(weight.dtype)
711
- db = _db.sum(0).to(bias.dtype) if bias is not None else None
712
- dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
713
- db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
714
- # Don't need to compute dresidual_in separately in this case
715
- if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
716
- dresidual_in = dx
717
- if has_x1 and dropout_p == 0.0:
718
- dx1 = dx
719
- return (
720
- (dx, dw, db, dresidual_in, dx1, dw1, db1)
721
- if not recompute_output
722
- else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
723
- )
724
-
725
-
726
- class LayerNormFn(torch.autograd.Function):
727
- @staticmethod
728
- def forward(
729
- ctx,
730
- x,
731
- weight,
732
- bias,
733
- residual=None,
734
- x1=None,
735
- weight1=None,
736
- bias1=None,
737
- eps=1e-6,
738
- dropout_p=0.0,
739
- rowscale=None,
740
- prenorm=False,
741
- residual_in_fp32=False,
742
- is_rms_norm=False,
743
- return_dropout_mask=False,
744
- ):
745
- x_shape_og = x.shape
746
- # reshape input data into 2D tensor
747
- x = x.reshape(-1, x.shape[-1])
748
- if x.stride(-1) != 1:
749
- x = x.contiguous()
750
- if residual is not None:
751
- assert residual.shape == x_shape_og
752
- residual = residual.reshape(-1, residual.shape[-1])
753
- if residual.stride(-1) != 1:
754
- residual = residual.contiguous()
755
- if x1 is not None:
756
- assert x1.shape == x_shape_og
757
- assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
758
- x1 = x1.reshape(-1, x1.shape[-1])
759
- if x1.stride(-1) != 1:
760
- x1 = x1.contiguous()
761
- weight = weight.contiguous()
762
- if bias is not None:
763
- bias = bias.contiguous()
764
- if weight1 is not None:
765
- weight1 = weight1.contiguous()
766
- if bias1 is not None:
767
- bias1 = bias1.contiguous()
768
- if rowscale is not None:
769
- rowscale = rowscale.reshape(-1).contiguous()
770
- residual_dtype = (
771
- residual.dtype
772
- if residual is not None
773
- else (torch.float32 if residual_in_fp32 else None)
774
- )
775
- y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
776
- x,
777
- weight,
778
- bias,
779
- eps,
780
- residual,
781
- x1,
782
- weight1,
783
- bias1,
784
- dropout_p=dropout_p,
785
- rowscale=rowscale,
786
- residual_dtype=residual_dtype,
787
- is_rms_norm=is_rms_norm,
788
- return_dropout_mask=return_dropout_mask,
789
- )
790
- ctx.save_for_backward(
791
- residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
792
- )
793
- ctx.x_shape_og = x_shape_og
794
- ctx.eps = eps
795
- ctx.dropout_p = dropout_p
796
- ctx.is_rms_norm = is_rms_norm
797
- ctx.has_residual = residual is not None
798
- ctx.has_x1 = x1 is not None
799
- ctx.prenorm = prenorm
800
- ctx.x_dtype = x.dtype
801
- y = y.reshape(x_shape_og)
802
- y1 = y1.reshape(x_shape_og) if y1 is not None else None
803
- residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None
804
- dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
805
- dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
806
- if not return_dropout_mask:
807
- if weight1 is None:
808
- return y if not prenorm else (y, residual_out)
809
- else:
810
- return (y, y1) if not prenorm else (y, y1, residual_out)
811
- else:
812
- if weight1 is None:
813
- return (
814
- (y, dropout_mask, dropout_mask1)
815
- if not prenorm
816
- else (y, residual_out, dropout_mask, dropout_mask1)
817
- )
818
- else:
819
- return (
820
- (y, y1, dropout_mask, dropout_mask1)
821
- if not prenorm
822
- else (y, y1, residual_out, dropout_mask, dropout_mask1)
823
- )
824
-
825
- @staticmethod
826
- def backward(ctx, dy, *args):
827
- x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
828
- dy = dy.reshape(-1, dy.shape[-1])
829
- if dy.stride(-1) != 1:
830
- dy = dy.contiguous()
831
- assert dy.shape == x.shape
832
- if weight1 is not None:
833
- dy1, args = args[0], args[1:]
834
- dy1 = dy1.reshape(-1, dy1.shape[-1])
835
- if dy1.stride(-1) != 1:
836
- dy1 = dy1.contiguous()
837
- assert dy1.shape == x.shape
838
- else:
839
- dy1 = None
840
- if ctx.prenorm:
841
- dresidual = args[0]
842
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
843
- if dresidual.stride(-1) != 1:
844
- dresidual = dresidual.contiguous()
845
- assert dresidual.shape == x.shape
846
- else:
847
- dresidual = None
848
- dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
849
- dy,
850
- x,
851
- weight,
852
- bias,
853
- ctx.eps,
854
- mean,
855
- rstd,
856
- dresidual,
857
- dy1,
858
- weight1,
859
- bias1,
860
- seeds,
861
- ctx.dropout_p,
862
- rowscale,
863
- ctx.has_residual,
864
- ctx.has_x1,
865
- ctx.is_rms_norm,
866
- x_dtype=ctx.x_dtype,
867
- )
868
- return (
869
- dx.reshape(ctx.x_shape_og),
870
- dw,
871
- db,
872
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
873
- dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
874
- dw1,
875
- db1,
876
- None,
877
- None,
878
- None,
879
- None,
880
- None,
881
- None,
882
- None,
883
- )
884
-
885
-
886
- def layer_norm_fn(
887
- x,
888
- weight,
889
- bias,
890
- residual=None,
891
- x1=None,
892
- weight1=None,
893
- bias1=None,
894
- eps=1e-6,
895
- dropout_p=0.0,
896
- rowscale=None,
897
- prenorm=False,
898
- residual_in_fp32=False,
899
- is_rms_norm=False,
900
- return_dropout_mask=False,
901
- ):
902
- return LayerNormFn.apply(
903
- x,
904
- weight,
905
- bias,
906
- residual,
907
- x1,
908
- weight1,
909
- bias1,
910
- eps,
911
- dropout_p,
912
- rowscale,
913
- prenorm,
914
- residual_in_fp32,
915
- is_rms_norm,
916
- return_dropout_mask,
917
- )
918
-
919
-
920
- def rms_norm_fn(
921
- x,
922
- weight,
923
- bias,
924
- residual=None,
925
- x1=None,
926
- weight1=None,
927
- bias1=None,
928
- eps=1e-6,
929
- dropout_p=0.0,
930
- rowscale=None,
931
- prenorm=False,
932
- residual_in_fp32=False,
933
- return_dropout_mask=False,
934
- ):
935
- return LayerNormFn.apply(
936
- x,
937
- weight,
938
- bias,
939
- residual,
940
- x1,
941
- weight1,
942
- bias1,
943
- eps,
944
- dropout_p,
945
- rowscale,
946
- prenorm,
947
- residual_in_fp32,
948
- True,
949
- return_dropout_mask,
950
- )
951
-
952
-
953
- class RMSNorm(torch.nn.Module):
954
-
955
- def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
956
- factory_kwargs = {"device": device, "dtype": dtype}
957
- super().__init__()
958
- self.eps = eps
959
- if dropout_p > 0.0:
960
- self.drop = torch.nn.Dropout(dropout_p)
961
- else:
962
- self.drop = None
963
- self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
964
- self.register_parameter("bias", None)
965
- self.reset_parameters()
966
-
967
- def reset_parameters(self):
968
- torch.nn.init.ones_(self.weight)
969
-
970
- def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
971
- return rms_norm_fn(
972
- x,
973
- self.weight,
974
- self.bias,
975
- residual=residual,
976
- eps=self.eps,
977
- dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
978
- prenorm=prenorm,
979
- residual_in_fp32=residual_in_fp32,
980
- )
981
-
982
-
983
- class LayerNormLinearFn(torch.autograd.Function):
984
- @staticmethod
985
- @custom_fwd
986
- def forward(
987
- ctx,
988
- x,
989
- norm_weight,
990
- norm_bias,
991
- linear_weight,
992
- linear_bias,
993
- residual=None,
994
- eps=1e-6,
995
- prenorm=False,
996
- residual_in_fp32=False,
997
- is_rms_norm=False,
998
- ):
999
- x_shape_og = x.shape
1000
- # reshape input data into 2D tensor
1001
- x = x.reshape(-1, x.shape[-1])
1002
- if x.stride(-1) != 1:
1003
- x = x.contiguous()
1004
- if residual is not None:
1005
- assert residual.shape == x_shape_og
1006
- residual = residual.reshape(-1, residual.shape[-1])
1007
- if residual.stride(-1) != 1:
1008
- residual = residual.contiguous()
1009
- norm_weight = norm_weight.contiguous()
1010
- if norm_bias is not None:
1011
- norm_bias = norm_bias.contiguous()
1012
- residual_dtype = (
1013
- residual.dtype
1014
- if residual is not None
1015
- else (torch.float32 if residual_in_fp32 else None)
1016
- )
1017
- y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd(
1018
- x,
1019
- norm_weight,
1020
- norm_bias,
1021
- eps,
1022
- residual,
1023
- out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(),
1024
- residual_dtype=residual_dtype,
1025
- is_rms_norm=is_rms_norm,
1026
- )
1027
- y = y.reshape(x_shape_og)
1028
- dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
1029
- linear_weight = linear_weight.to(dtype)
1030
- linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
1031
- out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
1032
- # We don't store y, will be recomputed in the backward pass to save memory
1033
- ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd)
1034
- ctx.x_shape_og = x_shape_og
1035
- ctx.eps = eps
1036
- ctx.is_rms_norm = is_rms_norm
1037
- ctx.has_residual = residual is not None
1038
- ctx.prenorm = prenorm
1039
- ctx.x_dtype = x.dtype
1040
- ctx.linear_bias_is_none = linear_bias is None
1041
- return out if not prenorm else (out, residual_out.reshape(x_shape_og))
1042
-
1043
- @staticmethod
1044
- @custom_bwd
1045
- def backward(ctx, dout, *args):
1046
- x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
1047
- dout = dout.reshape(-1, dout.shape[-1])
1048
- dy = F.linear(dout, linear_weight.t())
1049
- dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
1050
- if dy.stride(-1) != 1:
1051
- dy = dy.contiguous()
1052
- assert dy.shape == x.shape
1053
- if ctx.prenorm:
1054
- dresidual = args[0]
1055
- dresidual = dresidual.reshape(-1, dresidual.shape[-1])
1056
- if dresidual.stride(-1) != 1:
1057
- dresidual = dresidual.contiguous()
1058
- assert dresidual.shape == x.shape
1059
- else:
1060
- dresidual = None
1061
- dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = _layer_norm_bwd(
1062
- dy,
1063
- x,
1064
- norm_weight,
1065
- norm_bias,
1066
- ctx.eps,
1067
- mean,
1068
- rstd,
1069
- dresidual=dresidual,
1070
- has_residual=ctx.has_residual,
1071
- is_rms_norm=ctx.is_rms_norm,
1072
- x_dtype=ctx.x_dtype,
1073
- recompute_output=True,
1074
- )
1075
- dlinear_weight = torch.einsum("bo,bi->oi", dout, y)
1076
- return (
1077
- dx.reshape(ctx.x_shape_og),
1078
- dnorm_weight,
1079
- dnorm_bias,
1080
- dlinear_weight,
1081
- dlinear_bias,
1082
- dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
1083
- None,
1084
- None,
1085
- None,
1086
- None,
1087
- )
1088
-
1089
-
1090
- def layer_norm_linear_fn(
1091
- x,
1092
- norm_weight,
1093
- norm_bias,
1094
- linear_weight,
1095
- linear_bias,
1096
- residual=None,
1097
- eps=1e-6,
1098
- prenorm=False,
1099
- residual_in_fp32=False,
1100
- is_rms_norm=False,
1101
- ):
1102
- return LayerNormLinearFn.apply(
1103
- x,
1104
- norm_weight,
1105
- norm_bias,
1106
- linear_weight,
1107
- linear_bias,
1108
- residual,
1109
- eps,
1110
- prenorm,
1111
- residual_in_fp32,
1112
- is_rms_norm,
1113
- )
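The deleted layer_norm.py bundles dropout, residual add, and LayerNorm/RMSNorm into one Triton kernel. A minimal sketch of the RMSNorm module and the functional entry point (the import path is an assumption for this build; it needs a CUDA device with Triton installed):

    import torch
    from mamba_ssm.ops.triton.layer_norm import RMSNorm, rms_norm_fn  # assumed import path

    hidden = 1024
    norm = RMSNorm(hidden, eps=1e-5).to("cuda", dtype=torch.float16)
    x = torch.randn(2, 128, hidden, device="cuda", dtype=torch.float16)
    residual = torch.randn_like(x)

    # fused residual add + RMSNorm; prenorm=True also returns the updated residual stream
    y, new_residual = norm(x, residual=residual, prenorm=True, residual_in_fp32=True)

    # functional form without a residual; matches rms_norm_ref up to numerics
    y2 = rms_norm_fn(x, norm.weight, norm.bias, eps=1e-5)

layer_norm_linear_fn additionally fuses the following linear projection and recomputes the normalized activations in the backward pass instead of storing them.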
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/layernorm_gated.py DELETED
@@ -1,437 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
3
- # For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
4
- # This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
5
- # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
6
-
7
- import math
8
-
9
- import torch
10
- import torch.nn.functional as F
11
-
12
- import triton
13
- import triton.language as tl
14
-
15
- from einops import rearrange
16
-
17
-
18
- def rms_norm_ref(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True):
19
- dtype = x.dtype
20
- N = x.shape[-1]
21
- weight = weight.float()
22
- bias = bias.float() if bias is not None else None
23
- if upcast:
24
- x = x.float()
25
- z = z.float() if z is not None else z
26
- if z is not None and not norm_before_gate:
27
- x = x * F.silu(z)
28
- if group_size is None:
29
- rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
30
- out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight)
31
- else:
32
- x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
33
- rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps)
34
- out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight
35
- if bias is not None:
36
- out = out + bias
37
- if z is not None and norm_before_gate:
38
- out *= F.silu(z)
39
- return out.to(dtype)
40
-
41
-
42
- @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
43
- @triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
44
- @triton.jit
45
- def _layer_norm_fwd_1pass_kernel(
46
- X, # pointer to the input
47
- Y, # pointer to the output
48
- W, # pointer to the weights
49
- B, # pointer to the biases
50
- Z, # pointer to the other branch
51
- Mean, # pointer to the mean
52
- Rstd, # pointer to the 1/std
53
- stride_x_row, # how much to increase the pointer when moving by 1 row
54
- stride_y_row,
55
- stride_z_row,
56
- M, # number of rows in X
57
- N, # number of columns in X
58
- eps, # epsilon to avoid division by zero
59
- BLOCK_N: tl.constexpr,
60
- HAS_BIAS: tl.constexpr,
61
- HAS_Z: tl.constexpr,
62
- NORM_BEFORE_GATE: tl.constexpr,
63
- IS_RMS_NORM: tl.constexpr,
64
- ):
65
- # Map the program id to the row of X and Y it should compute.
66
- row = tl.program_id(0)
67
- group = tl.program_id(1)
68
- X += row * stride_x_row + group * N
69
- Y += row * stride_y_row + group * N
70
- if HAS_Z:
71
- Z += row * stride_z_row + group * N
72
- if not IS_RMS_NORM:
73
- Mean += group * M
74
- Rstd += group * M
75
- W += group * N
76
- if HAS_BIAS:
77
- B += group * N
78
- # Compute mean and variance
79
- cols = tl.arange(0, BLOCK_N)
80
- x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
81
- if HAS_Z and not NORM_BEFORE_GATE:
82
- z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
83
- x *= z * tl.sigmoid(z)
84
- if not IS_RMS_NORM:
85
- mean = tl.sum(x, axis=0) / N
86
- tl.store(Mean + row, mean)
87
- xbar = tl.where(cols < N, x - mean, 0.)
88
- var = tl.sum(xbar * xbar, axis=0) / N
89
- else:
90
- xbar = tl.where(cols < N, x, 0.)
91
- var = tl.sum(xbar * xbar, axis=0) / N
92
- rstd = 1 / tl.sqrt(var + eps)
93
- tl.store(Rstd + row, rstd)
94
- # Normalize and apply linear transformation
95
- mask = cols < N
96
- w = tl.load(W + cols, mask=mask).to(tl.float32)
97
- if HAS_BIAS:
98
- b = tl.load(B + cols, mask=mask).to(tl.float32)
99
- x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
100
- y = x_hat * w + b if HAS_BIAS else x_hat * w
101
- if HAS_Z and NORM_BEFORE_GATE:
102
- z = tl.load(Z + cols, mask=mask).to(tl.float32)
103
- y *= z * tl.sigmoid(z)
104
- # Write output
105
- tl.store(Y + cols, y, mask=mask)
106
-
107
-
108
- def _layer_norm_fwd(x, weight, bias, eps, z=None, out=None, group_size=None, norm_before_gate=True, is_rms_norm=False):
109
- M, N = x.shape
110
- if group_size is None:
111
- group_size = N
112
- assert N % group_size == 0
113
- ngroups = N // group_size
114
- assert x.stride(-1) == 1
115
- if z is not None:
116
- assert z.stride(-1) == 1
117
- assert z.shape == (M, N)
118
- assert weight.shape == (N,)
119
- assert weight.stride(-1) == 1
120
- if bias is not None:
121
- assert bias.stride(-1) == 1
122
- assert bias.shape == (N,)
123
- # allocate output
124
- if out is not None:
125
- assert out.shape == x.shape
126
- else:
127
- out = torch.empty_like(x)
128
- assert out.stride(-1) == 1
129
- mean = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device) if not is_rms_norm else None
130
- rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
131
- # Less than 64KB per feature: enqueue fused kernel
132
- MAX_FUSED_SIZE = 65536 // x.element_size()
133
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
134
- if group_size > BLOCK_N:
135
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
136
- # heuristics for number of warps
137
- num_warps = min(max(BLOCK_N // 256, 1), 8)
138
- grid = (M, ngroups)
139
- with torch.cuda.device(x.device.index):
140
- _layer_norm_fwd_1pass_kernel[grid](x, out, weight, bias, z, mean, rstd,
141
- x.stride(0), out.stride(0), z.stride(0) if z is not None else 0,
142
- M, group_size, eps,
143
- BLOCK_N=BLOCK_N,
144
- NORM_BEFORE_GATE=norm_before_gate,
145
- IS_RMS_NORM=is_rms_norm,
146
- num_warps=num_warps)
147
- return out, mean, rstd
148
-
149
-
150
-
151
- @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
152
- @triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
153
- @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
154
- @triton.jit
155
- def _layer_norm_bwd_kernel(
156
- X, # pointer to the input
157
- W, # pointer to the weights
158
- B, # pointer to the biases
159
- Z, # pointer to the other branch
160
- Y, # pointer to the output to be recomputed
161
- DY, # pointer to the output gradient
162
- DX, # pointer to the input gradient
163
- DW, # pointer to the partial sum of weights gradient
164
- DB, # pointer to the partial sum of biases gradient
165
- DZ, # pointer to the other branch
166
- Mean, # pointer to the mean
167
- Rstd, # pointer to the 1/std
168
- stride_x_row, # how much to increase the pointer when moving by 1 row
169
- stride_z_row,
170
- stride_y_row,
171
- stride_dy_row,
172
- stride_dx_row,
173
- stride_dz_row,
174
- stride_dw_row,
175
- stride_db_row,
176
- M, # number of rows in X
177
- N, # number of columns in X
178
- eps, # epsilon to avoid division by zero
179
- rows_per_program,
180
- NORM_BEFORE_GATE: tl.constexpr,
181
- IS_RMS_NORM: tl.constexpr,
182
- HAS_BIAS: tl.constexpr,
183
- HAS_Z: tl.constexpr,
184
- RECOMPUTE_OUTPUT: tl.constexpr,
185
- BLOCK_N: tl.constexpr,
186
- ):
187
- # Map the program id to the elements of X, DX, and DY it should compute.
188
- row_block_id = tl.program_id(0)
189
- group = tl.program_id(1)
190
- row_start = row_block_id * rows_per_program
191
- cols = tl.arange(0, BLOCK_N)
192
- mask = cols < N
193
- X += row_start * stride_x_row + group * N
194
- if HAS_Z:
195
- Z += row_start * stride_z_row + group * N
196
- DZ += row_start * stride_dz_row + group * N
197
- DY += row_start * stride_dy_row + group * N
198
- DX += row_start * stride_dx_row + group * N
199
- if RECOMPUTE_OUTPUT:
200
- Y += row_start * stride_y_row + group * N
201
- if not IS_RMS_NORM:
202
- Mean += group * M
203
- Rstd += group * M
204
- W += group * N
205
- w = tl.load(W + cols, mask=mask).to(tl.float32)
206
- if (RECOMPUTE_OUTPUT or HAS_Z) and HAS_BIAS:
207
- B += group * N
208
- b = tl.load(B + cols, mask=mask, other=0.).to(tl.float32)
209
- dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
210
- if HAS_BIAS:
211
- db = tl.zeros((BLOCK_N,), dtype=tl.float32)
212
- row_end = min((row_block_id + 1) * rows_per_program, M)
213
- for row in range(row_start, row_end):
214
- # Load data to SRAM
215
- x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
216
- dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
217
- if not IS_RMS_NORM:
218
- mean = tl.load(Mean + row)
219
- if HAS_Z and not NORM_BEFORE_GATE:
220
- z = tl.load(Z + cols, mask=mask, other=0.).to(tl.float32)
221
- x_og = x
222
- x = x_og * z * tl.sigmoid(z)
223
- rstd = tl.load(Rstd + row)
224
- # Compute dx
225
- xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
226
- xhat = tl.where(mask, xhat, 0.)
227
- if HAS_Z and NORM_BEFORE_GATE:
228
- z = tl.load(Z + cols, mask=mask, other=0.).to(tl.float32)
229
- z_sigmoid = tl.sigmoid(z)
230
- y = xhat * w + b if HAS_BIAS else xhat * w
231
- if RECOMPUTE_OUTPUT:
232
- tl.store(Y + cols, y * z * z_sigmoid, mask=mask)
233
- dz = dy * y * z_sigmoid * (1 + z * (1 - z_sigmoid))
234
- tl.store(DZ + cols, dz, mask=mask)
235
- dy *= z * z_sigmoid
236
- else:
237
- if RECOMPUTE_OUTPUT:
238
- y = xhat * w + b if HAS_BIAS else xhat * w
239
- tl.store(Y + cols, y, mask=mask)
240
- wdy = w * dy
241
- c1 = tl.sum(xhat * wdy, axis=0) / N
242
- if not IS_RMS_NORM:
243
- c2 = tl.sum(wdy, axis=0) / N
244
- dx = (wdy - (xhat * c1 + c2)) * rstd
245
- else:
246
- dx = (wdy - xhat * c1) * rstd
247
- dw += dy * xhat
248
- if HAS_BIAS:
249
- db += dy
250
- if HAS_Z and not NORM_BEFORE_GATE:
251
- z_sigmoid = tl.sigmoid(z)
252
- dz = dx * x_og * z_sigmoid * (1 + z * (1 - z_sigmoid))
253
- tl.store(DZ + cols, dz, mask=mask)
254
- dx *= z * z_sigmoid
255
- # Write dx
256
- tl.store(DX + cols, dx, mask=mask)
257
-
258
- X += stride_x_row
259
- if HAS_Z:
260
- Z += stride_z_row
261
- DZ += stride_dz_row
262
- if RECOMPUTE_OUTPUT:
263
- Y += stride_y_row
264
- DY += stride_dy_row
265
- DX += stride_dx_row
266
- tl.store(DW + row_block_id * stride_dw_row + group * N + cols, dw, mask=mask)
267
- if HAS_BIAS:
268
- tl.store(DB + row_block_id * stride_db_row + group * N + cols, db, mask=mask)
269
-
270
-
271
- def _layer_norm_bwd(dy, x, weight, bias, eps, mean, rstd, z=None, group_size=None,
272
- norm_before_gate=True, is_rms_norm=False, recompute_output=False, dz=None, out=None):
273
- M, N = x.shape
274
- if group_size is None:
275
- group_size = N
276
- assert N % group_size == 0
277
- ngroups = N // group_size
278
- assert x.stride(-1) == 1
279
- assert dy.stride(-1) == 1
280
- assert dy.shape == (M, N)
281
- if z is not None:
282
- assert z.stride(-1) == 1
283
- assert z.shape == (M, N)
284
- assert weight.shape == (N,)
285
- assert weight.stride(-1) == 1
286
- if bias is not None:
287
- assert bias.stride(-1) == 1
288
- assert bias.shape == (N,)
289
- # allocate output
290
- dx = torch.empty_like(x)
291
- if dz is not None:
292
- assert z is not None
293
- assert dz.shape == z.shape
294
- assert dz.stride(-1) == 1
295
- else:
296
- dz = torch.empty_like(z) if z is not None else None
297
- if recompute_output:
298
- if out is None:
299
- out = torch.empty_like(x)
300
- assert out.shape == x.shape
301
-
302
- # Less than 64KB per feature: enqueue fused kernel
303
- MAX_FUSED_SIZE = 65536 // x.element_size()
304
- BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
305
- if group_size > BLOCK_N:
306
- raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
307
- # heuristics for number of warps
308
- num_warps = min(max(BLOCK_N // 256, 1), 8)
309
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
310
- # If group size is small (e.g., 64), we're only using 1 warp. So having just 108 programs
311
- # would limit the occupancy.
312
- nrow_groups = math.ceil(sm_count * math.ceil(4 / num_warps) / ngroups)
313
- _dw = torch.empty((nrow_groups, N), dtype=torch.float32, device=weight.device)
314
- _db = torch.empty((nrow_groups, N), dtype=torch.float32, device=bias.device) if bias is not None else None
315
- rows_per_program = math.ceil(M / nrow_groups)
316
- grid = (nrow_groups, ngroups)
317
- with torch.cuda.device(x.device.index):
318
- _layer_norm_bwd_kernel[grid](x, weight, bias, z, out if recompute_output else None,
319
- dy, dx, _dw, _db, dz, mean, rstd,
320
- x.stride(0),
321
- z.stride(0) if z is not None else 0,
322
- 0 if not recompute_output else out.stride(0),
323
- dy.stride(0), dx.stride(0),
324
- dz.stride(0) if dz is not None else 0,
325
- _dw.stride(0),
326
- _db.stride(0) if _db is not None else 0,
327
- M, group_size, eps,
328
- rows_per_program,
329
- BLOCK_N=BLOCK_N,
330
- NORM_BEFORE_GATE=norm_before_gate,
331
- IS_RMS_NORM=is_rms_norm,
332
- num_warps=num_warps)
333
- dw = _dw.sum(0).to(weight.dtype)
334
- db = _db.sum(0).to(bias.dtype) if bias is not None else None
335
- return (dx, dw, db, dz) if not recompute_output else (dx, dw, db, dz, out)
336
-
337
-
338
- class LayerNormFn(torch.autograd.Function):
339
-
340
- @staticmethod
341
- def forward(ctx, x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True,
342
- is_rms_norm=False):
343
- """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
344
- """
345
-
346
- x_shape_og = x.shape
347
- # reshape input data into 2D tensor
348
- x = x.reshape(-1, x.shape[-1])
349
- if x.stride(-1) != 1:
350
- x = x.contiguous()
351
- if z is not None:
352
- assert z.shape == x_shape_og
353
- z = z.reshape(-1, z.shape[-1])
354
- if z.stride(-1) != 1:
355
- z = z.contiguous()
356
- weight = weight.contiguous()
357
- if bias is not None:
358
- bias = bias.contiguous()
359
- y, mean, rstd = _layer_norm_fwd(x, weight, bias, eps, z=z, group_size=group_size, norm_before_gate=norm_before_gate, is_rms_norm=is_rms_norm)
360
- ctx.save_for_backward(x, weight, bias, mean, rstd, z)
361
- ctx.x_shape_og = x_shape_og
362
- ctx.eps = eps
363
- ctx.group_size = group_size
364
- ctx.norm_before_gate = norm_before_gate
365
- ctx.is_rms_norm = is_rms_norm
366
- return y.reshape(x_shape_og)
367
-
368
- @staticmethod
369
- def backward(ctx, dy):
370
- x, weight, bias, mean, rstd, z = ctx.saved_tensors
371
- dy = dy.reshape(-1, dy.shape[-1])
372
- if dy.stride(-1) != 1:
373
- dy = dy.contiguous()
374
- assert dy.shape == x.shape
375
- dx, dw, db, dz = _layer_norm_bwd(dy, x, weight, bias, ctx.eps, mean, rstd, z, ctx.group_size,
376
- ctx.norm_before_gate, ctx.is_rms_norm)
377
- return dx.reshape(ctx.x_shape_og), dw, db, dz.reshape(ctx.x_shape_og) if dz is not None else None, None, None, None, None
378
-
379
-
380
- def layernorm_fn(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, is_rms_norm=False):
381
- return LayerNormFn.apply(x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm)
382
-
383
-
384
- def rmsnorm_fn(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True):
385
- return LayerNormFn.apply(x, weight, bias, z, eps, group_size, norm_before_gate, True)
386
-
387
-
388
- class LayerNorm(torch.nn.Module):
389
-
390
- def __init__(self, hidden_size, eps=1e-5, group_size=None, norm_before_gate=True, device=None, dtype=None):
391
- """If group_size is not None, we do GroupNorm with each group having group_size elements.
392
- group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
393
- """
394
-
395
- factory_kwargs = {"device": device, "dtype": dtype}
396
- super().__init__()
397
- self.eps = eps
398
- self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
399
- self.bias = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
400
- self.group_size = group_size
401
- self.norm_before_gate = norm_before_gate
402
- self.reset_parameters()
403
-
404
- def reset_parameters(self):
405
- torch.nn.init.ones_(self.weight)
406
- torch.nn.init.zeros_(self.bias)
407
-
408
- def forward(self, x, z=None):
409
- """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
410
- """
411
- return layernorm_fn(x, self.weight, self.bias, z=z, group_size=self.group_size, eps=self.eps,
412
- norm_before_gate=self.norm_before_gate)
413
-
414
-
415
- class RMSNorm(torch.nn.Module):
416
-
417
- def __init__(self, hidden_size, eps=1e-5, group_size=None, norm_before_gate=True, device=None, dtype=None):
418
- """If group_size is not None, we do GroupNorm with each group having group_size elements.
419
- group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
420
- """
421
- factory_kwargs = {"device": device, "dtype": dtype}
422
- super().__init__()
423
- self.eps = eps
424
- self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
425
- self.register_parameter("bias", None)
426
- self.group_size = group_size
427
- self.norm_before_gate = norm_before_gate
428
- self.reset_parameters()
429
-
430
- def reset_parameters(self):
431
- torch.nn.init.ones_(self.weight)
432
-
433
- def forward(self, x, z=None):
434
- """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
435
- """
436
- return rmsnorm_fn(x, self.weight, self.bias, z=z, eps=self.eps, group_size=self.group_size,
437
- norm_before_gate=self.norm_before_gate)
 
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/selective_state_update.py DELETED
@@ -1,285 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or triton==2.2.0 or triton==2.3.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- @triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
19
- @triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
20
- @triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
21
- @triton.heuristics({"HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"] is not None})
22
- @triton.heuristics({"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])})
23
- @triton.jit
24
- def _selective_scan_update_kernel(
25
- # Pointers to matrices
26
- state_ptr, x_ptr, dt_ptr, dt_bias_ptr, A_ptr, B_ptr, C_ptr, D_ptr, z_ptr, out_ptr, state_batch_indices_ptr,
27
- # Matrix dimensions
28
- batch, nheads, dim, dstate, nheads_ngroups_ratio,
29
- # Strides
30
- stride_state_batch, stride_state_head, stride_state_dim, stride_state_dstate,
31
- stride_x_batch, stride_x_head, stride_x_dim,
32
- stride_dt_batch, stride_dt_head, stride_dt_dim,
33
- stride_dt_bias_head, stride_dt_bias_dim,
34
- stride_A_head, stride_A_dim, stride_A_dstate,
35
- stride_B_batch, stride_B_group, stride_B_dstate,
36
- stride_C_batch, stride_C_group, stride_C_dstate,
37
- stride_D_head, stride_D_dim,
38
- stride_z_batch, stride_z_head, stride_z_dim,
39
- stride_out_batch, stride_out_head, stride_out_dim,
40
- # Meta-parameters
41
- DT_SOFTPLUS: tl.constexpr,
42
- TIE_HDIM: tl.constexpr,
43
- BLOCK_SIZE_M: tl.constexpr,
44
- HAS_DT_BIAS: tl.constexpr,
45
- HAS_D: tl.constexpr,
46
- HAS_Z: tl.constexpr,
47
- HAS_STATE_BATCH_INDICES: tl.constexpr,
48
- BLOCK_SIZE_DSTATE: tl.constexpr,
49
- ):
50
- pid_m = tl.program_id(axis=0)
51
- pid_b = tl.program_id(axis=1)
52
- pid_h = tl.program_id(axis=2)
53
-
54
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
55
- out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
56
- out_ptrs = out_ptr + offs_m * stride_out_dim
57
-
58
- if HAS_STATE_BATCH_INDICES:
59
- state_batch_indices_ptr += pid_b
60
- state_batch_idx = tl.load(state_batch_indices_ptr)
61
- # Skip padding tokens
62
- if state_batch_idx < 0:
63
- tl.store(out_ptrs, 0.0, mask=offs_m < dim)
64
- return
65
- state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
66
- else:
67
- state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
68
-
69
- x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
70
- dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
71
- if HAS_DT_BIAS:
72
- dt_bias_ptr += pid_h * stride_dt_bias_head
73
- A_ptr += pid_h * stride_A_head
74
- B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
75
- C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
76
- if HAS_Z:
77
- z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
78
-
79
- offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
80
- state_ptrs = state_ptr + (offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate)
81
- x_ptrs = x_ptr + offs_m * stride_x_dim
82
- dt_ptrs = dt_ptr + offs_m * stride_dt_dim
83
- if HAS_DT_BIAS:
84
- dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
85
- if HAS_D:
86
- D_ptr += pid_h * stride_D_head
87
- A_ptrs = A_ptr + (offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate)
88
- B_ptrs = B_ptr + offs_n * stride_B_dstate
89
- C_ptrs = C_ptr + offs_n * stride_C_dstate
90
- if HAS_D:
91
- D_ptrs = D_ptr + offs_m * stride_D_dim
92
- if HAS_Z:
93
- z_ptrs = z_ptr + offs_m * stride_z_dim
94
-
95
- state = tl.load(state_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0)
96
- x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
97
- if not TIE_HDIM:
98
- dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
99
- if HAS_DT_BIAS:
100
- dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
101
- if DT_SOFTPLUS:
102
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
103
- A = tl.load(A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
104
- dA = tl.exp(A * dt[:, None])
105
- else:
106
- dt = tl.load(dt_ptr).to(tl.float32)
107
- if HAS_DT_BIAS:
108
- dt += tl.load(dt_bias_ptr).to(tl.float32)
109
- if DT_SOFTPLUS:
110
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
111
- A = tl.load(A_ptr).to(tl.float32)
112
- dA = tl.exp(A * dt) # scalar, not a matrix
113
-
114
- B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
115
- C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
116
- if HAS_D:
117
- D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
118
- if HAS_Z:
119
- z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
120
-
121
- if not TIE_HDIM:
122
- dB = B[None, :] * dt[:, None]
123
- else:
124
- dB = B * dt # vector of size (dstate,)
125
- state = state * dA + dB * x[:, None]
126
- tl.store(state_ptrs, state, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate))
127
- out = tl.sum(state * C[None, :], axis=1)
128
- if HAS_D:
129
- out += x * D
130
- if HAS_Z:
131
- out *= z * tl.sigmoid(z)
132
- tl.store(out_ptrs, out, mask=offs_m < dim)
133
-
134
-
135
- def selective_state_update(state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False,
136
- state_batch_indices=None):
137
- """
138
- Argument:
139
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
140
- x: (batch, dim) or (batch, nheads, dim)
141
- dt: (batch, dim) or (batch, nheads, dim)
142
- A: (dim, dstate) or (nheads, dim, dstate)
143
- B: (batch, dstate) or (batch, ngroups, dstate)
144
- C: (batch, dstate) or (batch, ngroups, dstate)
145
- D: (dim,) or (nheads, dim)
146
- z: (batch, dim) or (batch, nheads, dim)
147
- dt_bias: (dim,) or (nheads, dim)
148
- Return:
149
- out: (batch, dim) or (batch, nheads, dim)
150
- """
151
- has_heads = state.dim() > 3
152
- if state.dim() == 3:
153
- state = state.unsqueeze(1)
154
- if x.dim() == 2:
155
- x = x.unsqueeze(1)
156
- if dt.dim() == 2:
157
- dt = dt.unsqueeze(1)
158
- if A.dim() == 2:
159
- A = A.unsqueeze(0)
160
- if B.dim() == 2:
161
- B = B.unsqueeze(1)
162
- if C.dim() == 2:
163
- C = C.unsqueeze(1)
164
- if D is not None and D.dim() == 1:
165
- D = D.unsqueeze(0)
166
- if z is not None and z.dim() == 2:
167
- z = z.unsqueeze(1)
168
- if dt_bias is not None and dt_bias.dim() == 1:
169
- dt_bias = dt_bias.unsqueeze(0)
170
- _, nheads, dim, dstate = state.shape
171
- batch = x.shape[0]
172
- if x.shape != (batch, nheads, dim):
173
- print(f"{state.shape} {x.shape} {batch} {nheads} {dim}")
174
- assert x.shape == (batch, nheads, dim)
175
- assert dt.shape == x.shape
176
- assert A.shape == (nheads, dim, dstate)
177
- ngroups = B.shape[1]
178
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
179
- assert B.shape == (batch, ngroups, dstate)
180
- assert C.shape == B.shape
181
- if D is not None:
182
- assert D.shape == (nheads, dim)
183
- if z is not None:
184
- assert z.shape == x.shape
185
- if dt_bias is not None:
186
- assert dt_bias.shape == (nheads, dim)
187
- if state_batch_indices is not None:
188
- assert state_batch_indices.shape == (batch,)
189
- out = torch.empty_like(x)
190
- grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE_M']), batch, nheads)
191
- z_strides = ((z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0))
192
- # We don't want autotune since it will overwrite the state
193
- # We instead tune by hand.
194
- BLOCK_SIZE_M, num_warps = ((32, 4) if dstate <= 16
195
- else ((16, 4) if dstate <= 32 else
196
- ((8, 4) if dstate <= 64 else
197
- ((4, 4) if dstate <= 128 else
198
- ((4, 8))))))
199
- tie_hdim = A.stride(-1) == 0 and A.stride(-2) == 0 and dt.stride(-1) == 0 and dt_bias.stride(-1) == 0
200
- with torch.cuda.device(x.device.index):
201
- _selective_scan_update_kernel[grid](
202
- state, x, dt, dt_bias, A, B, C, D, z, out, state_batch_indices,
203
- batch, nheads, dim, dstate, nheads // ngroups,
204
- state.stride(0), state.stride(1), state.stride(2), state.stride(3),
205
- x.stride(0), x.stride(1), x.stride(2),
206
- dt.stride(0), dt.stride(1), dt.stride(2),
207
- *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
208
- A.stride(0), A.stride(1), A.stride(2),
209
- B.stride(0), B.stride(1), B.stride(2),
210
- C.stride(0), C.stride(1), C.stride(2),
211
- *(D.stride(0), D.stride(1)) if D is not None else 0,
212
- z_strides[0], z_strides[1], z_strides[2],
213
- out.stride(0), out.stride(1), out.stride(2),
214
- dt_softplus,
215
- tie_hdim,
216
- BLOCK_SIZE_M,
217
- num_warps=num_warps,
218
- )
219
- if not has_heads:
220
- out = out.squeeze(1)
221
- return out
222
-
223
-
224
- def selective_state_update_ref(state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False):
225
- """
226
- Argument:
227
- state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
228
- x: (batch, dim) or (batch, nheads, dim)
229
- dt: (batch, dim) or (batch, nheads, dim)
230
- A: (dim, dstate) or (nheads, dim, dstate)
231
- B: (batch, dstate) or (batch, ngroups, dstate)
232
- C: (batch, dstate) or (batch, ngroups, dstate)
233
- D: (dim,) or (nheads, dim)
234
- z: (batch, dim) or (batch, nheads, dim)
235
- dt_bias: (dim,) or (nheads, dim)
236
- Return:
237
- out: (batch, dim) or (batch, nheads, dim)
238
- """
239
- has_heads = state.dim() > 3
240
- if state.dim() == 3:
241
- state = state.unsqueeze(1)
242
- if x.dim() == 2:
243
- x = x.unsqueeze(1)
244
- if dt.dim() == 2:
245
- dt = dt.unsqueeze(1)
246
- if A.dim() == 2:
247
- A = A.unsqueeze(0)
248
- if B.dim() == 2:
249
- B = B.unsqueeze(1)
250
- if C.dim() == 2:
251
- C = C.unsqueeze(1)
252
- if D is not None and D.dim() == 1:
253
- D = D.unsqueeze(0)
254
- if z is not None and z.dim() == 2:
255
- z = z.unsqueeze(1)
256
- if dt_bias is not None and dt_bias.dim() == 1:
257
- dt_bias = dt_bias.unsqueeze(0)
258
- batch, nheads, dim, dstate = state.shape
259
- assert x.shape == (batch, nheads, dim)
260
- assert dt.shape == x.shape
261
- assert A.shape == (nheads, dim, dstate)
262
- ngroups = B.shape[1]
263
- assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
264
- assert B.shape == (batch, ngroups, dstate)
265
- assert C.shape == B.shape
266
- if D is not None:
267
- assert D.shape == (nheads, dim)
268
- if z is not None:
269
- assert z.shape == x.shape
270
- if dt_bias is not None:
271
- assert dt_bias.shape == (nheads, dim)
272
- dt = dt + dt_bias
273
- dt = F.softplus(dt) if dt_softplus else dt
274
- dA = torch.exp(rearrange(dt, "b h d -> b h d 1") * A) # (batch, nheads, dim, dstate)
275
- B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
276
- C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
277
- dB = rearrange(dt, "b h d -> b h d 1") * rearrange(B, "b h n -> b h 1 n") # (batch, nheads, dim, dstate)
278
- state.copy_(state * dA + dB * rearrange(x, "b h d -> b h d 1"))  # (batch, nheads, dim, dstate)
279
- out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
280
- if D is not None:
281
- out += (x * D).to(out.dtype)
282
- out = (out if z is None else out * F.silu(z)).to(x.dtype)
283
- if not has_heads:
284
- out = out.squeeze(1)
285
- return out
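As a reference point, a hedged sketch of calling selective_state_update with the shapes documented above; sizes and values are illustrative and a CUDA device is assumed.

import torch

batch, nheads, dim, dstate, ngroups = 2, 4, 64, 16, 1
device = "cuda"
state = torch.zeros(batch, nheads, dim, dstate, device=device)
x = torch.randn(batch, nheads, dim, device=device)
dt = torch.rand(batch, nheads, dim, device=device)
A = -torch.rand(nheads, dim, dstate, device=device)
B = torch.randn(batch, ngroups, dstate, device=device)
C = torch.randn(batch, ngroups, dstate, device=device)
# Updates `state` in place and returns the output of shape (batch, nheads, dim).
out = selective_state_update(state, x, dt, A, B, C, dt_softplus=True)

Because the kernel writes the updated SSM state back into `state`, the same buffer can be reused across decoding steps.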
 
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/softplus.py DELETED
@@ -1,15 +0,0 @@
1
- import triton
2
- import triton.language as tl
3
- from packaging import version
4
-
5
- TRITON3 = version.parse(triton.__version__) >= version.parse("3.0.0")
6
-
7
-
8
- if TRITON3:
9
- @triton.jit
10
- def softplus(dt):
11
- return tl.math.log(tl.math.exp(dt) + 1)
12
- else:
13
- @triton.jit
14
- def softplus(dt):
15
- return tl.math.log1p(tl.exp(dt))
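Both branches above implement the same softplus function, log(1 + exp(x)); a quick equivalence check in plain PyTorch (illustrative, not part of the deleted file):

import torch

x = torch.linspace(-10.0, 10.0, steps=5)
assert torch.allclose(torch.log(torch.exp(x) + 1), torch.log1p(torch.exp(x)))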
 
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_bmm.py DELETED
@@ -1,262 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
-
16
- def init_to_zero(names):
17
- return lambda nargs: [nargs[name].zero_() for name in names if nargs[name] is not None]
18
-
19
-
20
- @triton.autotune(
21
- configs=[
22
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
23
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
24
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
25
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
26
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
27
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
28
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
29
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
30
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=2),
31
- ],
32
- key=['chunk_size', 'K', 'IS_CAUSAL'],
33
- )
34
- @triton.jit
35
- def _bmm_chunk_fwd_kernel(
36
- # Pointers to matrices
37
- a_ptr, b_ptr, out_ptr, seq_idx_ptr,
38
- # Matrix dimensions
39
- seqlen, chunk_size, K, ngroups,
40
- stride_a_batch, stride_a_seqlen, stride_a_head, stride_ak,
41
- stride_b_batch, stride_b_seqlen, stride_b_head, stride_bk,
42
- stride_out_batch, stride_out_chunk, stride_out_head, stride_outm, stride_outn,
43
- stride_seq_idx_batch, stride_seq_idx_seqlen,
44
- # Meta-parameters
45
- IS_CAUSAL: tl.constexpr,
46
- dot_dtype: tl.constexpr,
47
- HAS_SEQ_IDX: tl.constexpr,
48
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
49
- ):
50
- pid_b = tl.program_id(axis=1)
51
- pid_ch = tl.program_id(axis=2)
52
- pid_c = pid_ch // ngroups
53
- pid_h = pid_ch - pid_c * ngroups
54
- num_pid_n = tl.cdiv(chunk_size, BLOCK_SIZE_N)
55
- pid_m = tl.program_id(axis=0) // num_pid_n
56
- pid_n = tl.program_id(axis=0) % num_pid_n
57
- if IS_CAUSAL:
58
- if pid_n * BLOCK_SIZE_N >= (pid_m + 1) * BLOCK_SIZE_M:
59
- return
60
- a_ptr += pid_b * stride_a_batch + pid_c * chunk_size * stride_a_seqlen + pid_h * stride_a_head
61
- b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + pid_h * stride_b_head
62
- if HAS_SEQ_IDX:
63
- seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
64
-
65
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
66
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
67
- offs_k = tl.arange(0, BLOCK_SIZE_K)
68
- a_ptrs = a_ptr + (offs_m[:, None] * stride_a_seqlen + offs_k[None, :] * stride_ak)
69
- b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_b_seqlen)
70
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
71
-
72
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
73
- for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
74
- a = tl.load(a_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < K - k * BLOCK_SIZE_K), other=0.0).to(dot_dtype)
75
- b = tl.load(b_ptrs, mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K) & (offs_n[None, :] < chunk_size_limit), other=0.0).to(dot_dtype)
76
- acc += tl.dot(a, b)
77
- a_ptrs += BLOCK_SIZE_K * stride_ak
78
- b_ptrs += BLOCK_SIZE_K * stride_bk
79
-
80
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
81
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
82
- if HAS_SEQ_IDX:
83
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
84
- seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
85
- seq_idx_n = tl.load(seq_idx_ptr + offs_n * stride_seq_idx_seqlen, mask=offs_n < chunk_size_limit, other=-2)
86
- acc = tl.where(seq_idx_m[:, None] == seq_idx_n[None, :], acc, 0.0)
87
- out = acc.to(out_ptr.dtype.element_ty)
88
-
89
- out_ptr += pid_b * stride_out_batch + pid_c * stride_out_chunk + pid_h * stride_out_head
90
- out_ptrs = out_ptr + (stride_outm * offs_m[:, None] + offs_n[None, :] * stride_outn)
91
- tl.store(out_ptrs, out, mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size))
92
-
93
-
94
- @triton.autotune(
95
- configs=[
96
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_CS': 64}, num_stages=3, num_warps=8),
97
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_CS': 32}, num_stages=4, num_warps=4),
98
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_CS': 32}, num_stages=4, num_warps=4),
99
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_CS': 32}, num_stages=4, num_warps=4),
100
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_CS': 32}, num_stages=4, num_warps=4),
101
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_CS': 32}, num_stages=4, num_warps=4),
102
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_CS': 32}, num_stages=5, num_warps=2),
103
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_CS': 32}, num_stages=5, num_warps=2),
104
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_CS': 32}, num_stages=4, num_warps=2),
105
- ],
106
- key=['chunk_size', 'K'],
107
- )
108
- @triton.jit
109
- def _bmm_chunk_bwd_kernel(
110
- # Pointers to matrices
111
- a_ptr, dout_ptr, db_ptr, res_ptr,
112
- # Matrix dimensions
113
- seqlen, chunk_size, K, ngroups,
114
- stride_a_batch, stride_a_seqlen, stride_a_head, stride_ak,
115
- stride_dout_batch, stride_dout_chunk, stride_dout_head, stride_dout_csize_m, stride_dout_csize_n,
116
- stride_db_batch, stride_db_seqlen, stride_db_head, stride_db_k,
117
- stride_res_batch, stride_res_seqlen, stride_res_head, stride_res_k,
118
- # Meta-parameters
119
- dot_dtype: tl.constexpr,
120
- HAS_RESIDUAL: tl.constexpr,
121
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_CS: tl.constexpr,
122
- ):
123
- pid_b = tl.program_id(axis=1)
124
- pid_ch = tl.program_id(axis=2)
125
- pid_c = pid_ch // ngroups
126
- pid_h = pid_ch - pid_c * ngroups
127
- num_pid_n = tl.cdiv(K, BLOCK_SIZE_N)
128
- pid_m = tl.program_id(axis=0) // num_pid_n
129
- pid_n = tl.program_id(axis=0) % num_pid_n
130
-
131
- a_ptr += pid_b * stride_a_batch + pid_c * chunk_size * stride_a_seqlen + pid_h * stride_a_head
132
- dout_ptr += pid_b * stride_dout_batch + pid_c * stride_dout_chunk + pid_h * stride_dout_head
133
-
134
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
135
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
136
- offs_cs = tl.arange(0, BLOCK_SIZE_CS)
137
- dout_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_csize_n + offs_cs[None, :] * stride_dout_csize_m)
138
- a_ptrs = a_ptr + (offs_cs[:, None] * stride_a_seqlen + offs_n[None, :] * stride_ak)
139
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
140
-
141
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
142
- for cs in range(0, tl.cdiv(chunk_size_limit, BLOCK_SIZE_CS)):
143
- dout = tl.load(dout_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_cs[None, :] < chunk_size_limit - cs * BLOCK_SIZE_CS), other=0.0).to(dot_dtype)
144
- a = tl.load(a_ptrs, mask=(offs_cs[:, None] < chunk_size_limit - cs * BLOCK_SIZE_CS) & (offs_n[None, :] < K), other=0.0).to(dot_dtype)
145
- acc += tl.dot(dout, a)
146
- dout_ptrs += BLOCK_SIZE_CS * stride_dout_csize_m
147
- a_ptrs += BLOCK_SIZE_CS * stride_a_seqlen
148
-
149
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
150
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
151
- if HAS_RESIDUAL:
152
- res_ptr += pid_b * stride_res_batch + pid_c * chunk_size * stride_res_seqlen + pid_h * stride_res_head
153
- res_ptrs = res_ptr + (offs_m[:, None] * stride_res_seqlen + offs_n[None, :] * stride_res_k)
154
- res = tl.load(res_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < K)).to(tl.float32)
155
- acc += res
156
- db = acc.to(db_ptr.dtype.element_ty)
157
-
158
- db_ptr += pid_b * stride_db_batch + pid_c * chunk_size * stride_db_seqlen + pid_h * stride_db_head
159
- db_ptrs = db_ptr + (offs_m[:, None] * stride_db_seqlen + offs_n[None, :] * stride_db_k)
160
- tl.store(db_ptrs, db, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < K))
161
-
162
-
163
- def _bmm_chunk_fwd(a, b, chunk_size, seq_idx=None, causal=False, output_dtype=None):
164
- """
165
- Argument:
166
- a: (batch, seqlen, k) or (batch, seqlen, ngroups, k)
167
- b: (batch, seqlen, k) or (batch, seqlen, ngroups, k)
168
- seq_idx: (batch, seqlen) or None. out[i, j] for seq_idx[i] != seq_idx[j] will be zeroed out.
169
- causal: if True, then out[i, j] for i > j will be arbitrary, only out[i, j] for i <= j are
170
- guaranteed to be correct.
171
- Return:
172
- out: (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, ngroups, chunk_size, chunk_size)
173
- """
174
- # Check constraints.
175
- has_groups = a.dim() == 4
176
- if not has_groups:
177
- batch, seqlen, k = a.shape
178
- else:
179
- batch, seqlen, ngroups, k = a.shape
180
- assert b.shape == a.shape
181
- if seq_idx is not None:
182
- assert seq_idx.shape == (batch, seqlen)
183
- if a.stride(-1) != 1 and a.stride(1) != 1:
184
- a = a.contiguous()
185
- if b.stride(-1) != 1 and b.stride(1) != 1:
186
- b = b.contiguous()
187
- nchunks = math.ceil(seqlen / chunk_size)
188
- # Allocates output.
189
- out_dtype = a.dtype if output_dtype is None else output_dtype
190
- out = torch.empty((batch, nchunks, chunk_size, chunk_size) if not has_groups else (batch, nchunks, ngroups, chunk_size, chunk_size),
191
- device=a.device, dtype=out_dtype)
192
- dot_dtype = (tl.bfloat16 if a.dtype == torch.bfloat16 or b.dtype == torch.bfloat16 else
193
- (tl.float16 if a.dtype == torch.float16 or b.dtype == torch.float16 else tl.float32))
194
- grid = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(chunk_size, META['BLOCK_SIZE_N']),
195
- batch, nchunks if not has_groups else nchunks * ngroups)
196
- with torch.cuda.device(a.device.index):
197
- _bmm_chunk_fwd_kernel[grid](
198
- a, b, out, seq_idx,
199
- seqlen, chunk_size, k, ngroups if has_groups else 1,
200
- a.stride(0), a.stride(1), 0 if not has_groups else a.stride(2), a.stride(-1),
201
- b.stride(0), b.stride(1), 0 if not has_groups else b.stride(2), b.stride(-1),
202
- out.stride(0), out.stride(1), 0 if not has_groups else out.stride(2), out.stride(-2), out.stride(-1),
203
- *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
204
- causal,
205
- dot_dtype,
206
- HAS_SEQ_IDX=seq_idx is not None,
207
- )
208
- return out
209
-
210
-
211
- def _bmm_chunk_bwd(a, dout, residual=None, out=None):
212
- """
213
- Argument:
214
- a: (batch, seqlen, k) or (batch, seqlen, ngroups, k)
215
- dout: (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, ngroups, chunk_size, chunk_size)
216
- residual: (batch, seqlen, k) or (batch, seqlen, ngroups, k)
217
- Return:
218
- out: (batch, seqlen, k) or (batch, seqlen, ngroups, k)
219
-
220
- If there was seq_idx in the fwd pass, then dout[i, j] for seq_idx[i] != seq_idx[j] should already be
221
- zeroed out before calling this function.
222
- """
223
- # Check constraints.
224
- has_groups = a.dim() == 4
225
- if not has_groups:
226
- batch, seqlen, k = a.shape
227
- else:
228
- batch, seqlen, ngroups, k = a.shape
229
- nchunks, chunk_size = dout.shape[1], dout.shape[-1]
230
- if a.stride(-1) != 1 and a.stride(-2) != 1:
231
- a = a.contiguous()
232
- if dout.stride(-1) != 1 and dout.stride(-2) != 1:
233
- dout = dout.contiguous()
234
- if residual is not None:
235
- assert residual.shape == (batch, seqlen, k) if not has_groups else (batch, seqlen, ngroups, k)
236
- if residual.stride(-1) != 1 and residual.stride(1) != 1:
237
- residual = residual.contiguous()
238
- # Allocates output.
239
- if out is not None:
240
- assert out.shape == a.shape
241
- assert out.stride(-1) == 1 or out.stride(1) == 1
242
- else:
243
- out = torch.empty_like(a)
244
- dot_dtype = (tl.bfloat16 if a.dtype == torch.bfloat16 or dout.dtype == torch.bfloat16 else
245
- (tl.float16 if a.dtype == torch.float16 or dout.dtype == torch.float16 else tl.float32))
246
- grid = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(k, META['BLOCK_SIZE_N']), batch,
247
- nchunks if not has_groups else nchunks * ngroups)
248
- residual_strides = ((residual.stride(0), residual.stride(1), 0 if not has_groups else residual.stride(2),
249
- residual.stride(-1))
250
- if residual is not None else (0, 0, 0, 0))
251
- with torch.cuda.device(a.device.index):
252
- _bmm_chunk_bwd_kernel[grid](
253
- a, dout, out, residual,
254
- seqlen, chunk_size, k, ngroups if has_groups else 1,
255
- a.stride(0), a.stride(1), 0 if not has_groups else a.stride(2), a.stride(-1),
256
- dout.stride(0), dout.stride(1), 0 if not has_groups else dout.stride(2), dout.stride(-2), dout.stride(-1),
257
- out.stride(0), out.stride(1), 0 if not has_groups else out.stride(2), out.stride(-1),
258
- residual_strides[0], residual_strides[1], residual_strides[2], residual_strides[3],
259
- dot_dtype,
260
- HAS_RESIDUAL=residual is not None,
261
- )
262
- return out
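For readers checking the semantics, a rough PyTorch reference for the ungrouped forward path of _bmm_chunk_fwd. This is a sketch only: it assumes seqlen is a multiple of chunk_size and ignores the seq_idx and causal handling of the kernel above.

import torch
from einops import rearrange

def bmm_chunk_ref(a, b, chunk_size):
    # a, b: (batch, seqlen, k) -> out: (batch, nchunks, chunk_size, chunk_size)
    a = rearrange(a, "b (c l) k -> b c l k", l=chunk_size)
    b = rearrange(b, "b (c l) k -> b c l k", l=chunk_size)
    # out[c, i, j] = sum_k a[c, i, k] * b[c, j, k], i.e. a_chunk @ b_chunk^T
    return torch.einsum("bclk,bcsk->bcls", a, b)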
 
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_chunk_scan.py DELETED
The diff for this file is too large to render. See raw diff
 
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_chunk_state.py DELETED
@@ -1,997 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
- from .softplus import softplus
16
-
17
-
18
- def init_to_zero(names):
19
- return lambda nargs: [nargs[name].zero_() for name in names if nargs[name] is not None]
20
-
21
- @triton.autotune(
22
- configs=[
23
- triton.Config({'BLOCK_SIZE_H': 1}),
24
- triton.Config({'BLOCK_SIZE_H': 2}),
25
- triton.Config({'BLOCK_SIZE_H': 4}),
26
- triton.Config({'BLOCK_SIZE_H': 8}),
27
- triton.Config({'BLOCK_SIZE_H': 16}),
28
- triton.Config({'BLOCK_SIZE_H': 32}),
29
- triton.Config({'BLOCK_SIZE_H': 64}),
30
- ],
31
- key=['chunk_size', 'nheads'],
32
- )
33
- @triton.jit
34
- def _chunk_cumsum_fwd_kernel(
35
- # Pointers to matrices
36
- dt_ptr, A_ptr, dt_bias_ptr, dt_out_ptr, dA_cumsum_ptr,
37
- # Matrix dimension
38
- batch, seqlen, nheads, chunk_size,
39
- dt_min, dt_max,
40
- # Strides
41
- stride_dt_batch, stride_dt_seqlen, stride_dt_head,
42
- stride_A_head,
43
- stride_dt_bias_head,
44
- stride_dt_out_batch, stride_dt_out_chunk, stride_dt_out_head, stride_dt_out_csize,
45
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
46
- # Meta-parameters
47
- DT_SOFTPLUS: tl.constexpr,
48
- HAS_DT_BIAS: tl.constexpr,
49
- BLOCK_SIZE_H: tl.constexpr, BLOCK_SIZE_CHUNK: tl.constexpr,
50
- ):
51
- pid_b = tl.program_id(axis=0)
52
- pid_c = tl.program_id(axis=1)
53
- pid_h = tl.program_id(axis=2)
54
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
55
- dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk
56
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk
57
-
58
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
59
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
60
- dt_ptrs = dt_ptr + (offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen)
61
- A_ptrs = A_ptr + offs_h * stride_A_head
62
- dt_out_ptrs = dt_out_ptr + (offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize)
63
- dA_cs_ptrs = dA_cumsum_ptr + (offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize)
64
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
65
-
66
- dt = tl.load(dt_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
67
- if HAS_DT_BIAS:
68
- dt_bias = tl.load(dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0).to(tl.float32)
69
- dt += dt_bias[:, None]
70
- if DT_SOFTPLUS:
71
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
72
- # As of Triton 2.2.0, tl.clamp is not available yet
73
- # dt = tl.clamp(dt, dt_min, dt_max)
74
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
75
- dt = tl.where((offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0)
76
- tl.store(dt_out_ptrs, dt, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size))
77
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
78
- dA = dt * A[:, None]
79
- dA_cs = tl.cumsum(dA, axis=1)
80
- tl.store(dA_cs_ptrs, dA_cs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size))
81
-
82
-
83
- @triton.autotune(
84
- configs=[
85
- triton.Config({'BLOCK_SIZE_H': 1}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
86
- triton.Config({'BLOCK_SIZE_H': 2}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
87
- triton.Config({'BLOCK_SIZE_H': 4}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
88
- triton.Config({'BLOCK_SIZE_H': 8}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
89
- triton.Config({'BLOCK_SIZE_H': 16}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
90
- triton.Config({'BLOCK_SIZE_H': 32}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
91
- triton.Config({'BLOCK_SIZE_H': 64}, pre_hook=init_to_zero(["dA_ptr", "ddt_bias_ptr"])),
92
- ],
93
- key=['chunk_size', 'nheads'],
94
- )
95
- @triton.jit
96
- def _chunk_cumsum_bwd_kernel(
97
- # Pointers to matrices
98
- ddA_ptr, ddt_out_ptr, dt_ptr, A_ptr, dt_bias_ptr,
99
- ddt_ptr, dA_ptr, ddt_bias_ptr,
100
- # Matrix dimensions
101
- batch, seqlen, nheads, chunk_size,
102
- dt_min, dt_max,
103
- # Strides
104
- stride_ddA_batch, stride_ddA_chunk, stride_ddA_head, stride_ddA_csize,
105
- stride_ddt_out_batch, stride_ddt_out_chunk, stride_ddt_out_head, stride_ddt_out_csize,
106
- stride_dt_batch, stride_dt_seqlen, stride_dt_head,
107
- stride_A_head,
108
- stride_dt_bias_head,
109
- stride_ddt_batch, stride_ddt_seqlen, stride_ddt_head,
110
- stride_dA_head,
111
- stride_ddt_bias_head,
112
- # Meta-parameters
113
- DT_SOFTPLUS: tl.constexpr,
114
- HAS_DT_BIAS: tl.constexpr,
115
- BLOCK_SIZE_H: tl.constexpr, BLOCK_SIZE_CHUNK: tl.constexpr,
116
- ):
117
- pid_b = tl.program_id(axis=0)
118
- pid_c = tl.program_id(axis=1)
119
- pid_h = tl.program_id(axis=2)
120
- ddt_out_ptr += pid_b * stride_ddt_out_batch + pid_c * stride_ddt_out_chunk
121
- ddA_ptr += pid_b * stride_ddA_batch + pid_c * stride_ddA_chunk
122
- dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
123
- ddt_ptr += pid_b * stride_ddt_batch + pid_c * chunk_size * stride_ddt_seqlen
124
-
125
- offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
126
- offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
127
- ddt_out_ptrs = ddt_out_ptr + (offs_h[:, None] * stride_ddt_out_head + offs_c[None, :] * stride_ddt_out_csize)
128
- ddA_ptrs = ddA_ptr + (offs_h[:, None] * stride_ddA_head + offs_c[None, :] * stride_ddA_csize)
129
- dt_ptrs = dt_ptr + (offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen)
130
- ddt_ptrs = ddt_ptr + (offs_h[:, None] * stride_ddt_head + offs_c[None, :] * stride_ddt_seqlen)
131
- A_ptrs = A_ptr + offs_h * stride_A_head
132
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
133
-
134
- ddA = tl.load(ddA_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
135
- ddt_out = tl.load(ddt_out_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
136
- A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
137
- ddt = ddA * A[:, None] + ddt_out
138
- dt = tl.load(dt_ptrs, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), other=0.0).to(tl.float32)
139
- if HAS_DT_BIAS:
140
- dt_bias = tl.load(dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0).to(tl.float32)
141
- dt += dt_bias[:, None]
142
- if DT_SOFTPLUS:
143
- dt_presoftplus = dt
144
- dt = tl.where(dt <= 20.0, softplus(dt), dt)
145
- clamp_mask = (dt < dt_min) | (dt > dt_max)
146
- # As of Triton 2.2.0, tl.clamp is not available yet
147
- # dt = tl.clamp(dt, dt_min, dt_max)
148
- dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
149
- dt = tl.where((offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0)
150
- ddt = tl.where((offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), ddt, 0.0)
151
- ddt = tl.where(clamp_mask, 0.0, ddt)
152
- if DT_SOFTPLUS:
153
- ddt = tl.where(dt_presoftplus <= 20.0, ddt * tl.sigmoid(dt_presoftplus), ddt)
154
- tl.store(ddt_ptrs, ddt, mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit))
155
- dA = tl.sum(ddA * dt, axis=1)
156
- tl.atomic_add(dA_ptr + offs_h * stride_dA_head, dA, mask=offs_h < nheads)
157
- if HAS_DT_BIAS:
158
- ddt_bias = tl.sum(ddt, axis=1)
159
- tl.atomic_add(ddt_bias_ptr + offs_h * stride_ddt_bias_head, ddt_bias, mask=offs_h < nheads)
160
-
161
-
162
- @triton.autotune(
163
- configs=[
164
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
165
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
166
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
167
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
168
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
169
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
170
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
171
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
172
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=2),
173
- ],
174
- key=['hdim', 'dstate', 'chunk_size'],
175
- )
176
- @triton.jit
177
- def _chunk_state_fwd_kernel(
178
- # Pointers to matrices
179
- x_ptr, b_ptr, states_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr,
180
- # Matrix dimensions
181
- hdim, dstate, chunk_size,
182
- batch, seqlen, nheads_ngroups_ratio,
183
- # Strides
184
- stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
185
- stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
186
- stride_states_batch, stride_states_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
187
- stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
188
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
189
- stride_seq_idx_batch, stride_seq_idx_seqlen,
190
- # Meta-parameters
191
- HAS_SEQ_IDX: tl.constexpr,
192
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
193
- ):
194
- pid_bc = tl.program_id(axis=1)
195
- pid_c = pid_bc // batch
196
- pid_b = pid_bc - pid_c * batch
197
- pid_h = tl.program_id(axis=2)
198
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
199
- pid_m = tl.program_id(axis=0) // num_pid_n
200
- pid_n = tl.program_id(axis=0) % num_pid_n
201
- b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
202
- x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
203
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
204
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
205
- if HAS_SEQ_IDX:
206
- seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
207
-
208
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
209
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
210
- offs_k = tl.arange(0, BLOCK_SIZE_K)
211
- x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen)
212
- b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen)
213
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
214
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
215
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
216
- if HAS_SEQ_IDX:
217
- seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
218
-
219
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
220
- if HAS_SEQ_IDX:
221
- seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
222
-
223
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
224
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
225
- x = tl.load(x_ptrs, mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k), other=0.0)
226
- b = tl.load(b_ptrs, mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
227
- dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
228
- if HAS_SEQ_IDX:
229
- seq_idx_k = tl.load(seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1)
230
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
231
- if not HAS_SEQ_IDX:
232
- # scale = tl.exp((dA_cs_last - dA_cs_k)) * dt_k
233
- scale = tl.exp(tl.minimum((dA_cs_last - dA_cs_k), 0.0)) * dt_k
234
- else:
235
- # scale = tl.where(seq_idx_k == seq_idx_last, tl.exp((dA_cs_last - dA_cs_k)) * dt_k, 0.0)
236
- scale = tl.where((seq_idx_last >= 0) & (seq_idx_k == seq_idx_last), tl.exp(tl.minimum((dA_cs_last - dA_cs_k), 0.0)) * dt_k, 0.0)
237
- b *= scale[:, None]
238
- b = b.to(x_ptr.dtype.element_ty)
239
- acc += tl.dot(x, b)
240
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
241
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
242
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
243
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
244
- if HAS_SEQ_IDX:
245
- seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
246
- states = acc.to(states_ptr.dtype.element_ty)
247
-
248
- states_ptr += pid_b * stride_states_batch + pid_c * stride_states_chunk + pid_h * stride_states_head
249
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
250
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
251
- states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate)
252
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
253
- tl.store(states_ptrs, states, mask=c_mask)
254
-
255
-
256
- @triton.autotune(
257
- configs=[
258
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
259
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
260
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
261
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
262
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
263
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
264
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
265
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
266
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr", "ddA_cumsum_ptr"])),
267
- ],
268
- key=['chunk_size', 'hdim', 'dstate'],
269
- )
270
- @triton.jit
271
- def _chunk_state_bwd_dx_kernel(
272
- # Pointers to matrices
273
- x_ptr, b_ptr, dstates_ptr, dt_ptr, dA_cumsum_ptr,
274
- dx_ptr, ddt_ptr, ddA_cumsum_ptr,
275
- # Matrix dimensions
276
- chunk_size, hdim, dstate,
277
- batch, seqlen, nheads_ngroups_ratio,
278
- # Strides
279
- stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
280
- stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
281
- stride_dstates_batch, stride_dstates_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
282
- stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
283
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
284
- stride_dx_batch, stride_dx_seqlen, stride_dx_head, stride_dx_hdim,
285
- stride_ddt_batch, stride_ddt_chunk, stride_ddt_head, stride_ddt_csize,
286
- stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
287
- # Meta-parameters
288
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
289
- BLOCK_SIZE_DSTATE: tl.constexpr,
290
- ):
291
- pid_bc = tl.program_id(axis=1)
292
- pid_c = pid_bc // batch
293
- pid_b = pid_bc - pid_c * batch
294
- pid_h = tl.program_id(axis=2)
295
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
296
- pid_m = tl.program_id(axis=0) // num_pid_n
297
- pid_n = tl.program_id(axis=0) % num_pid_n
298
- x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
299
- b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
300
- dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + pid_h * stride_states_head
301
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
302
- ddt_ptr += pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
303
- ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head
304
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
305
-
306
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
307
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
308
-
309
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
310
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
311
- offs_k = tl.arange(0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
312
- b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate)
313
- dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate)
314
- if BLOCK_SIZE_DSTATE <= 128:
315
- b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate), other=0.0)
316
- dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
317
- dstates = dstates.to(b_ptr.dtype.element_ty)
318
- acc = tl.dot(b, dstates)
319
- else:
320
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
321
- for k in range(0, dstate, BLOCK_SIZE_K):
322
- b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate - k), other=0.0)
323
- dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim), other=0.0)
324
- dstates = dstates.to(b_ptr.dtype.element_ty)
325
- acc += tl.dot(b, dstates)
326
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
327
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
328
-
329
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
330
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
331
-
332
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
333
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
334
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
335
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
336
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
337
- # acc *= tl.exp(dA_cs_last - dA_cs_m)[:, None]
338
- acc *= tl.exp(tl.minimum((dA_cs_last - dA_cs_m), 0.0))[:, None]
339
-
340
- x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
341
- x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
342
- ddt = tl.sum(acc * x, axis=1)
343
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
344
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
345
- ddA_cs = -(ddt * dt_m)
346
- ddA_cs_last = -tl.sum(ddA_cs)
347
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
348
- tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
349
- tl.atomic_add(ddA_cumsum_ptr + (chunk_size - 1) * stride_ddA_cs_csize, ddA_cs_last)
350
-
351
- dx = (acc * dt_m[:, None]).to(dx_ptr.dtype.element_ty)
352
- dx_ptr += pid_b * stride_dx_batch + pid_c * chunk_size * stride_dx_seqlen + pid_h * stride_dx_head
353
- dx_ptrs = dx_ptr + (offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim)
354
- tl.store(dx_ptrs, dx, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
355
-
356
-
357
- @triton.autotune(
358
- configs=[
359
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
360
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
361
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
362
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
363
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
364
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
365
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
366
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
367
- ],
368
- key=['chunk_size', 'dstate', 'hdim'],
369
- )
370
- @triton.jit
371
- def _chunk_state_bwd_db_kernel(
372
- # Pointers to matrices
373
- x_ptr, dstates_ptr, b_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr,
374
- db_ptr, ddA_cumsum_ptr,
375
- # Matrix dimensions
376
- chunk_size, dstate, hdim,
377
- batch, seqlen, nheads, nheads_per_program, ngroups,
378
- # Strides
379
- stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
380
- stride_dstates_batch, stride_dstates_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
381
- stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
382
- stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
383
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
384
- stride_seq_idx_batch, stride_seq_idx_seqlen,
385
- stride_db_batch, stride_db_seqlen, stride_db_split, stride_db_group, stride_db_dstate,
386
- stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
387
- # Meta-parameters
388
- HAS_DDA_CS: tl.constexpr,
389
- HAS_SEQ_IDX: tl.constexpr,
390
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
391
- ):
392
- pid_bc = tl.program_id(axis=1)
393
- pid_c = pid_bc // batch
394
- pid_b = pid_bc - pid_c * batch
395
- pid_sg = tl.program_id(axis=2)
396
- pid_s = pid_sg // ngroups
397
- pid_g = pid_sg - pid_s * ngroups
398
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
399
- pid_m = tl.program_id(axis=0) // num_pid_n
400
- pid_n = tl.program_id(axis=0) % num_pid_n
401
- x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_x_head
402
- db_ptr += pid_b * stride_db_batch + pid_c * chunk_size * stride_db_seqlen + pid_g * stride_db_group + pid_s * stride_db_split
403
- dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_states_head
404
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dt_head
405
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_dA_cs_head
406
- if HAS_DDA_CS:
407
- b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + pid_g * stride_b_head
408
- ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + (pid_g * (nheads // ngroups) + pid_s * nheads_per_program) * stride_ddA_cs_head
409
- if HAS_SEQ_IDX:
410
- seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
411
-
412
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
413
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
414
- offs_k = tl.arange(0, BLOCK_SIZE_K)
415
- x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_k[None, :] * stride_x_hdim)
416
- dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_states_dstate + offs_k[:, None] * stride_states_hdim)
417
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
418
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_m * stride_dA_cs_csize
419
- if HAS_DDA_CS:
420
- b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_n[None, :] * stride_b_dstate)
421
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
422
-
423
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
424
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
425
- if HAS_DDA_CS:
426
- b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
427
- if HAS_SEQ_IDX:
428
- seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
429
- seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
430
- nheads_iter = min(nheads_per_program, nheads // ngroups - pid_s * nheads_per_program)
431
- for h in range(nheads_iter):
432
- x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < hdim), other=0.0)
433
- dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < hdim) & (offs_n[None, :] < dstate), other=0.0)
434
- dstates = dstates.to(x_ptrs.dtype.element_ty)
435
- db = tl.dot(x, dstates)
436
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
437
- dA_cs_m = tl.load(dA_cumsum_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
438
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
439
- if not HAS_SEQ_IDX:
440
- # scale = tl.exp(dA_cs_last - dA_cs_m)
441
- scale = tl.exp(tl.minimum((dA_cs_last - dA_cs_m), 0.0))
442
- else:
443
- # scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
444
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(tl.minimum((dA_cs_last - dA_cs_m), 0.0)), 0.0)
445
- db *= (scale * dt_m)[:, None]
446
- if HAS_DDA_CS:
447
- # This is the gradient wrt (dA_cs_last - dA_cs_m), i.e. the exclusive reverse cumsum
448
- ddA_cs = tl.sum(db * b, axis=1)
449
- tl.atomic_add(ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1)
450
- acc += db
451
- x_ptrs += stride_x_head
452
- dstates_ptrs += stride_states_head
453
- dt_ptrs += stride_dt_head
454
- dA_cumsum_ptr += stride_dA_cs_head
455
- dA_cumsum_ptrs += stride_dA_cs_head
456
- if HAS_DDA_CS:
457
- ddA_cumsum_ptrs += stride_ddA_cs_head
458
-
459
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
460
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
461
- # if HAS_SEQ_IDX:
462
- # seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
463
- # seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
464
- # acc = tl.where(seq_idx_m[:, None] == seq_idx_last, acc, 0.0)
465
- db_ptrs = db_ptr + (offs_m[:, None] * stride_db_seqlen + offs_n[None, :] * stride_db_dstate)
466
- tl.store(db_ptrs, acc, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < dstate))
467
-
468
-
469
- @triton.autotune(
470
- configs=[
471
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
472
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
473
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
474
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
475
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
476
- # triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
477
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
478
- # triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
479
- # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
480
- triton.Config({'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
481
- triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
482
- triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
483
- triton.Config({'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=3, num_warps=4, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
484
- triton.Config({'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
485
- triton.Config({'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
486
- triton.Config({'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
487
- triton.Config({'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=8, pre_hook=init_to_zero(["ddA_cumsum_ptr"])),
488
- ],
489
- key=['chunk_size', 'hdim', 'dstate'],
490
- )
491
- @triton.jit
492
- def _chunk_state_bwd_ddAcs_stable_kernel(
493
- # Pointers to matrices
494
- x_ptr, b_ptr, dstates_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr,
495
- ddA_cumsum_ptr,
496
- # Matrix dimensions
497
- chunk_size, hdim, dstate,
498
- batch, seqlen, nheads_ngroups_ratio,
499
- # Strides
500
- stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
501
- stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
502
- stride_dstates_batch, stride_dstates_chunk, stride_states_head, stride_states_hdim, stride_states_dstate,
503
- stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
504
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
505
- stride_seq_idx_batch, stride_seq_idx_seqlen,
506
- stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head, stride_ddA_cs_csize,
507
- # Meta-parameters
508
- HAS_SEQ_IDX: tl.constexpr,
509
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
510
- BLOCK_SIZE_DSTATE: tl.constexpr,
511
- ):
512
- pid_bc = tl.program_id(axis=1)
513
- pid_c = pid_bc // batch
514
- pid_b = pid_bc - pid_c * batch
515
- pid_h = tl.program_id(axis=2)
516
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
517
- pid_m = tl.program_id(axis=0) // num_pid_n
518
- pid_n = tl.program_id(axis=0) % num_pid_n
519
- x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
520
- b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
521
- dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + pid_h * stride_states_head
522
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
523
- ddA_cumsum_ptr += pid_b * stride_ddA_cs_batch + pid_c * stride_ddA_cs_chunk + pid_h * stride_ddA_cs_head
524
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
525
- if HAS_SEQ_IDX:
526
- seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
527
-
528
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
529
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
530
-
531
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
532
- # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
533
- offs_k = tl.arange(0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
534
- b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_k[None, :] * stride_b_dstate)
535
- dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_states_hdim + offs_k[:, None] * stride_states_dstate)
536
- if BLOCK_SIZE_DSTATE <= 128:
537
- b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate), other=0.0)
538
- dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
539
- dstates = dstates.to(b_ptr.dtype.element_ty)
540
- acc = tl.dot(b, dstates)
541
- else:
542
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
543
- for k in range(0, dstate, BLOCK_SIZE_K):
544
- b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_k[None, :] < dstate - k), other=0.0)
545
- dstates = tl.load(dstates_ptrs, mask=(offs_k[:, None] < dstate - k) & (offs_n[None, :] < hdim), other=0.0)
546
- dstates = dstates.to(b_ptr.dtype.element_ty)
547
- acc += tl.dot(b, dstates)
548
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
549
- dstates_ptrs += BLOCK_SIZE_K * stride_states_dstate
550
-
551
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
552
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
553
-
554
- dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
555
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
556
- if not HAS_SEQ_IDX:
557
- # scale = tl.exp(dA_cs_last - dA_cs_m)
558
- scale = tl.exp(tl.minimum((dA_cs_last - dA_cs_m), 0.0))
559
- else:
560
- seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
561
- seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
562
- # scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
563
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(tl.minimum((dA_cs_last - dA_cs_m), 0.0)), 0.0)
564
- acc *= scale[:, None]
565
-
566
- x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
567
- x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
568
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
569
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size, other=0.0).to(tl.float32)
570
- ddt = tl.sum(acc * x, axis=1)
571
- # ddA_cs = -(ddt * dt_m)
572
- # Triton 2.2.0 errors if we have the cumsum here, so we just write it out
573
- # then call torch.cumsum outside this kernel.
574
- # ddA_cs = tl.cumsum(ddt * dt_m)
575
- ddA_cs = ddt * dt_m
576
- ddA_cumsum_ptrs = ddA_cumsum_ptr + offs_m * stride_ddA_cs_csize
577
- # tl.atomic_add(ddA_cumsum_ptrs, ddA_cs, mask=offs_m < chunk_size)
578
- tl.atomic_add(ddA_cumsum_ptrs + stride_ddA_cs_csize, ddA_cs, mask=offs_m < chunk_size - 1)
579
-
580
-
581
- @triton.autotune(
582
- configs=[
583
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8),
584
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
585
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
586
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
587
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
588
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4),
589
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
590
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=2),
591
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=2),
592
- ],
593
- key=['hdim', 'dstate', 'chunk_size'],
594
- )
595
- @triton.jit
596
- def _chunk_state_varlen_kernel(
597
- # Pointers to matrices
598
- x_ptr, b_ptr, dt_ptr, dA_cumsum_ptr, chunk_states_ptr, cu_seqlens_ptr, states_ptr,
599
- # Matrix dimensions
600
- hdim, dstate, chunk_size,
601
- seqlen, nheads_ngroups_ratio,
602
- # Strides
603
- stride_x_seqlen, stride_x_head, stride_x_hdim,
604
- stride_b_seqlen, stride_b_head, stride_b_dstate,
605
- stride_dt_chunk, stride_dt_head, stride_dt_csize,
606
- stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
607
- stride_chunk_states_chunk, stride_chunk_states_head, stride_chunk_states_hdim, stride_chunk_states_dstate,
608
- stride_states_batch, stride_states_head, stride_states_hdim, stride_states_dstate,
609
- # Meta-parameters
610
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
611
- ):
612
- pid_b = tl.program_id(axis=1)
613
- pid_h = tl.program_id(axis=2)
614
- num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
615
- pid_m = tl.program_id(axis=0) // num_pid_n
616
- pid_n = tl.program_id(axis=0) % num_pid_n
617
- end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
618
- pid_c = (end_idx - 1) // chunk_size
619
- b_ptr += pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
620
- x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
621
- dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
622
- dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
623
- chunk_states_ptr += pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
624
-
625
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
626
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
627
- offs_k = tl.arange(0, BLOCK_SIZE_K)
628
- x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen)
629
- b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen)
630
- dt_ptrs = dt_ptr + offs_k * stride_dt_csize
631
- dA_cs_last = tl.load(dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
632
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
633
-
634
- chunk_size_limit = end_idx - pid_c * chunk_size
635
- start_idx = tl.load(cu_seqlens_ptr + pid_b)
636
- start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0)
637
-
638
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
639
- for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
640
- x = tl.load(x_ptrs, mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k) & (offs_k[None, :] >= start_idx_cur - k), other=0.0)
641
- b = tl.load(b_ptrs, mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate) & (offs_k[:, None] >= start_idx_cur - k), other=0.0).to(tl.float32)
642
- dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
643
- dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(tl.float32)
644
- # scale = tl.where((offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
645
- # tl.exp((dA_cs_last - dA_cs_k)) * dt_k, 0.0)
646
- scale = tl.where((offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
647
- tl.exp(tl.minimum((dA_cs_last - dA_cs_k), 0.0)) * dt_k, 0.0)
648
- b *= scale[:, None]
649
- b = b.to(x_ptr.dtype.element_ty)
650
- acc += tl.dot(x, b)
651
- x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
652
- b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
653
- dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
654
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
655
-
656
- # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk
657
- if start_idx < pid_c * chunk_size:
658
- chunk_states_ptrs = chunk_states_ptr + (offs_m[:, None] * stride_chunk_states_hdim + offs_n[None, :] * stride_chunk_states_dstate)
659
- chunk_states = tl.load(chunk_states_ptrs, mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
660
- # scale = tl.where(start_idx < pid_c * chunk_size, tl.exp(dA_cs_last), 0.0)
661
- scale = tl.exp(dA_cs_last)
662
- acc += chunk_states * scale
663
-
664
- states = acc.to(states_ptr.dtype.element_ty)
665
-
666
- states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
667
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
668
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
669
- states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate)
670
- c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
671
- tl.store(states_ptrs, states, mask=c_mask)
672
-
673
-
674
- def _chunk_cumsum_fwd(dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
675
- batch, seqlen, nheads = dt.shape
676
- assert A.shape == (nheads,)
677
- if dt_bias is not None:
678
- assert dt_bias.shape == (nheads,)
679
- nchunks = math.ceil(seqlen / chunk_size)
680
- dt_out = torch.empty(batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32)
681
- dA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32)
682
- grid_chunk_cs = lambda META: (batch, nchunks, triton.cdiv(nheads, META['BLOCK_SIZE_H']))
683
- with torch.cuda.device(dt.device.index):
684
- _chunk_cumsum_fwd_kernel[grid_chunk_cs](
685
- dt, A, dt_bias, dt_out, dA_cumsum,
686
- batch, seqlen, nheads, chunk_size,
687
- dt_limit[0], dt_limit[1],
688
- dt.stride(0), dt.stride(1), dt.stride(2),
689
- A.stride(0),
690
- dt_bias.stride(0) if dt_bias is not None else 0,
691
- dt_out.stride(0), dt_out.stride(2), dt_out.stride(1), dt_out.stride(3),
692
- dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
693
- dt_softplus,
694
- HAS_DT_BIAS=dt_bias is not None,
695
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
696
- )
697
- return dA_cumsum, dt_out
698
-
699
-
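For orientation, here is a minimal pure-PyTorch sketch of what `_chunk_cumsum_fwd` computes (shapes and the softplus/clamp handling follow the kernel launch above; this is an illustrative reference under those assumptions, not the kernel itself):

    import math
    import torch
    import torch.nn.functional as F
    from einops import rearrange

    def chunk_cumsum_ref(dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
        # dt: (batch, seqlen, nheads), A: (nheads,) -> both outputs: (batch, nheads, nchunks, chunk_size)
        batch, seqlen, nheads = dt.shape
        nchunks = math.ceil(seqlen / chunk_size)
        dt = dt.float()
        if dt_bias is not None:
            dt = dt + dt_bias.float()
        if dt_softplus:
            dt = F.softplus(dt)
        dt = dt.clamp(min=dt_limit[0], max=dt_limit[1])
        # Pad the sequence up to a whole number of chunks; padded steps contribute dt = 0.
        dt = F.pad(dt, (0, 0, 0, nchunks * chunk_size - seqlen))
        dt_out = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
        dA_cumsum = torch.cumsum(dt_out * A.float()[None, :, None, None], dim=-1)
        return dA_cumsum, dt_out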
700
- def _chunk_cumsum_bwd(ddA, ddt_out, dt, A, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf")), ddt=None):
701
- batch, seqlen, nheads = dt.shape
702
- _, _, nchunks, chunk_size = ddA.shape
703
- assert ddA.shape == (batch, nheads, nchunks, chunk_size)
704
- assert ddt_out.shape == (batch, nheads, nchunks, chunk_size)
705
- assert A.shape == (nheads,)
706
- if dt_bias is not None:
707
- assert dt_bias.shape == (nheads,)
708
- ddt_bias = torch.empty_like(dt_bias, dtype=torch.float32)
709
- else:
710
- ddt_bias = None
711
- if ddt is not None:
712
- assert ddt.shape == dt.shape
713
- else:
714
- ddt = torch.empty_like(dt)
715
- dA = torch.empty_like(A, dtype=torch.float32)
716
- grid_chunk_cs = lambda META: (batch, nchunks, triton.cdiv(nheads, META['BLOCK_SIZE_H']))
717
- with torch.cuda.device(dt.device.index):
718
- _chunk_cumsum_bwd_kernel[grid_chunk_cs](
719
- ddA, ddt_out, dt, A, dt_bias, ddt, dA, ddt_bias,
720
- batch, seqlen, nheads, chunk_size,
721
- dt_limit[0], dt_limit[1],
722
- ddA.stride(0), ddA.stride(2), ddA.stride(1), ddA.stride(3),
723
- ddt_out.stride(0), ddt_out.stride(2), ddt_out.stride(1), ddt_out.stride(3),
724
- dt.stride(0), dt.stride(1), dt.stride(2),
725
- A.stride(0),
726
- dt_bias.stride(0) if dt_bias is not None else 0,
727
- ddt.stride(0), ddt.stride(1), ddt.stride(2),
728
- dA.stride(0),
729
- ddt_bias.stride(0) if ddt_bias is not None else 0,
730
- dt_softplus,
731
- HAS_DT_BIAS=dt_bias is not None,
732
- BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
733
- )
734
- return ddt, dA, ddt_bias
735
-
736
-
737
- def _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=None, states=None, states_in_fp32=True):
738
- batch, seqlen, nheads, headdim = x.shape
739
- _, _, nchunks, chunk_size = dt.shape
740
- _, _, ngroups, dstate = B.shape
741
- assert nheads % ngroups == 0
742
- assert B.shape == (batch, seqlen, ngroups, dstate)
743
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
744
- assert dA_cumsum.shape == dt.shape
745
- if seq_idx is not None:
746
- assert seq_idx.shape == (batch, seqlen)
747
- if states is not None:
748
- assert states.shape == (batch, nchunks, nheads, headdim, dstate)
749
- else:
750
- states_dtype = torch.float32 if states_in_fp32 else B.dtype
751
- states = torch.empty((batch, nchunks, nheads, headdim, dstate), device=x.device, dtype=states_dtype)
752
- grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
753
- batch * nchunks, nheads)
754
- with torch.cuda.device(x.device.index):
755
- _chunk_state_fwd_kernel[grid](
756
- x, B, states, dt, dA_cumsum, seq_idx,
757
- headdim, dstate, chunk_size,
758
- batch, seqlen, nheads // ngroups,
759
- x.stride(0), x.stride(1), x.stride(2), x.stride(3),
760
- B.stride(0), B.stride(1), B.stride(2), B.stride(-1),
761
- states.stride(0), states.stride(1), states.stride(2), states.stride(3), states.stride(4),
762
- dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
763
- dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
764
- *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
765
- HAS_SEQ_IDX=seq_idx is not None,
766
- )
767
- return states
768
-
769
-
770
- def _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates, dx=None):
771
- batch, seqlen, nheads, headdim = x.shape
772
- _, _, nchunks, chunk_size = dt.shape
773
- _, _, ngroups, dstate = B.shape
774
- assert nheads % ngroups == 0
775
- assert B.shape == (batch, seqlen, ngroups, dstate)
776
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
777
- assert dA_cumsum.shape == dt.shape
778
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
779
- if dx is not None:
780
- assert dx.shape == x.shape
781
- else:
782
- dx = torch.empty_like(x)
783
- ddt = torch.empty(batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32)
784
- ddA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=dA_cumsum.device, dtype=torch.float32)
785
- grid_dx = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
786
- batch * nchunks, nheads)
787
- with torch.cuda.device(x.device.index):
788
- _chunk_state_bwd_dx_kernel[grid_dx](
789
- x, B, dstates, dt, dA_cumsum, dx, ddt, ddA_cumsum,
790
- chunk_size, headdim, dstate,
791
- batch, seqlen, nheads // ngroups,
792
- x.stride(0), x.stride(1), x.stride(2), x.stride(3),
793
- B.stride(0), B.stride(1), B.stride(2), B.stride(-1),
794
- dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
795
- dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
796
- dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
797
- dx.stride(0), dx.stride(1), dx.stride(2), dx.stride(3),
798
- ddt.stride(0), ddt.stride(2), ddt.stride(1), ddt.stride(3),
799
- ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3),
800
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
801
- )
802
- return dx, ddt.to(dt.dtype), ddA_cumsum.to(dA_cumsum.dtype)
803
-
804
-
805
- def _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=None, B=None, ngroups=1):
806
- batch, seqlen, nheads, headdim = x.shape
807
- _, _, nchunks, chunk_size = dt.shape
808
- dstate = dstates.shape[-1]
809
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
810
- assert dA_cumsum.shape == dt.shape
811
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
812
- if seq_idx is not None:
813
- assert seq_idx.shape == (batch, seqlen)
814
- if B is not None:
815
- assert B.shape == (batch, seqlen, ngroups, dstate)
816
- B_strides = (B.stride(0), B.stride(1), B.stride(2), B.stride(3))
817
- # Use torch.empty since the Triton kernel will call init_to_zero
818
- ddA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32)
819
- ddA_cumsum_strides = (ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3))
820
- else:
821
- B_strides = (0, 0, 0, 0)
822
- ddA_cumsum = None
823
- ddA_cumsum_strides = (0, 0, 0, 0)
824
- nheads_ngroups_ratio = nheads // ngroups
825
- sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
826
- nheads_per_program = max(min(math.ceil(batch * nchunks * nheads / sm_count), nheads_ngroups_ratio), 1)
827
- nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)
828
- dB = torch.empty(batch, seqlen, nsplits, ngroups, dstate, device=x.device, dtype=torch.float32)
829
- grid_db = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
830
- batch * nchunks, nsplits * ngroups)
831
- with torch.cuda.device(x.device.index):
832
- _chunk_state_bwd_db_kernel[grid_db](
833
- x, dstates, B, dt, dA_cumsum, seq_idx, dB, ddA_cumsum,
834
- chunk_size, dstate, headdim,
835
- batch, seqlen, nheads, nheads_per_program, ngroups,
836
- x.stride(0), x.stride(1), x.stride(2), x.stride(3),
837
- dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
838
- *B_strides,
839
- dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
840
- dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
841
- *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
842
- dB.stride(0), dB.stride(1), dB.stride(2), dB.stride(3), dB.stride(4),
843
- *ddA_cumsum_strides,
844
- HAS_DDA_CS=ddA_cumsum is not None,
845
- HAS_SEQ_IDX=seq_idx is not None,
846
- BLOCK_SIZE_K=max(triton.next_power_of_2(headdim), 16),
847
- )
848
- dB = dB.sum(2)
849
- if ddA_cumsum is not None:
850
- # The first element of ddA_cumsum is always zero, since that dA_cumsum does not contribute
851
- # to the state of the chunk.
852
- # torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
853
- # But it's easier to just do the cumsum for all elements, the result will be the same.
854
- torch.cumsum(ddA_cumsum, dim=-1, out=ddA_cumsum)
855
- return dB if B is None else (dB, ddA_cumsum)
856
-
857
-
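The split over `nsplits` in `_chunk_state_bwd_db` above exists because each (group, split) program accumulates the dB contribution of `nheads_per_program` heads, and the partial results are reduced afterwards with `dB.sum(2)`; the split count is chosen so the launch grid roughly saturates the GPU. A worked example with made-up sizes:

    import math
    import triton

    batch, nchunks, nheads, ngroups = 1, 8, 64, 1      # made-up sizes
    sm_count = 108                                      # e.g. an A100

    nheads_ngroups_ratio = nheads // ngroups                            # 64
    nheads_per_program = max(min(math.ceil(batch * nchunks * nheads / sm_count),
                                 nheads_ngroups_ratio), 1)              # ceil(512 / 108) = 5
    nsplits = triton.cdiv(nheads_ngroups_ratio, nheads_per_program)     # ceil(64 / 5) = 13
    # dB is then allocated as (batch, seqlen, 13, ngroups, dstate) and summed over dim 2.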
858
- def _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=None):
859
- batch, seqlen, nheads, headdim = x.shape
860
- _, _, nchunks, chunk_size = dt.shape
861
- _, _, ngroups, dstate = B.shape
862
- assert nheads % ngroups == 0
863
- assert B.shape == (batch, seqlen, ngroups, dstate)
864
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
865
- assert dA_cumsum.shape == dt.shape
866
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
867
- if seq_idx is not None:
868
- assert seq_idx.shape == (batch, seqlen)
869
- # Use torch.empty since the Triton kernel will call init_to_zero
870
- ddA_cumsum = torch.empty(batch, nheads, nchunks, chunk_size, device=x.device, dtype=torch.float32)
871
- grid_ddtcs = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
872
- batch * nchunks, nheads)
873
- with torch.cuda.device(x.device.index):
874
- _chunk_state_bwd_ddAcs_stable_kernel[grid_ddtcs](
875
- x, B, dstates, dt, dA_cumsum, seq_idx, ddA_cumsum,
876
- chunk_size, headdim, dstate,
877
- batch, seqlen, nheads // ngroups,
878
- x.stride(0), x.stride(1), x.stride(2), x.stride(3),
879
- B.stride(0), B.stride(1), B.stride(2), B.stride(-1),
880
- dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
881
- dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
882
- dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
883
- *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
884
- ddA_cumsum.stride(0), ddA_cumsum.stride(2), ddA_cumsum.stride(1), ddA_cumsum.stride(3),
885
- HAS_SEQ_IDX=seq_idx is not None,
886
- BLOCK_SIZE_M=max(triton.next_power_of_2(chunk_size), 16),
887
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
888
- )
889
- torch.cumsum(ddA_cumsum[..., 1:], dim=-1, out=ddA_cumsum[..., 1:])
890
- return ddA_cumsum
891
-
892
-
893
- def chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states):
894
- total_seqlen, nheads, headdim = x.shape
895
- _, nchunks, chunk_size = dt.shape
896
- _, ngroups, dstate = B.shape
897
- batch = cu_seqlens.shape[0] - 1
898
- cu_seqlens = cu_seqlens.contiguous()
899
- assert nheads % ngroups == 0
900
- assert B.shape == (total_seqlen, ngroups, dstate)
901
- assert dt.shape == (nheads, nchunks, chunk_size)
902
- assert dA_cumsum.shape == dt.shape
903
- assert chunk_states.shape == (nchunks, nheads, headdim, dstate)
904
- states = torch.empty(batch, nheads, headdim, dstate, dtype=chunk_states.dtype, device=chunk_states.device)
905
- grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.cdiv(dstate, META['BLOCK_SIZE_N']),
906
- batch, nheads)
907
- with torch.cuda.device(x.device.index):
908
- _chunk_state_varlen_kernel[grid](
909
- x, B, dt, dA_cumsum, chunk_states, cu_seqlens, states,
910
- headdim, dstate, chunk_size,
911
- total_seqlen, nheads // ngroups,
912
- x.stride(0), x.stride(1), x.stride(2),
913
- B.stride(0), B.stride(1), B.stride(2),
914
- dt.stride(1), dt.stride(0), dt.stride(2),
915
- dA_cumsum.stride(1), dA_cumsum.stride(0), dA_cumsum.stride(2),
916
- chunk_states.stride(0), chunk_states.stride(1), chunk_states.stride(2), chunk_states.stride(3),
917
- states.stride(0), states.stride(1), states.stride(2), states.stride(3),
918
- )
919
- return states
920
-
921
-
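A small usage sketch for the varlen path (shapes follow the assertions in `chunk_state_varlen` above; the concrete sizes are made up for illustration):

    import torch
    import torch.nn.functional as F

    # Three sequences of lengths 5, 9 and 2 packed along one length-16 stream
    # (the batch dimension has already been squeezed away upstream).
    seqlens = torch.tensor([5, 9, 2])
    cu_seqlens = F.pad(seqlens.cumsum(0), (1, 0)).to(torch.int32).cuda()   # tensor([0, 5, 14, 16])

    # With x: (16, nheads, headdim), B: (16, ngroups, dstate), dt and dA_cumsum: (nheads, nchunks, chunk_size),
    # and chunk_states: (nchunks, nheads, headdim, dstate) from the chunked forward pass,
    #   states = chunk_state_varlen(B, x, dt, dA_cumsum, cu_seqlens, chunk_states)
    # returns a (3, nheads, headdim, dstate) tensor holding the final SSM state of each packed sequence.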
922
- class ChunkStateFn(torch.autograd.Function):
923
-
924
- @staticmethod
925
- def forward(ctx, B, x, dt, dA_cumsum, states_in_fp32=True):
926
- batch, seqlen, nheads, headdim = x.shape
927
- _, _, nchunks, chunk_size = dt.shape
928
- assert seqlen <= nchunks * chunk_size
929
- _, _, ngroups, dstate = B.shape
930
- assert B.shape == (batch, seqlen, ngroups, dstate)
931
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
932
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
933
- if B.stride(-1) != 1:
934
- B = B.contiguous()
935
- if x.stride(-1) != 1 and x.stride(1) != 1: # Either M or K dimension should be contiguous
936
- x = x.contiguous()
937
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=states_in_fp32)
938
- ctx.save_for_backward(B, x, dt, dA_cumsum)
939
- return states
940
-
941
- @staticmethod
942
- def backward(ctx, dstates):
943
- B, x, dt, dA_cumsum = ctx.saved_tensors
944
- batch, seqlen, nheads, headdim = x.shape
945
- _, _, nchunks, chunk_size = dt.shape
946
- _, _, ngroups, dstate = B.shape
947
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
948
- if dstates.stride(-1) != 1:
949
- dstates = dstates.contiguous()
950
- dx, ddt, ddA_cumsum = _chunk_state_bwd_dx(B, x, dt, dA_cumsum, dstates)
951
- dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, ngroups=ngroups)
952
- dB = dB.to(B.dtype)
953
- return dB, dx, ddt, ddA_cumsum, None
954
-
955
-
956
- def chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True):
957
- """
958
- Argument:
959
-        B: (batch, seqlen, ngroups, dstate)
960
- x: (batch, seqlen, nheads, headdim)
961
- dt: (batch, nheads, nchunks, chunk_size)
962
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
963
- Return:
964
- states: (batch, nchunks, nheads, headdim, dstate)
965
- """
966
- return ChunkStateFn.apply(B, x, dt, dA_cumsum, states_in_fp32)
967
-
968
-
969
- def chunk_state_ref(B, x, dt, dA_cumsum):
970
- """
971
- Argument:
972
-        B: (batch, seqlen, ngroups, dstate)
973
- x: (batch, seqlen, nheads, headdim)
974
- dt: (batch, nheads, nchunks, chunk_size)
975
- dA_cumsum: (batch, nheads, nchunks, chunk_size)
976
- Return:
977
- states: (batch, nchunks, nheads, headdim, dstate)
978
- """
979
- # Check constraints.
980
- batch, seqlen, nheads, headdim = x.shape
981
- dstate = B.shape[-1]
982
- _, _, nchunks, chunk_size = dt.shape
983
- assert seqlen <= nchunks * chunk_size
984
- assert x.shape == (batch, seqlen, nheads, headdim)
985
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
986
- ngroups = B.shape[2]
987
- assert nheads % ngroups == 0
988
- assert B.shape == (batch, seqlen, ngroups, dstate)
989
- B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
990
- assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
991
- if seqlen < nchunks * chunk_size:
992
- x = F.pad(x, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
993
- B = F.pad(B, (0, 0, 0, 0, 0, nchunks * chunk_size - seqlen))
994
- x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size)
995
- B = rearrange(B, "b (c l) ... -> b c l ...", l=chunk_size)
996
- decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum))
997
- return torch.einsum("bclhn,bhcl,bhcl,bclhp->bchpn", B.to(x.dtype), decay_states.to(x.dtype), dt.to(x.dtype), x)
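A quick numerical sanity check of the Triton path against the reference above (a rough sketch assuming a CUDA device and small made-up sizes; `chunk_state` and `chunk_state_ref` are the functions defined in this file):

    import torch

    batch, seqlen, nheads, headdim = 2, 512, 8, 64
    ngroups, dstate, chunk_size = 1, 16, 256
    nchunks = seqlen // chunk_size

    x = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.bfloat16)
    B = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=torch.bfloat16)
    dt = torch.rand(batch, nheads, nchunks, chunk_size, device="cuda", dtype=torch.float32)
    dA_cumsum = torch.cumsum(-0.1 * dt, dim=-1)        # a plausible (negative) cumsum of A * dt

    states = chunk_state(B, x, dt, dA_cumsum)          # Triton kernel path
    states_ref = chunk_state_ref(B, x, dt, dA_cumsum)  # einsum reference
    print((states - states_ref).abs().max())           # should agree up to bf16 rounding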
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_combined.py DELETED
@@ -1,998 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- from typing import Optional
7
-
8
- import math
9
- from packaging import version
10
-
11
- import torch
12
- import torch.nn.functional as F
13
- from torch import Tensor
14
- from ...utils.torch import custom_bwd, custom_fwd
15
-
16
- import triton
17
- import triton.language as tl
18
-
19
- from einops import rearrange, repeat
20
-
21
- try:
22
- from causal_conv1d import causal_conv1d_fn
23
- from causal_conv1d.causal_conv1d_interface import causal_conv1d_cuda
24
- except ImportError:
25
- causal_conv1d_fn = None
26
- causal_conv1d_cuda = None
27
-
28
- from .ssd_bmm import _bmm_chunk_fwd, _bmm_chunk_bwd
29
- from .ssd_chunk_state import _chunk_cumsum_fwd, _chunk_cumsum_bwd
30
- from .ssd_chunk_state import _chunk_state_fwd, _chunk_state_bwd_db
31
- from .ssd_chunk_state import _chunk_state_bwd_ddAcs_stable
32
- from .ssd_chunk_state import chunk_state, chunk_state_ref
33
- from .ssd_chunk_state import chunk_state_varlen
34
- from .ssd_state_passing import _state_passing_fwd, _state_passing_bwd
35
- from .ssd_state_passing import state_passing, state_passing_ref
36
- from .ssd_chunk_scan import _chunk_scan_fwd, _chunk_scan_bwd_dz, _chunk_scan_bwd_dstates
37
- from .ssd_chunk_scan import _chunk_scan_bwd_dC, _chunk_scan_bwd_dcb
38
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_stable
39
- from .ssd_chunk_scan import chunk_scan, chunk_scan_ref
40
- from .ssd_chunk_scan import _chunk_scan_bwd_ddAcs_prev
41
- from .layernorm_gated import rmsnorm_fn, _layer_norm_fwd, _layer_norm_bwd
42
- from .k_activations import _swiglu_fwd, _swiglu_bwd
43
-
44
- TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
45
-
46
-
47
- def init_to_zero(names):
48
- return lambda nargs: [nargs[name].zero_() for name in names if nargs[name] is not None]
49
-
50
-
51
- def rearrange_and_update_stride(tensor, pattern=None, dim=2):
52
- # ensure tensor.stride(dim) is a multiple of eight after rearranging according to pattern,
53
- # if not call contiguous(), rearrange only if pattern is not None
54
- tensor_rearranged = rearrange(tensor, pattern) if pattern is not None else tensor
55
- return tensor_rearranged.contiguous() if tensor_rearranged.stride(dim) % 8 != 0 else tensor_rearranged
56
-
57
-
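Two notes on the helpers above (a sketch under stated assumptions, not part of the original file): `init_to_zero` is installed as an autotune `pre_hook` because several kernels below accumulate into their outputs with `tl.atomic_add`, so buffers allocated with `torch.empty` must be re-zeroed before every autotuning run; `rearrange_and_update_stride` falls back to `.contiguous()` when the requested stride is not a multiple of 8, presumably to keep that dimension's stride well aligned for the Triton matmuls. For example:

    import torch

    buf = torch.randn(2, 100, 70, device="cuda")
    z = buf[..., :64]                                  # a strided view: z.stride(1) == 70
    z_aligned = rearrange_and_update_stride(z, dim=1)
    assert z_aligned.stride(1) == 64                   # copied, stride now a multiple of 8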
58
- @triton.autotune(
59
- configs=[
60
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64}, num_stages=3, num_warps=8, pre_hook=init_to_zero(["ddt_ptr"])),
61
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
62
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
63
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
64
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
65
- triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
66
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
67
- triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=5, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
68
- triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32}, num_stages=4, num_warps=4, pre_hook=init_to_zero(["ddt_ptr"])),
69
- ],
70
- key=['chunk_size', 'hdim', 'dstate'],
71
- )
72
- @triton.jit
73
- def _chunk_scan_chunk_state_bwd_dx_kernel(
74
- # Pointers to matrices
75
- x_ptr, cb_ptr, dout_ptr, dt_ptr, dA_cumsum_ptr, seq_idx_ptr, D_ptr,
76
- b_ptr, dstates_ptr,
77
- dx_ptr, ddt_ptr, dD_ptr,
78
- # Matrix dimensions
79
- chunk_size, hdim, dstate,
80
- batch, seqlen, nheads_ngroups_ratio,
81
- # Strides
82
- stride_x_batch, stride_x_seqlen, stride_x_head, stride_x_hdim,
83
- stride_cb_batch, stride_cb_chunk, stride_cb_head, stride_cb_csize_m, stride_cb_csize_k,
84
- stride_dout_batch, stride_dout_seqlen, stride_dout_head, stride_dout_hdim,
85
- stride_dt_batch, stride_dt_chunk, stride_dt_head, stride_dt_csize,
86
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, stride_dA_cs_csize,
87
- stride_seq_idx_batch, stride_seq_idx_seqlen,
88
- stride_D_head,
89
- stride_b_batch, stride_b_seqlen, stride_b_head, stride_b_dstate,
90
- stride_dstates_batch, stride_dstates_chunk, stride_dstates_head, stride_dstates_hdim, stride_dstates_dstate,
91
- stride_dx_batch, stride_dx_seqlen, stride_dx_head, stride_dx_hdim,
92
- stride_ddt_batch, stride_ddt_chunk, stride_ddt_head, stride_ddt_csize,
93
- stride_dD_batch, stride_dD_chunk, stride_dD_head, stride_dD_csize, stride_dD_hdim,
94
- # Meta-parameters
95
- HAS_D: tl.constexpr,
96
- D_HAS_HDIM: tl.constexpr,
97
- HAS_SEQ_IDX: tl.constexpr,
98
- BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
99
- BLOCK_SIZE_DSTATE: tl.constexpr,
100
- IS_TRITON_22: tl.constexpr,
101
- ):
102
- pid_bc = tl.program_id(axis=1)
103
- pid_c = pid_bc // batch
104
- pid_b = pid_bc - pid_c * batch
105
- pid_h = tl.program_id(axis=2)
106
- num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
107
- pid_m = tl.program_id(axis=0) // num_pid_n
108
- pid_n = tl.program_id(axis=0) % num_pid_n
109
- x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
110
- cb_ptr += pid_b * stride_cb_batch + pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head
111
- dout_ptr += pid_b * stride_dout_batch + pid_c * chunk_size * stride_dout_seqlen + pid_h * stride_dout_head
112
- dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
113
- ddt_ptr += pid_b * stride_ddt_batch + pid_c * stride_ddt_chunk + pid_h * stride_ddt_head
114
- dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
115
- b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + (pid_h // nheads_ngroups_ratio) * stride_b_head
116
- dstates_ptr += pid_b * stride_dstates_batch + pid_c * stride_dstates_chunk + pid_h * stride_dstates_head
117
- if HAS_SEQ_IDX:
118
- seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
119
-
120
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
121
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
122
-
123
- chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
124
-
125
- acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
126
-
127
- dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
128
-
129
- dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32)
130
- if not HAS_SEQ_IDX:
131
- # scale = tl.exp(dA_cs_last - dA_cs_m)
132
- scale = tl.exp(tl.minimum((dA_cs_last - dA_cs_m), 0.0))
133
- else:
134
- seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, mask=offs_m < chunk_size_limit, other=-1)
135
- seq_idx_last = tl.load(seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen)
136
- # scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(dA_cs_last - dA_cs_m), 0.0)
137
- scale = tl.where(seq_idx_m == seq_idx_last, tl.exp(tl.minimum((dA_cs_last - dA_cs_m), 0.0)), 0.0)
138
- # Might be faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
139
- # However, we're getting error with the Triton compiler 2.1.0 for that code path:
140
- # Unexpected mma -> mma layout conversion
141
- # Triton 2.2.0 fixes this
142
- offs_dstate = tl.arange(0, BLOCK_SIZE_DSTATE if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K)
143
- b_ptrs = b_ptr + (offs_m[:, None] * stride_b_seqlen + offs_dstate[None, :] * stride_b_dstate)
144
- dstates_ptrs = dstates_ptr + (offs_n[None, :] * stride_dstates_hdim + offs_dstate[:, None] * stride_dstates_dstate)
145
- if IS_TRITON_22 and BLOCK_SIZE_DSTATE <= 128:
146
- b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate), other=0.0)
147
- dstates = tl.load(dstates_ptrs, mask=(offs_dstate[:, None] < dstate) & (offs_n[None, :] < hdim), other=0.0)
148
- dstates = dstates.to(b_ptr.dtype.element_ty)
149
- acc = tl.dot(b, dstates) * scale[:, None]
150
- else:
151
- for k in range(0, dstate, BLOCK_SIZE_K):
152
- b = tl.load(b_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_dstate[None, :] < dstate - k), other=0.0)
153
- dstates = tl.load(dstates_ptrs, mask=(offs_dstate[:, None] < dstate - k) & (offs_n[None, :] < hdim), other=0.0)
154
- dstates = dstates.to(b_ptr.dtype.element_ty)
155
- acc += tl.dot(b, dstates)
156
- b_ptrs += BLOCK_SIZE_K * stride_b_dstate
157
- dstates_ptrs += BLOCK_SIZE_K * stride_dstates_dstate
158
- acc *= scale[:, None]
159
-
160
- # x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
161
- # x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
162
- # dt_ptrs = dt_ptr + offs_m * stride_dt_csize
163
- # dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
164
- # ddt = tl.sum(acc * x, axis=1) * dt_m
165
- # ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
166
- # tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
167
-
168
- offs_k = tl.arange(0, BLOCK_SIZE_K)
169
- cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k)
170
- dout_ptrs = dout_ptr + (offs_k[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
171
- dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
172
- K_MAX = chunk_size_limit
173
- K_MIN = pid_m * BLOCK_SIZE_M
174
- cb_ptrs += K_MIN * stride_cb_csize_k
175
- dout_ptrs += K_MIN * stride_dout_seqlen
176
- dA_cumsum_ptrs += K_MIN * stride_dA_cs_csize
177
- for k in range(K_MIN, K_MAX, BLOCK_SIZE_K):
178
- k = tl.multiple_of(k, BLOCK_SIZE_K)
179
- # For some reason setting mask to (offs_m[:, None] < chunk_size_limit) is much slower
180
- cb = tl.load(cb_ptrs, mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < K_MAX - k), other=0.0)
181
- dout = tl.load(dout_ptrs, mask=(offs_k[:, None] < K_MAX - k) & (offs_n[None, :] < hdim), other=0.0)
182
- dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < K_MAX - k, other=0.0).to(tl.float32)
183
- # cb *= tl.exp(dA_cs_k[None, :] - dA_cs_m[:, None])
184
- cb *= tl.exp(tl.minimum((dA_cs_k[None, :] - dA_cs_m[:, None]), 0.0))
185
- # If we don't have the (k + offs_k[None, :] < K_MAX) mask, for indices outside this range,
186
- # we might have dA_cs_m = 0.0 and dA_cs_k very negative, and tl.exp will return inf.
187
- # Multiplying with cb, which is 0.0 outside the range, will make the result NaN.
188
- # This will cause NaN in acc, and hence NaN in dx and ddt.
189
- mask = (k + offs_k[None, :] >= offs_m[:, None]) & (k + offs_k[None, :] < K_MAX)
190
- cb = tl.where(mask, cb, 0.0)
191
- cb = cb.to(dout_ptr.dtype.element_ty)
192
- acc += tl.dot(cb, dout)
193
- cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
194
- dout_ptrs += BLOCK_SIZE_K * stride_dout_seqlen
195
- dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
196
-
197
- offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
198
- offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
199
- dt_ptrs = dt_ptr + offs_m * stride_dt_csize
200
- dt_m = tl.load(dt_ptrs, mask=offs_m < chunk_size_limit, other=0.0).to(tl.float32)
201
- dx = acc * dt_m[:, None]
202
- dx_ptr += pid_b * stride_dx_batch + pid_c * chunk_size * stride_dx_seqlen + pid_h * stride_dx_head
203
- dx_ptrs = dx_ptr + (offs_m[:, None] * stride_dx_seqlen + offs_n[None, :] * stride_dx_hdim)
204
- if HAS_D:
205
- dout_res_ptrs = dout_ptr + (offs_m[:, None] * stride_dout_seqlen + offs_n[None, :] * stride_dout_hdim)
206
- dout_res = tl.load(dout_res_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
207
- if D_HAS_HDIM:
208
- D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0).to(tl.float32)
209
- else:
210
- D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
211
- dx += dout_res * D
212
- tl.store(dx_ptrs, dx, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim))
213
-
214
- x_ptrs = x_ptr + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim)
215
- x = tl.load(x_ptrs, mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), other=0.0).to(tl.float32)
216
- if HAS_D:
217
- dD_ptr += pid_b * stride_dD_batch + pid_c * stride_dD_chunk + pid_h * stride_dD_head + pid_m * stride_dD_csize
218
- if D_HAS_HDIM:
219
- dD_ptrs = dD_ptr + offs_n * stride_dD_hdim
220
- dD = tl.sum(dout_res * x, axis=0)
221
- tl.store(dD_ptrs, dD, mask=offs_n < hdim)
222
- else:
223
- dD = tl.sum(dout_res * x)
224
- tl.store(dD_ptr, dD)
225
- ddt = tl.sum(acc * x, axis=1)
226
- ddt_ptrs = ddt_ptr + offs_m * stride_ddt_csize
227
- tl.atomic_add(ddt_ptrs, ddt, mask=offs_m < chunk_size)
228
-
229
-
230
- def _chunk_scan_chunk_state_bwd_dx(x, dt, dA_cumsum, B, CB, dout, dstates, D=None, seq_idx=None, dx=None):
231
- batch, seqlen, nheads, headdim = x.shape
232
- _, _, nchunks, chunk_size = dt.shape
233
- _, _, ngroups, dstate = B.shape
234
- assert nheads % ngroups == 0
235
- assert B.shape == (batch, seqlen, ngroups, dstate)
236
- assert CB.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
237
- assert dt.shape == (batch, nheads, nchunks, chunk_size)
238
- assert dA_cumsum.shape == dt.shape
239
- assert dout.shape == x.shape
240
- assert dstates.shape == (batch, nchunks, nheads, headdim, dstate)
241
- if seq_idx is not None:
242
- assert seq_idx.shape == (batch, seqlen)
243
- if D is not None:
244
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
245
- assert D.stride(-1) == 1
246
- BLOCK_SIZE_min = 32
247
- dD = torch.empty(triton.cdiv(chunk_size, BLOCK_SIZE_min), batch, nchunks, nheads,
248
- headdim if D.dim() == 2 else 1, device=D.device, dtype=torch.float32)
249
- else:
250
- dD = None
251
- dD_strides = ((dD.stride(0), dD.stride(1), dD.stride(2), dD.stride(3), dD.stride(4))
252
- if D is not None else (0, 0, 0, 0, 0))
253
- if dx is None:
254
- dx = torch.empty_like(x)
255
- else:
256
- assert dx.shape == x.shape
257
- ddt = torch.empty(batch, nheads, nchunks, chunk_size, device=dout.device, dtype=torch.float32)
258
- grid_dx = lambda META: (triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv(headdim, META['BLOCK_SIZE_N']),
259
- batch * nchunks, nheads)
260
- with torch.cuda.device(x.device.index):
261
- _chunk_scan_chunk_state_bwd_dx_kernel[grid_dx](
262
- x, CB, dout, dt, dA_cumsum, seq_idx, D, B, dstates, dx, ddt, dD,
263
- chunk_size, headdim, dstate,
264
- batch, seqlen, nheads // ngroups,
265
- x.stride(0), x.stride(1), x.stride(2), x.stride(3),
266
- CB.stride(0), CB.stride(1), CB.stride(2), CB.stride(-1), CB.stride(-2),
267
- dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
268
- dt.stride(0), dt.stride(2), dt.stride(1), dt.stride(3),
269
- dA_cumsum.stride(0), dA_cumsum.stride(2), dA_cumsum.stride(1), dA_cumsum.stride(3),
270
- *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
271
- D.stride(0) if D is not None else 0,
272
- B.stride(0), B.stride(1), B.stride(2), B.stride(3),
273
- dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3), dstates.stride(4),
274
- dx.stride(0), dx.stride(1), dx.stride(2), dx.stride(3),
275
- ddt.stride(0), ddt.stride(2), ddt.stride(1), ddt.stride(3),
276
- dD_strides[1], dD_strides[2], dD_strides[3], dD_strides[0], dD_strides[4],
277
- D is not None,
278
- D.dim() == 2 if D is not None else True,
279
- HAS_SEQ_IDX=seq_idx is not None,
280
- BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
281
- IS_TRITON_22=TRITON_22
282
- )
283
- if D is not None:
284
- BLOCK_SIZE_actual = _chunk_scan_chunk_state_bwd_dx_kernel.best_config.kwargs["BLOCK_SIZE_M"]
285
- n_valid_blocks = (chunk_size + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
286
- dD = dD[:n_valid_blocks].sum(dim=(0, 1, 2)).to(dtype=D.dtype)
287
- if D.dim() == 1:
288
- dD = rearrange(dD, "h 1 -> h")
289
- return dx, ddt.to(dtype=dt.dtype), dD
290
-
291
-
292
- def _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
293
- batch, seqlen, nheads, headdim = x.shape
294
- _, _, ngroups, dstate = B.shape
295
- assert nheads % ngroups == 0
296
- assert B.shape == (batch, seqlen, ngroups, dstate)
297
- assert x.shape == (batch, seqlen, nheads, headdim)
298
- assert dt.shape == (batch, seqlen, nheads)
299
- assert A.shape == (nheads,)
300
- assert C.shape == B.shape
301
- if z is not None:
302
- assert z.shape == x.shape
303
- if D is not None:
304
- assert D.shape == (nheads, headdim) or D.shape == (nheads,)
305
- if seq_idx is not None:
306
- assert seq_idx.shape == (batch, seqlen)
307
- if B.stride(-1) != 1:
308
- B = B.contiguous()
309
- if C.stride(-1) != 1:
310
- C = C.contiguous()
311
- if x.stride(-1) != 1 and x.stride(1) != 1: # Either M or K dimension should be contiguous
312
- x = x.contiguous()
313
- if z is not None and z.stride(-1) != 1 and z.stride(1) != 1: # Either M or K dimension should be contiguous
314
- z = z.contiguous()
315
- if D is not None and D.stride(-1) != 1:
316
- D = D.contiguous()
317
- if initial_states is not None:
318
- assert initial_states.shape == (batch, nheads, headdim, dstate)
319
- # # (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, nheads, chunk_size, chunk_size)
320
- # dA_cumsum_tmp0, dt_tmp0 = _chunk_cumsum_fwd(dt[:, :147], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
321
- # dA_cumsum_tmp1, dt_tmp1 = _chunk_cumsum_fwd(dt[:, 147:], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
322
- # dA_cumsum_tmp2, dt_tmp2 = _chunk_cumsum_fwd(dt[:, 147:256], A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus)
323
- dA_cumsum, dt = _chunk_cumsum_fwd(dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit)
324
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
325
- # states_tmp0 = _chunk_state_fwd(B[:, :147], x[:, :147], dt_tmp0, dA_cumsum_tmp0, states_in_fp32=True)
326
- # states_tmp1 = _chunk_state_fwd(B[:, 147:], x[:, 147:], dt_tmp1, dA_cumsum_tmp1, states_in_fp32=True)
327
- # states_tmp2 = _chunk_state_fwd(B[:, 147:256], x[:, 147:256], dt_tmp2, dA_cumsum_tmp2, states_in_fp32=True)
328
- states, final_states = _state_passing_fwd(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1],
329
- initial_states=rearrange(initial_states, "... p n -> ... (p n)") if initial_states is not None else None,
330
- seq_idx=seq_idx, chunk_size=chunk_size, out_dtype=C.dtype)
331
- states, final_states = [rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]]
332
- # states_tmp0 = rearrange(_state_passing_fwd(rearrange(states_tmp0, "... p n -> ... (p n)"), dA_cumsum_tmp0[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
333
- # states_tmp1 = rearrange(_state_passing_fwd(rearrange(states_tmp1, "... p n -> ... (p n)"), dA_cumsum_tmp1[:, :, :, -1], chunk_size=chunk_size), "... (p n) -> ... p n", n=dstate)
334
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
335
- out, out_x = _chunk_scan_fwd(CB, x, dt, dA_cumsum, C, states, D=D, z=z, seq_idx=seq_idx)
336
- if cu_seqlens is None:
337
- return out, out_x, dt, dA_cumsum, states, final_states
338
- else:
339
- assert batch == 1, "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1"
340
- varlen_states = chunk_state_varlen(B.squeeze(0), x.squeeze(0), dt.squeeze(0), dA_cumsum.squeeze(0),
341
- cu_seqlens, states.squeeze(0))
342
- return out, out_x, dt, dA_cumsum, states, final_states, varlen_states
343
-
344
-
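For orientation, a minimal call into the forward pass above (a sketch with made-up sizes; the argument shapes follow the assertions at the top of `_mamba_chunk_scan_combined_fwd`):

    import torch

    batch, seqlen, nheads, headdim = 1, 1024, 24, 64
    ngroups, dstate, chunk_size = 1, 128, 256
    dtype = torch.bfloat16

    x  = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=dtype)
    dt = torch.rand(batch, seqlen, nheads, device="cuda", dtype=torch.float32)
    A  = -torch.rand(nheads, device="cuda", dtype=torch.float32)
    B  = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=dtype)
    C  = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=dtype)
    D  = torch.randn(nheads, device="cuda", dtype=torch.float32)

    out, out_x, dt_chunked, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(
        x, dt, A, B, C, chunk_size, D=D, dt_softplus=True,
    )
    # out: (batch, seqlen, nheads, headdim); final_states: (batch, nheads, headdim, dstate)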
345
- def _mamba_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, chunk_size, D=None, z=None,
346
- dt_bias=None, initial_states=None, dfinal_states=None, seq_idx=None, dt_softplus=False,
347
- dt_limit=(0.0, float("inf")),
348
- dx=None, ddt=None, dB=None, dC=None, dz=None, recompute_output=False):
349
- if dout.stride(-1) != 1:
350
- dout = dout.contiguous()
351
- batch, seqlen, nheads, headdim = x.shape
352
- nchunks = math.ceil(seqlen / chunk_size)
353
- _, _, ngroups, dstate = B.shape
354
- assert dout.shape == (batch, seqlen, nheads, headdim)
355
- assert dt.shape == (batch, seqlen, nheads)
356
- assert A.shape == (nheads,)
357
- assert nheads % ngroups == 0
358
- assert B.shape == (batch, seqlen, ngroups, dstate)
359
- assert C.shape == B.shape
360
- assert out.shape == x.shape
361
- if initial_states is not None:
362
- assert initial_states.shape == (batch, nheads, headdim, dstate)
363
- if seq_idx is not None:
364
- assert seq_idx.shape == (batch, seqlen)
365
- if dx is not None:
366
- assert dx.shape == x.shape
367
- if dB is not None:
368
- assert dB.shape == B.shape
369
- dB_given = dB
370
- else:
371
- dB_given = torch.empty_like(B)
372
- if dC is not None:
373
- assert dC.shape == C.shape
374
- dC_given = dC
375
- else:
376
- dC_given = torch.empty_like(C)
377
- if dz is not None:
378
- assert z is not None
379
- assert dz.shape == z.shape
380
- if ddt is not None:
381
- assert ddt.shape == dt.shape
382
- ddt_given = ddt
383
- else:
384
- ddt_given = torch.empty_like(dt)
385
- # TD: For some reason Triton (2.1.0 and 2.2.0) errors with
386
- # "[CUDA]: invalid device context" (e.g. during the varlen test), and cloning makes it work. Unclear why.
387
- dt_in = dt.clone()
388
- dA_cumsum, dt = _chunk_cumsum_fwd(dt_in, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus,
389
- dt_limit=dt_limit)
390
- CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
391
- states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
392
- states, _ = _state_passing_fwd(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1],
393
- initial_states=rearrange(initial_states, "... p n -> ... (p n)") if initial_states is not None else None,
394
- seq_idx=seq_idx, chunk_size=chunk_size)
395
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
396
- if z is not None:
397
- dz, dout, dD, *rest = _chunk_scan_bwd_dz(x, z, out, dout, chunk_size=chunk_size, has_ddAcs=False, D=D, dz=dz, recompute_output=recompute_output)
398
- outz = rest[0] if recompute_output else out
399
- else:
400
- dz = None
401
- outz = out
402
- dstates = _chunk_scan_bwd_dstates(C, dA_cumsum, dout, seq_idx=seq_idx, dtype=states.dtype)
403
- # dstates has length nchunks, containing the gradient to initial states at index 0 and
404
- # gradient to the states of chunk (nchunks - 2) at index (nchunks - 1)
405
- # Do computation in fp32 but convert dstates and states to fp16/bf16 since dstates and states
406
- # will be used in matmul in the next kernels.
407
- dstates, ddA_chunk_cumsum, dinitial_states, states = _state_passing_bwd(
408
- rearrange(states, "... p n -> ... (p n)"),
409
- dA_cumsum[:, :, :, -1],
410
- rearrange(dstates, "... p n -> ... (p n)"),
411
- dfinal_states=rearrange(dfinal_states, "... p n -> ... (p n)") if dfinal_states is not None else None,
412
- seq_idx=seq_idx,
413
- has_initial_states=initial_states is not None,
414
- dstates_dtype=x.dtype,
415
- states_dtype=x.dtype,
416
- chunk_size=chunk_size,
417
- )
418
- # dstates has length nchunks, containing the gradient to states of chunk 0 at index 0 and
419
- # gradient to the final states at index (nchunks - 1)
420
- # states has length nchunks, containing the initial states at index 0 and the state for chunk (nchunks - 2) at index (nchunks - 1)
421
- # The final states is not stored.
422
- states = rearrange(states, "... (p n) -> ... p n", n=dstate)
423
- dstates = rearrange(dstates, "... (p n) -> ... p n", n=dstate)
424
- dinitial_states = rearrange(dinitial_states, "... (p n) -> ... p n", n=dstate) if dinitial_states is not None else None
425
- dx, ddt, dD_from_x = _chunk_scan_chunk_state_bwd_dx(x, dt, dA_cumsum, B, CB, dout, dstates, D=D, seq_idx=seq_idx, dx=dx)
426
- # dB = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, ngroups=ngroups)
427
- dB, ddA_next = _chunk_state_bwd_db(x, dt, dA_cumsum, dstates, seq_idx=seq_idx, B=B, ngroups=ngroups)
428
- # dC = _chunk_scan_bwd_dC(states[:, :-1].to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
429
- dC, ddA_cumsum_prev = _chunk_scan_bwd_dC(states.to(x.dtype), dA_cumsum, dout, seq_idx=seq_idx, C=C, ngroups=ngroups)
430
- # Computing ddA with the dcb kernel is much slower, so we're not using it for now
431
- dCB = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, ngroups=ngroups)
432
- # dCB, ddA_tmp = _chunk_scan_bwd_dcb(x, dt, dA_cumsum, dout, seq_idx=seq_idx, CB=CB, ngroups=ngroups)
433
- dCB = dCB.to(CB.dtype)
434
- _bmm_chunk_bwd(C, dCB, residual=dB, out=dB_given)
435
- _bmm_chunk_bwd(B, rearrange(dCB, "... l s -> ... s l"), residual=dC, out=dC_given)
436
- # If we have z, then dout_x is recomputed in fp32 so dD = (dout_x * x).sum() is more accurate
437
- # than dD_from_x = (dout_x * x).sum() where dout_x is in fp16/bf16
438
- if z is None:
439
- dD = dD_from_x
440
- # Formula for ddA_cumsum, assuming out is the output of the forward pass before adding x * D.
441
- # ddA_cumsum = torch.einsum("bclhp,bclhp->bhcl", out.float(), dout.float()) - ddt * dt
442
- # However, this is numerically unstable: when we do the reverse cumsum on ddA_cumsum, there might
443
- # be a lot of underflow.
444
-
445
- # This is already done as part of bwd_dC kernel
446
- # ddA_cumsum_prev = _chunk_scan_bwd_ddAcs_prev(states[:, :-1], C, dout, dA_cumsum, seq_idx=seq_idx)
447
- ddA_cumsum_prev[..., -1] += ddA_chunk_cumsum
448
- ddA_prev = ddA_cumsum_prev.flip([-1]).cumsum(dim=-1).flip([-1])
449
- # This is already done as part of bwd_dB kernel
450
- # ddA_next = _chunk_state_bwd_ddAcs_stable(B, x, dt, dA_cumsum, dstates, seq_idx=seq_idx)
451
- # We don't need to pass in seq_idx because CB also zeros out entries where seq_idx[i] != seq_idx[j]
452
- ddA = _chunk_scan_bwd_ddAcs_stable(x, dt, dA_cumsum, dout, CB)
453
- ddA += ddA_next + ddA_prev
454
-
455
- ddt_given, dA, ddt_bias = _chunk_cumsum_bwd(ddA, ddt, dt_in, A, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit, ddt=ddt_given)
456
-
457
- # These 2 lines are just to test ddt and dA being computed by old code
458
- # _, dA = selective_scan_bwd(dout, x, dt, A, B, C, D=D.float(), z=z)
459
- # ddt_given.copy_(ddt)
460
-
461
- return_vals = (dx, ddt_given, dA, dB_given, dC_given, dD, dz, ddt_bias, dinitial_states)
462
- return return_vals if not recompute_output else (*return_vals, outz)
463
-
464
-
465
- def selective_scan_bwd(dout, x, dt, A, B, C, D=None, z=None):
466
- """
467
- Argument:
468
- dout: (batch, seqlen, nheads, headdim)
469
- x: (batch, seqlen, nheads, headdim)
470
- dt: (batch, nheads, nchunks, chunk_size) or (batch, nheads, headdim, nchunks, chunk_size)
471
- A: (nheads) or (dim, dstate)
472
- B: (batch, seqlen, ngroups, dstate)
473
- C: (batch, seqlen, ngroups, dstate)
474
- D: (nheads, headdim) or (nheads,)
475
- z: (batch, seqlen, nheads, headdim)
476
- Return:
477
- ddt, dA: gradients of the loss with respect to dt and A
478
- """
479
- import selective_scan
480
-
481
- batch, seqlen, nheads, headdim = x.shape
482
- chunk_size = dt.shape[-1]
483
- _, _, ngroups, dstate = B.shape
484
- assert nheads % ngroups == 0
485
- x = rearrange(x, "b l h p -> b (h p) l")
486
- squeeze_dt = dt.dim() == 4
487
- if dt.dim() == 4:
488
- dt = repeat(dt, "b h c l -> b h p c l", p=headdim)
489
- dt = rearrange(dt, "b h p c l -> b (h p) (c l)", p=headdim)
490
- squeeze_A = A.dim() == 1
491
- if A.dim() == 1:
492
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
493
- else:
494
- A = A.to(dtype=torch.float32)
495
- B = rearrange(B, "b l g n -> b g n l")
496
- C = rearrange(C, "b l g n -> b g n l")
497
- if D is not None:
498
- if D.dim() == 2:
499
- D = rearrange(D, "h p -> (h p)")
500
- else:
501
- D = repeat(D, "h -> (h p)", p=headdim)
502
- if z is not None:
503
- z = rearrange(z, "b l h p -> b (h p) l")
504
-
505
- if x.stride(-1) != 1:
506
- x = x.contiguous()
507
- if dt.stride(-1) != 1:
508
- dt = dt.contiguous()
509
- if D is not None:
510
- D = D.contiguous()
511
- if B.stride(-1) != 1:
512
- B = B.contiguous()
513
- if C.stride(-1) != 1:
514
- C = C.contiguous()
515
- if z is not None and z.stride(-1) != 1:
516
- z = z.contiguous()
517
- _, intermediate, *rest = selective_scan.fwd(x, dt.to(dtype=x.dtype), A, B, C, D, z, None, False)
518
- if z is not None:
519
- out = rest[0]
520
- else:
521
- out = None
522
-
523
- dout = rearrange(dout, "b l h p -> b (h p) l")
524
-
525
- if dout.stride(-1) != 1:
526
- dout = dout.contiguous()
527
- # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
528
- # backward of selective_scan with the backward of chunk).
529
- # Here we just pass in None and dz will be allocated in the C++ code.
530
- _, ddt, dA, *rest = selective_scan.bwd(
531
- x, dt.to(dtype=x.dtype), A, B, C, D, z, None, dout, intermediate, out, None, False,
532
- False # option to recompute out_z, not used here
533
- )
534
- ddt = rearrange(ddt, "b (h p) (c l) -> b h p c l", p=headdim, l=chunk_size)
535
- if squeeze_dt:
536
- ddt = ddt.float().sum(dim=2)
537
- if squeeze_A:
538
- dA = rearrange(dA, "(h p) n -> h p n", p=headdim).sum(dim=(1, 2))
539
- return ddt, dA
540
-
541
-
542
- class MambaChunkScanCombinedFn(torch.autograd.Function):
543
-
544
- @staticmethod
545
- def forward(ctx, x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf")), return_final_states=False, return_varlen_states=False):
546
- ctx.dt_dtype = dt.dtype
547
- if not return_varlen_states:
548
- cu_seqlens = None
549
- else:
550
- assert cu_seqlens is not None, "cu_seqlens must be provided if return_varlen_states is True"
551
- out, out_x, dt_out, dA_cumsum, states, final_states, *rest = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, cu_seqlens=cu_seqlens, dt_softplus=dt_softplus, dt_limit=dt_limit)
552
- ctx.save_for_backward(out if z is None else out_x, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx)
553
- ctx.dt_softplus = dt_softplus
554
- ctx.chunk_size = chunk_size
555
- ctx.dt_limit = dt_limit
556
- ctx.return_final_states = return_final_states
557
- ctx.return_varlen_states = return_varlen_states
558
- if not return_varlen_states:
559
- return out if not return_final_states else (out, final_states)
560
- else:
561
- varlen_states = rest[0]
562
- return (out, varlen_states) if not return_final_states else (out, final_states, varlen_states)
563
-
564
- @staticmethod
565
- def backward(ctx, dout, *args):
566
- out, x, dt, dA_cumsum, A, B, C, D, z, dt_bias, initial_states, seq_idx = ctx.saved_tensors
567
- assert not ctx.return_varlen_states, "return_varlen_states is not supported in backward"
568
- dfinal_states = args[0] if ctx.return_final_states else None
569
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = _mamba_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, ctx.chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=dfinal_states, seq_idx=seq_idx, dt_softplus=ctx.dt_softplus, dt_limit=ctx.dt_limit)
570
- return dx, ddt, dA, dB, dC, None, dD, dz, ddt_bias, dinitial_states, None, None, None, None, None, None
571
-
572
-
573
- def mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf")), return_final_states=False, return_varlen_states=False):
574
- """
575
- Argument:
576
- x: (batch, seqlen, nheads, headdim)
577
- dt: (batch, seqlen, nheads)
578
- A: (nheads)
579
- B: (batch, seqlen, ngroups, dstate)
580
- C: (batch, seqlen, ngroups, dstate)
581
- chunk_size: int
582
- D: (nheads, headdim) or (nheads,)
583
- z: (batch, seqlen, nheads, headdim)
584
- dt_bias: (nheads,)
585
- initial_states: (batch, nheads, headdim, dstate)
586
- seq_idx: (batch, seqlen)
587
- cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True
588
- dt_softplus: Whether to apply softplus to dt
589
- Return:
590
- out: (batch, seqlen, nheads, headdim)
591
- """
592
- return MambaChunkScanCombinedFn.apply(x, dt, A, B, C, chunk_size, D, z, dt_bias, initial_states, seq_idx, cu_seqlens, dt_softplus, dt_limit, return_final_states, return_varlen_states)
593
-
594
-
595
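# --- Added illustration (not part of the deleted file): a minimal usage sketch for
# mamba_chunk_scan_combined, assuming a CUDA device with the Triton kernels above.
# All sizes below are made up for the example.
import torch

batch, seqlen, nheads, headdim, ngroups, dstate, chunk_size = 2, 512, 8, 64, 1, 128, 256
x = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.bfloat16)
dt = torch.rand(batch, seqlen, nheads, device="cuda", dtype=torch.bfloat16)
A = -torch.rand(nheads, device="cuda")  # A is kept negative (decay) and in fp32
B = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=torch.bfloat16)
C = torch.randn(batch, seqlen, ngroups, dstate, device="cuda", dtype=torch.bfloat16)

out = mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, dt_softplus=True)
# out: (batch, seqlen, nheads, headdim). Pass return_final_states=True to also get
# the final SSM states of shape (batch, nheads, headdim, dstate).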
- def mamba_chunk_scan(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False):
596
- """
597
- Argument:
598
- x: (batch, seqlen, nheads, headdim)
599
- dt: (batch, seqlen, nheads)
600
- A: (nheads)
601
- B: (batch, seqlen, ngroups, dstate)
602
- C: (batch, seqlen, ngroups, dstate)
603
- D: (nheads, headdim) or (nheads,)
604
- z: (batch, seqlen, nheads, headdim)
605
- dt_bias: (nheads,)
606
- Return:
607
- out: (batch, seqlen, nheads, headdim)
608
- """
609
- batch, seqlen, nheads, headdim = x.shape
610
- dstate = B.shape[-1]
611
- if seqlen % chunk_size != 0:
612
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
613
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
614
- dt = dt.float() # We want high precision for this before cumsum
615
- if dt_bias is not None:
616
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
617
- if dt_softplus:
618
- dt = F.softplus(dt)
619
- dA = dt * rearrange(A, "h -> h 1 1")
620
- dA_cumsum = torch.cumsum(dA, dim=-1)
622
- # 1. Compute the state for each chunk
623
- states = chunk_state(B, x, dt, dA_cumsum, states_in_fp32=True)
624
- # 2. Pass the state to all the chunks by weighted cumsum.
625
- states = rearrange(state_passing(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1])[0],
626
- "... (p n) -> ... p n", n=dstate)
627
- # 3. Compute the output for each chunk
628
- out = chunk_scan(B, C, x, dt, dA_cumsum, states, D=D, z=z)
629
- return out
630
-
631
-
632
- def ssd_chunk_scan_combined_ref(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, dt_softplus=False):
633
- """
634
- Argument:
635
- x: (batch, seqlen, nheads, headdim)
636
- dt: (batch, seqlen, nheads)
637
- A: (nheads)
638
- B: (batch, seqlen, ngroups, dstate)
639
- C: (batch, seqlen, ngroups, dstate)
640
- D: (nheads, headdim) or (nheads,)
641
- z: (batch, seqlen, nheads, headdim)
642
- dt_bias: (nheads,)
643
- Return:
644
- out: (batch, seqlen, nheads, headdim)
645
- """
646
- batch, seqlen, nheads, headdim = x.shape
647
- dstate = B.shape[-1]
648
- if seqlen % chunk_size != 0:
649
- dt = F.pad(dt, (0, 0, 0, chunk_size - seqlen % chunk_size))
650
- dt = rearrange(dt, "b (c l) h -> b h c l", l=chunk_size)
651
- dt = dt.float() # We want high precision for this before cumsum
652
- if dt_bias is not None:
653
- dt = dt + rearrange(dt_bias, "h -> h 1 1")
654
- if dt_softplus:
655
- dt = F.softplus(dt)
656
- dA = dt * rearrange(A, "h -> h 1 1")
657
- dA_cumsum = torch.cumsum(dA, dim=-1)
658
- # 1. Compute the state for each chunk
659
- states = chunk_state_ref(B, x, dt, dA_cumsum)
660
- states_dtype = states.dtype
661
- if states.dtype not in [torch.float32, torch.float64]:
662
- states = states.to(torch.float32)
663
- # 2. Pass the state to all the chunks by weighted cumsum.
664
- # state_passing_ref is much less numerically stable
665
- states = rearrange(state_passing_ref(rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1])[0],
666
- "... (p n) -> ... p n", n=dstate)
667
- states = states.to(states_dtype)
668
- # 3. Compute the output for each chunk
669
- out = chunk_scan_ref(B, C, x, dt, dA_cumsum, states, D=D, z=z)
670
- return out
671
-
672
-
673
- def ssd_selective_scan(x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
674
- """
675
- Argument:
676
- x: (batch, seqlen, nheads, headdim)
677
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
678
- A: (nheads) or (dim, dstate)
679
- B: (batch, seqlen, ngroups, dstate)
680
- C: (batch, seqlen, ngroups, dstate)
681
- D: (nheads, headdim) or (nheads,)
682
- z: (batch, seqlen, nheads, headdim)
683
- dt_bias: (nheads,) or (nheads, headdim)
684
- Return:
685
- out: (batch, seqlen, nheads, headdim)
686
- """
687
- from ..selective_scan_interface import selective_scan_fn
688
-
689
- batch, seqlen, nheads, headdim = x.shape
690
- _, _, ngroups, dstate = B.shape
691
- x = rearrange(x, "b l h p -> b (h p) l")
692
- if dt.dim() == 3:
693
- dt = repeat(dt, "b l h -> b l h p", p=headdim)
694
- dt = rearrange(dt, "b l h p -> b (h p) l")
695
- if A.dim() == 1:
696
- A = repeat(A, "h -> (h p) n", p=headdim, n=dstate).to(dtype=torch.float32)
697
- else:
698
- A = A.to(dtype=torch.float32)
699
- B = rearrange(B, "b l g n -> b g n l")
700
- C = rearrange(C, "b l g n -> b g n l")
701
- if D is not None:
702
- if D.dim() == 2:
703
- D = rearrange(D, "h p -> (h p)")
704
- else:
705
- D = repeat(D, "h -> (h p)", p=headdim)
706
- if z is not None:
707
- z = rearrange(z, "b l h p -> b (h p) l")
708
- if dt_bias is not None:
709
- if dt_bias.dim() == 1:
710
- dt_bias = repeat(dt_bias, "h -> h p", p=headdim)
711
- dt_bias = rearrange(dt_bias, "h p -> (h p)")
712
- if dt_limit != (0.0, float("inf")):
713
- if dt_bias is not None:
714
- dt = dt + rearrange(dt_bias, "d -> d 1")
715
- if dt_softplus:
716
- dt = F.softplus(dt)
717
- dt = dt.clamp(min=dt_limit[0], max=dt_limit[1]).to(x.dtype)
718
- dt_bias = None
719
- dt_softplus = None
720
- out = selective_scan_fn(x, dt, A, B, C, D=D, z=z, delta_bias=dt_bias, delta_softplus=dt_softplus)
721
- return rearrange(out, "b (h p) l -> b l h p", p=headdim)
722
-
723
-
724
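# --- Added illustration (not from the deleted file): the head-to-channel reshaping that
# ssd_selective_scan performs to feed head-wise SSD parameters into selective_scan_fn,
# which expects one (dstate,)-sized A row per channel. Arbitrary small sizes; runs on CPU.
import torch
from einops import rearrange, repeat

batch, seqlen, nheads, headdim, dstate = 2, 16, 4, 8, 6
x = torch.randn(batch, seqlen, nheads, headdim)
dt = torch.rand(batch, seqlen, nheads)
A = -torch.rand(nheads)

x_chan = rearrange(x, "b l h p -> b (h p) l")                              # (2, 32, 16)
dt_chan = rearrange(repeat(dt, "b l h -> b l h p", p=headdim), "b l h p -> b (h p) l")
A_chan = repeat(A, "h -> (h p) n", p=headdim, n=dstate)                    # one row per channel
assert x_chan.shape == (batch, nheads * headdim, seqlen)
assert dt_chan.shape == (batch, nheads * headdim, seqlen)
assert A_chan.shape == (nheads * headdim, dstate)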
- def mamba_conv1d_scan_ref(xBC, conv1d_weight, conv1d_bias, dt, A, chunk_size, D=None, z=None,
725
- dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf")),
726
- activation="silu", headdim=None, ngroups=1):
727
- """
728
- Argument:
729
- xBC: (batch, seqlen, dim + 2 * ngroups * dstate) where dim == nheads * headdim
730
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
731
- conv1d_bias: (dim + 2 * ngroups * dstate,)
732
- dt: (batch, seqlen, nheads) or (batch, seqlen, nheads, headdim)
733
- A: (nheads)
734
- D: (nheads, headdim) or (nheads,)
735
- z: (batch, seqlen, dim)
736
- dt_bias: (nheads) or (nheads, headdim)
737
- headdim: if D is 1D and z is None, headdim must be passed in
738
- Return:
739
- out: (batch, seqlen, dim)
740
- """
741
- batch, seqlen, nheads = dt.shape[:3]
742
- assert nheads % ngroups == 0
743
- if z is not None:
744
- dim = z.shape[-1]
745
- assert dim % nheads == 0
746
- headdim = dim // nheads
747
- else:
748
- if D.dim() == 1:
749
- assert headdim is not None
750
- else:
751
- headdim = D.shape[1]
752
- dim = nheads * headdim
753
- xBC = rearrange(causal_conv1d_fn(rearrange(xBC, "b s d -> b d s"), conv1d_weight, conv1d_bias, activation=activation),
754
- "b d s -> b s d")
755
- dstate = (xBC.shape[-1] - dim) // ngroups // 2
756
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
757
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
758
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
759
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
760
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
761
- out = ssd_selective_scan(x, dt.to(x.dtype), A, B, C, D=D.float(), z=z, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit)
762
- return rearrange(out, "b s h p -> b s (h p)")
763
-
764
-
765
- class MambaSplitConv1dScanCombinedFn(torch.autograd.Function):
766
-
767
- @staticmethod
768
- @custom_fwd
769
- def forward(ctx, zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states=None, seq_idx=None, dt_limit=(0.0, float("inf")), return_final_states=False, activation="silu",
770
- rmsnorm_weight=None, rmsnorm_eps=1e-6, outproj_weight=None, outproj_bias=None, headdim=None,
771
- ngroups=1, norm_before_gate=True):
772
- assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
773
- assert activation in [None, "silu", "swish"]
774
- if D.dim() == 1:
775
- assert headdim is not None
776
- nheads, = D.shape
777
- else:
778
- nheads, headdim = D.shape
779
- batch, seqlen, _ = zxbcdt.shape
780
- dim = nheads * headdim
781
- assert nheads % ngroups == 0
782
- dstate = (conv1d_weight.shape[0] - dim) // ngroups // 2
783
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ngroups * dstate - nheads) // 2
784
- assert d_nonssm >= 0
785
- assert zxbcdt.shape == (batch, seqlen, 2 * d_nonssm + 2 * dim + 2 * ngroups * dstate + nheads)
786
- assert dt_bias.shape == (nheads,)
787
- assert A.shape == (nheads,)
788
- zx0, z, xBC, dt = torch.split(zxbcdt, [2 * d_nonssm, dim, dim + ngroups * dstate * 2, nheads], dim=-1)
789
- seq_idx = seq_idx.contiguous() if seq_idx is not None else None
790
- xBC_conv = rearrange(
791
- causal_conv1d_cuda.causal_conv1d_fwd(rearrange_and_update_stride(xBC, "b s d -> b d s"),
792
- conv1d_weight, conv1d_bias, seq_idx, None, None, activation in ["silu", "swish"]),
793
- "b d s -> b s d"
794
- )
795
- x, B, C = torch.split(xBC_conv, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
796
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
797
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
798
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
799
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads) if z is not None else None
800
- if rmsnorm_weight is None:
801
- out, out_x, dt_out, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size=chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=dt_limit)
802
- out = rearrange(out, "b s h p -> b s (h p)")
803
- rstd = None
804
- if d_nonssm > 0:
805
- out = torch.cat([_swiglu_fwd(zx0), out], dim=-1)
806
- else:
807
- out_x, _, dt_out, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size=chunk_size, D=D, z=None, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=dt_limit)
808
- # reshape input data into 2D tensor
809
- x_rms = rearrange(out_x, "b s h p -> (b s) (h p)")
810
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
811
- rmsnorm_weight = rmsnorm_weight.contiguous()
812
- if d_nonssm == 0:
813
- out = None
814
- else:
815
- out01 = torch.empty((batch, seqlen, d_nonssm + dim), dtype=x_rms.dtype, device=x_rms.device)
816
- out = rearrange(out01[..., d_nonssm:], "b s d -> (b s) d")
817
- _swiglu_fwd(zx0, out=out01[..., :d_nonssm])
818
- out, _, rstd = _layer_norm_fwd(x_rms, rmsnorm_weight, None, rmsnorm_eps, z_rms, out=out,
819
- group_size=dim // ngroups,
820
- norm_before_gate=norm_before_gate, is_rms_norm=True)
821
- if d_nonssm == 0:
822
- out = rearrange(out, "(b s) d -> b s d", b=batch)
823
- else:
824
- out = out01
825
- ctx.outproj_weight_dtype = outproj_weight.dtype if outproj_weight is not None else None
826
- if outproj_weight is not None:
827
- if torch.is_autocast_enabled():
828
- dtype = torch.get_autocast_gpu_dtype()
829
- out, outproj_weight = out.to(dtype), outproj_weight.to(dtype)
830
- outproj_bias = outproj_bias.to(dtype) if outproj_bias is not None else None
831
- out = F.linear(out, outproj_weight, outproj_bias)
832
- else:
833
- assert outproj_bias is None
834
- ctx.save_for_backward(zxbcdt, conv1d_weight, conv1d_bias,
835
- out_x, A, D, dt_bias, initial_states, seq_idx, rmsnorm_weight, rstd, outproj_weight, outproj_bias)
836
- ctx.dt_limit = dt_limit
837
- ctx.return_final_states = return_final_states
838
- ctx.activation = activation
839
- ctx.rmsnorm_eps = rmsnorm_eps
840
- ctx.norm_before_gate = norm_before_gate
841
- ctx.chunk_size = chunk_size
842
- ctx.headdim = headdim
843
- ctx.ngroups = ngroups
844
- return out if not return_final_states else (out, final_states)
845
-
846
- @staticmethod
847
- @custom_bwd
848
- def backward(ctx, dout, *args):
849
- assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
850
- zxbcdt, conv1d_weight, conv1d_bias, out, A, D, dt_bias, initial_states, seq_idx, rmsnorm_weight, rstd, outproj_weight, outproj_bias = ctx.saved_tensors
851
- dfinal_states = args[0] if ctx.return_final_states else None
852
- headdim = ctx.headdim
853
- nheads = D.shape[0]
854
- dim = nheads * headdim
855
- assert nheads % ctx.ngroups == 0
856
- dstate = (conv1d_weight.shape[0] - dim) // ctx.ngroups // 2
857
- d_nonssm = (zxbcdt.shape[-1] - 2 * dim - 2 * ctx.ngroups * dstate - nheads) // 2
858
- assert d_nonssm >= 0
859
- recompute_output = outproj_weight is not None
860
- if recompute_output:
861
- out_recompute = torch.empty(*out.shape[:2], d_nonssm + dim, device=out.device, dtype=out.dtype)
862
- out0_recompute, out1_recompute = out_recompute.split([d_nonssm, dim], dim=-1)
863
- zx0, z, xBC, dt = torch.split(zxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1)
864
- # Recompute x, B, C
865
- xBC_conv = rearrange(
866
- causal_conv1d_cuda.causal_conv1d_fwd(rearrange_and_update_stride(xBC, "b s d -> b d s"),
867
- conv1d_weight, conv1d_bias, seq_idx, None, None, ctx.activation in ["silu", "swish"]),
868
- "b d s -> b s d"
869
- )
870
- x, B, C = torch.split(xBC_conv, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1)
871
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
872
- B = rearrange(B, "b l (g n) -> b l g n", g=ctx.ngroups)
873
- C = rearrange(C, "b l (g n) -> b l g n", g=ctx.ngroups)
874
- dzxbcdt = torch.empty_like(zxbcdt)
875
- dzx0, dz, dxBC_given, ddt_given = torch.split(dzxbcdt, [2 * d_nonssm, dim, dim + 2 * ctx.ngroups * dstate, nheads], dim=-1)
876
- dxBC = torch.empty_like(xBC)
877
- dx, dB, dC = torch.split(dxBC, [dim, ctx.ngroups * dstate, ctx.ngroups * dstate], dim=-1)
878
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
879
- dx = rearrange(dx, "b l (h p) -> b l h p", h=nheads)
880
- dB = rearrange(dB, "b l (g n) -> b l g n", g=ctx.ngroups)
881
- dC = rearrange(dC, "b l (g n) -> b l g n", g=ctx.ngroups)
882
- if outproj_weight is not None:
883
- dout_og = dout
884
- dout = F.linear(dout, outproj_weight.t())
885
- if d_nonssm > 0:
886
- dout0, dout = dout.split([d_nonssm, dim], dim=-1)
887
- _swiglu_bwd(zx0, dout0, dxy=dzx0, recompute_output=True, out=out0_recompute)
888
- dout = rearrange(dout, "b s (h p) -> b s h p", p=headdim)
889
- if rmsnorm_weight is None:
890
- dz = rearrange(dz, "b l (h p) -> b l h p", h=nheads)
891
- dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states, *rest = _mamba_chunk_scan_combined_bwd(
892
- dout, x, dt, A, B, C, out, ctx.chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=dfinal_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=ctx.dt_limit, dx=dx, ddt=ddt_given, dB=dB, dC=dC, dz=dz, recompute_output=recompute_output
893
- )
894
- out_for_linear = rearrange(rest[0], "b s h p -> b s (h p)") if recompute_output else None
895
- drmsnorm_weight = None
896
- else:
897
- batch = dout.shape[0]
898
- dy_rms = rearrange(dout, "b s h p -> (b s) (h p)")
899
- dz = rearrange(dz, "b l d -> (b l) d")
900
- x_rms = rearrange(out, "b s h p -> (b s) (h p)")
901
- z_rms = rearrange(z, "b s h p -> (b s) (h p)")
902
- out1_recompute = rearrange(out1_recompute, "b s d -> (b s) d") if recompute_output else None
903
- dout, drmsnorm_weight, _, dz, *rest = _layer_norm_bwd(dy_rms, x_rms, rmsnorm_weight, None, ctx.rmsnorm_eps, None, rstd, z_rms, group_size=dim//ctx.ngroups, norm_before_gate=ctx.norm_before_gate, is_rms_norm=True, recompute_output=recompute_output, dz=dz, out=out1_recompute if recompute_output else None)
904
- out_for_linear = out_recompute if recompute_output else None
905
- dout = rearrange(dout, "(b s) (h p) -> b s h p", b=batch, p=headdim)
906
- dx, ddt, dA, dB, dC, dD, _, ddt_bias, dinitial_states = _mamba_chunk_scan_combined_bwd(
907
- dout, x, dt, A, B, C, out, ctx.chunk_size, D=D, z=None, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=dfinal_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=ctx.dt_limit, dx=dx, ddt=ddt_given, dB=dB, dC=dC
908
- )
909
-
910
- if outproj_weight is not None:
911
- doutproj_weight = torch.einsum("bso,bsd->od", dout_og, out_for_linear)
912
- doutproj_bias = dout_og.sum(dim=(0, 1)) if outproj_bias is not None else None
913
- else:
914
- doutproj_weight, doutproj_bias = None, None
915
- dxBC_given = rearrange(dxBC_given, "b s d -> b d s")
916
- dxBC_given_update, dweight, dbias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
917
- rearrange_and_update_stride(xBC, "b s d -> b d s"), conv1d_weight, conv1d_bias,
918
- rearrange(dxBC, "b s d -> b d s"), seq_idx, None, None, rearrange_and_update_stride(dxBC_given), False, ctx.activation in ["silu", "swish"]
919
- )
920
- if dxBC_given.stride() != dxBC_given_update.stride():
921
- dxBC_given.copy_(dxBC_given_update)
922
- else:
923
- dxBC_given = dxBC_given_update
924
- dxBC_given = rearrange(dxBC_given, "b d s -> b s d")
925
- return dzxbcdt, dweight, dbias, ddt_bias, dA, dD, None, dinitial_states, None, None, None, None, drmsnorm_weight, None, doutproj_weight, doutproj_bias, None, None, None
926
-
927
-
928
- def mamba_split_conv1d_scan_combined(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states=None, seq_idx=None, dt_limit=(0.0, float("inf")), return_final_states=False, activation="silu", rmsnorm_weight=None, rmsnorm_eps=1e-6, outproj_weight=None, outproj_bias=None, headdim=None, ngroups=1, norm_before_gate=True):
929
- """
930
- Argument:
931
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
932
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
933
- conv1d_bias: (dim + 2 * ngroups * dstate,)
934
- dt_bias: (nheads,)
935
- A: (nheads)
936
- D: (nheads, headdim) or (nheads,)
937
- initial_states: (batch, nheads, headdim, dstate)
938
- seq_idx: (batch, seqlen), int32
939
- rmsnorm_weight: (dim,)
940
- outproj_weight: (out_dim, dim)
941
- outproj_bias: (out_dim,)
942
- headdim: if D is 1D, headdim must be passed in
943
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
944
- Return:
945
- out: (batch, seqlen, dim)
946
- """
947
- return MambaSplitConv1dScanCombinedFn.apply(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states, seq_idx, dt_limit, return_final_states, activation, rmsnorm_weight, rmsnorm_eps, outproj_weight, outproj_bias, headdim, ngroups, norm_before_gate)
948
-
949
-
950
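# --- Added shape sketch (not from the deleted file): how the fused projection output
# zxbcdt is packed for mamba_split_conv1d_scan_combined (d_nonssm == 0 case, as in the
# docstring above). The concrete numbers are illustrative assumptions.
nheads, headdim, ngroups, dstate, width = 8, 64, 1, 128, 4
dim = nheads * headdim                               # 512
d_in_proj = 2 * dim + 2 * ngroups * dstate + nheads  # z | x | B | C | dt, concatenated on the last dim
conv_channels = dim + 2 * ngroups * dstate           # only the x, B, C slice goes through the conv1d
# zxbcdt:        (batch, seqlen, d_in_proj)
# conv1d_weight: (conv_channels, width), conv1d_bias: (conv_channels,)
# dt_bias, A:    (nheads,)    D: (nheads,) or (nheads, headdim)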
- def mamba_split_conv1d_scan_ref(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, dt_limit=(0.0, float("inf")), activation="silu", rmsnorm_weight=None, rmsnorm_eps=1e-6, outproj_weight=None, outproj_bias=None, headdim=None, ngroups=1, norm_before_gate=True):
951
- """
952
- Argument:
953
- zxbcdt: (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads) where dim == nheads * headdim
954
- conv1d_weight: (dim + 2 * ngroups * dstate, width)
955
- conv1d_bias: (dim + 2 * ngroups * dstate,)
956
- dt_bias: (nheads,)
957
- A: (nheads)
958
- D: (nheads, headdim) or (nheads,)
959
- rmsnorm_weight: (dim,)
960
- outproj_weight: (out_dim, dim)
961
- outproj_bias: (out_dim,)
962
- headdim: if D is 1D, headdim must be passed in
963
- norm_before_gate: if True, we do RMSNorm(x) * F.silu(z). If False, we do RMSNorm(x * F.silu(z))
964
- Return:
965
- out: (batch, seqlen, dim)
966
- """
967
- if D.dim() == 1:
968
- assert headdim is not None
969
- nheads, = D.shape
970
- else:
971
- nheads, headdim = D.shape
972
- assert nheads % ngroups == 0
973
- batch, seqlen, _ = zxbcdt.shape
974
- dim = nheads * headdim
975
- dstate = (zxbcdt.shape[-1] - 2 * dim - nheads) // ngroups // 2
976
- assert zxbcdt.shape == (batch, seqlen, 2 * dim + 2 * ngroups * dstate + nheads)
977
- assert dt_bias.shape == (nheads,)
978
- assert A.shape == (nheads,)
979
- if rmsnorm_weight is not None:
980
- assert rmsnorm_weight.shape == (dim,)
981
- z, xBC, dt = torch.split(zxbcdt, [dim, dim + 2 * ngroups * dstate, nheads], dim=-1)
982
- xBC = rearrange(causal_conv1d_fn(rearrange(xBC, "b s d -> b d s"), conv1d_weight, conv1d_bias, activation=activation),
983
- "b d s -> b s d")
984
- x, B, C = torch.split(xBC, [dim, ngroups * dstate, ngroups * dstate], dim=-1)
985
- x = rearrange(x, "b l (h p) -> b l h p", h=nheads)
986
- B = rearrange(B, "b l (g n) -> b l g n", g=ngroups)
987
- C = rearrange(C, "b l (g n) -> b l g n", g=ngroups)
988
- z = rearrange(z, "b l (h p) -> b l h p", h=nheads)
989
- out = ssd_selective_scan(x, dt.to(x.dtype), A, B, C, D=D.float(),
990
- z=z if rmsnorm_weight is None else None, dt_bias=dt_bias, dt_softplus=True, dt_limit=dt_limit)
991
- out = rearrange(out, "b s h p -> b s (h p)")
992
- if rmsnorm_weight is not None:
993
- out = rmsnorm_fn(out, rmsnorm_weight, None, z=rearrange(z, "b l h p -> b l (h p)"), eps=rmsnorm_eps,
994
- norm_before_gate=norm_before_gate)
995
- if outproj_weight is not None:
996
- out = F.linear(out, outproj_weight, outproj_bias)
997
- return out
998
-
build/torch210-cxx11-cu126-x86_64-linux/ops/triton/ssd_state_passing.py DELETED
@@ -1,348 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- """We want triton==2.1.0 or 2.2.0 for this
4
- """
5
-
6
- import math
7
- import torch
8
- import torch.nn.functional as F
9
-
10
- import triton
11
- import triton.language as tl
12
-
13
- from einops import rearrange, repeat
14
-
15
-
16
- @triton.autotune(
17
- configs=[
18
- triton.Config({'BLOCK_SIZE': 64}),
19
- triton.Config({'BLOCK_SIZE': 128}),
20
- triton.Config({'BLOCK_SIZE': 256}),
21
- triton.Config({'BLOCK_SIZE': 512}),
22
- triton.Config({'BLOCK_SIZE': 1024}),
23
- triton.Config({'BLOCK_SIZE': 2048}),
24
- ],
25
- key=['dim'],
26
- )
27
- @triton.jit
28
- def _state_passing_fwd_kernel(
29
- # Pointers to matrices
30
- states_ptr, out_ptr, final_states_ptr, dA_cs_ptr, initstates_ptr, seq_idx_ptr,
31
- # Matrix dimensions
32
- dim, nchunks, seqlen, chunk_size,
33
- # Strides
34
- stride_states_batch, stride_states_chunk, stride_states_head, stride_states_dim,
35
- stride_out_batch, stride_out_chunk, stride_out_head, stride_out_dim,
36
- stride_final_states_batch, stride_final_states_head, stride_final_states_dim,
37
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head,
38
- stride_initstates_batch, stride_initstates_head, stride_initstates_dim,
39
- stride_seq_idx_batch, stride_seq_idx_seqlen,
40
- # Meta-parameters
41
- HAS_INITSTATES: tl.constexpr,
42
- HAS_SEQ_IDX: tl.constexpr,
43
- BLOCK_SIZE: tl.constexpr,
44
- ):
45
- pid_b = tl.program_id(axis=1)
46
- pid_h = tl.program_id(axis=2)
47
- pid_m = tl.program_id(axis=0)
48
- states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
49
- dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head
50
- out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
51
- final_states_ptr += pid_b * stride_final_states_batch + pid_h * stride_final_states_head
52
- if HAS_INITSTATES:
53
- initstates_ptr += pid_b * stride_initstates_batch + pid_h * stride_initstates_head
54
- if HAS_SEQ_IDX:
55
- seq_idx_ptr += pid_b * stride_seq_idx_batch
56
-
57
- offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
58
- states_ptrs = states_ptr + offs_m * stride_states_dim
59
- out_ptrs = out_ptr + offs_m * stride_out_dim
60
- final_states_ptrs = final_states_ptr + offs_m * stride_final_states_dim
61
-
62
- if not HAS_INITSTATES:
63
- states = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32)
64
- else:
65
- initstates_ptrs = initstates_ptr + offs_m * stride_initstates_dim
66
- states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
67
- tl.store(out_ptrs, states, mask=offs_m < dim)
68
- out_ptrs += stride_out_chunk
69
- seq_idx = 0
70
- for c in range(nchunks):
71
- new_states = tl.load(states_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
72
- dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
73
- scale = tl.exp(dA_cs)
74
- if HAS_SEQ_IDX:
75
- seq_idx_new = tl.load(seq_idx_ptr + (min((c + 1) * chunk_size, seqlen) - 1) * stride_seq_idx_seqlen)
76
- scale = tl.where(seq_idx_new == seq_idx, scale, 0.0)
77
- seq_idx = seq_idx_new
78
- states = scale * states + new_states
79
- if c < nchunks - 1:
80
- tl.store(out_ptrs, states, mask=offs_m < dim)
81
- else:
82
- tl.store(final_states_ptrs, states, mask=offs_m < dim)
83
- states_ptrs += stride_states_chunk
84
- dA_cs_ptr += stride_dA_cs_chunk
85
- out_ptrs += stride_out_chunk
86
-
87
-
88
- @triton.autotune(
89
- configs=[
90
- triton.Config({'BLOCK_SIZE': 64}),
91
- triton.Config({'BLOCK_SIZE': 128}),
92
- triton.Config({'BLOCK_SIZE': 256}),
93
- triton.Config({'BLOCK_SIZE': 512}),
94
- triton.Config({'BLOCK_SIZE': 1024}),
95
- triton.Config({'BLOCK_SIZE': 2048}),
96
- ],
97
- key=['dim'],
98
- )
99
- @triton.jit
100
- def _state_passing_bwd_kernel(
101
- # Pointers to matrices
102
- dout_ptr, out_ptr, dA_cs_ptr, dfinal_states_ptr, seq_idx_ptr,
103
- dstates_ptr, ddA_cs_ptr, dinitstates_ptr, states_converted_ptr,
104
- # Matrix dimensions
105
- dim, nchunks, seqlen, chunk_size,
106
- # Strides
107
- stride_dout_batch, stride_dout_chunk, stride_dout_head, stride_dout_dim,
108
- stride_out_batch, stride_out_chunk, stride_out_head, stride_out_dim,
109
- stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head,
110
- stride_dfinal_states_batch, stride_dfinal_states_head, stride_dfinal_states_dim,
111
- stride_seq_idx_batch, stride_seq_idx_seqlen,
112
- stride_dstates_batch, stride_dstates_chunk, stride_dstates_head, stride_dstates_dim,
113
- stride_ddA_cs_batch, stride_ddA_cs_chunk, stride_ddA_cs_head,
114
- stride_dinitstates_batch, stride_dinitstates_head, stride_dinitstates_dim,
115
- # Meta-parameters
116
- CONVERT_STATES: tl.constexpr,
117
- HAS_DFINAL_STATES: tl.constexpr,
118
- HAS_DINITSTATES: tl.constexpr,
119
- HAS_SEQ_IDX: tl.constexpr,
120
- BLOCK_SIZE: tl.constexpr,
121
- ):
122
- pid_b = tl.program_id(axis=1)
123
- pid_h = tl.program_id(axis=2)
124
- pid_m = tl.program_id(axis=0)
125
- dstates_ptr += pid_b * stride_dstates_batch + pid_h * stride_dstates_head + (nchunks - 1) * stride_dstates_chunk
126
- dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + (nchunks - 1) * stride_dA_cs_chunk
127
- ddA_cs_ptr += pid_b * stride_ddA_cs_batch + pid_h * stride_ddA_cs_head + (nchunks - 1) * stride_ddA_cs_chunk + pid_m
128
- out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head + (nchunks - 1) * stride_out_chunk
129
- dout_ptr += pid_b * stride_dout_batch + pid_h * stride_dout_head + (nchunks - 1) * stride_dout_chunk
130
- if CONVERT_STATES:
131
- states_converted_ptr += pid_b * stride_out_batch + pid_h * stride_out_head + (nchunks - 1) * stride_out_chunk
132
- if HAS_DFINAL_STATES:
133
- dfinal_states_ptr += pid_b * stride_dfinal_states_batch + pid_h * stride_dfinal_states_head
134
- if HAS_DINITSTATES:
135
- dinitstates_ptr += pid_b * stride_dinitstates_batch + pid_h * stride_dinitstates_head
136
- if HAS_SEQ_IDX:
137
- seq_idx_ptr += pid_b * stride_seq_idx_batch
138
-
139
- offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
140
- dstates_ptrs = dstates_ptr + offs_m * stride_dstates_dim
141
- out_ptrs = out_ptr + offs_m * stride_out_dim
142
- dout_ptrs = dout_ptr + offs_m * stride_dout_dim
143
- if CONVERT_STATES:
144
- states_converted_ptrs = states_converted_ptr + offs_m * stride_out_dim
145
-
146
- if HAS_DFINAL_STATES:
147
- dstates = tl.load(dfinal_states_ptr + offs_m * stride_dfinal_states_dim, mask=offs_m < dim, other=0.0).to(tl.float32)
148
- else:
149
- dstates = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32)
150
- tl.store(dstates_ptrs, dstates, mask=offs_m < dim)
151
- if HAS_SEQ_IDX:
152
- seq_idx = tl.load(seq_idx_ptr + (seqlen - 1) * stride_seq_idx_seqlen)
153
- dstates_ptrs -= stride_dstates_chunk
154
- for c in range(nchunks - 1):
155
- dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
156
- scale = tl.exp(dA_cs)
157
- if HAS_SEQ_IDX:
158
- seq_idx_new = tl.load(seq_idx_ptr + (((nchunks - c - 1) * chunk_size - 1) * stride_seq_idx_seqlen))
159
- scale = tl.where(seq_idx_new == seq_idx, scale, 0.0)
160
- seq_idx = seq_idx_new
161
- out = tl.load(out_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
162
- if CONVERT_STATES:
163
- tl.store(states_converted_ptrs, out, mask=offs_m < dim)
164
- ddA = tl.sum(out * dstates) * scale
165
- tl.store(ddA_cs_ptr, ddA)
166
- dout = tl.load(dout_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
167
- dstates = scale * dstates + dout
168
- tl.store(dstates_ptrs, dstates, mask=offs_m < dim)
169
- dout_ptrs -= stride_dout_chunk
170
- dstates_ptrs -= stride_dstates_chunk
171
- dA_cs_ptr -= stride_dA_cs_chunk
172
- ddA_cs_ptr -= stride_ddA_cs_chunk
173
- out_ptrs -= stride_out_chunk
174
- if CONVERT_STATES:
175
- states_converted_ptrs -= stride_out_chunk
176
- if CONVERT_STATES:
177
- out = tl.load(out_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
178
- tl.store(states_converted_ptrs, out, mask=offs_m < dim)
179
- if not HAS_DINITSTATES:
180
- tl.store(ddA_cs_ptr, 0.0)
181
- else:
182
- dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
183
- scale = tl.exp(dA_cs)
184
- if HAS_SEQ_IDX:
185
- scale = tl.where(seq_idx == 0, scale, 0.0)
186
- out = tl.load(out_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
187
- ddA = tl.sum(out * dstates) * scale
188
- tl.store(ddA_cs_ptr, ddA)
189
- dout = tl.load(dout_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
190
- dstates = scale * dstates + dout
191
- tl.store(dinitstates_ptr + offs_m * stride_dinitstates_dim, dstates, mask=offs_m < dim)
192
-
193
-
194
- def _state_passing_fwd(states, dA_chunk_cumsum, initial_states=None, seq_idx=None, chunk_size=None,
195
- out_dtype=None):
196
- batch, nchunks, nheads, dim = states.shape
197
- assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
198
- if initial_states is not None:
199
- assert initial_states.shape == (batch, nheads, dim)
200
- if seq_idx is not None:
201
- assert chunk_size is not None
202
- seqlen = seq_idx.shape[-1]
203
- assert seq_idx.shape == (batch, seqlen)
204
- out_dtype = states.dtype if out_dtype is None else out_dtype
205
- out = torch.empty((batch, nchunks, nheads, dim), device=states.device, dtype=out_dtype)
206
- final_states = torch.empty((batch, nheads, dim), device=states.device, dtype=torch.float32)
207
- grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE']), batch, nheads)
208
- with torch.cuda.device(states.device.index):
209
- _state_passing_fwd_kernel[grid](
210
- states, out, final_states, dA_chunk_cumsum, initial_states, seq_idx,
211
- dim, nchunks, seqlen if seq_idx is not None else 0, chunk_size if seq_idx is not None else 0,
212
- states.stride(0), states.stride(1), states.stride(2), states.stride(3),
213
- out.stride(0), out.stride(1), out.stride(2), out.stride(3),
214
- final_states.stride(0), final_states.stride(1), final_states.stride(2),
215
- dA_chunk_cumsum.stride(0), dA_chunk_cumsum.stride(2), dA_chunk_cumsum.stride(1),
216
- *((initial_states.stride(0), initial_states.stride(1), initial_states.stride(2))
217
- if initial_states is not None else (0, 0, 0)),
218
- *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
219
- HAS_INITSTATES=initial_states is not None,
220
- HAS_SEQ_IDX=seq_idx is not None,
221
- )
222
- return out, final_states
223
-
224
-
225
- def _state_passing_bwd(
226
- states, dA_chunk_cumsum, dout, dfinal_states=None, seq_idx=None, has_initial_states=None,
227
- dstates_dtype=None, states_dtype=None, chunk_size=None
228
- ):
229
- """
230
- states contains the initial_states at index 0. The final states are not included in states.
231
- """
232
- batch, nchunks, nheads, dim = states.shape
233
- assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
234
- assert dout.shape == (batch, nchunks, nheads, dim)
235
- if seq_idx is not None:
236
- assert chunk_size is not None
237
- seqlen = seq_idx.shape[-1]
238
- assert seq_idx.shape == (batch, seqlen)
239
- dstates = torch.empty_like(dout, dtype=dstates_dtype if dstates_dtype is not None else dout.dtype)
240
- if states_dtype is not None and states_dtype != states.dtype:
241
- states_converted = torch.empty_like(states, dtype=dstates_dtype if dstates_dtype is not None else dout.dtype)
242
- assert states_converted.stride() == states.stride()
243
- else:
244
- states_converted = None
245
- if has_initial_states:
246
- dinitstates = torch.empty_like(dstates[:, 0])
247
- else:
248
- dinitstates = None
249
- if dfinal_states is not None:
250
- assert dfinal_states.shape == (batch, nheads, dim)
251
- BLOCK_SIZE_min = 64
252
- n_blocks = (dim + BLOCK_SIZE_min - 1) // BLOCK_SIZE_min
253
- ddA_chunk_cumsum = torch.empty(batch, nheads, nchunks, n_blocks,
254
- dtype=torch.float32, device=dA_chunk_cumsum.device)
255
- grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE']), batch, nheads)
256
- with torch.cuda.device(dout.device.index):
257
- _state_passing_bwd_kernel[grid](
258
- dout, states, dA_chunk_cumsum, dfinal_states, seq_idx,
259
- dstates, ddA_chunk_cumsum, dinitstates, states_converted,
260
- dim, nchunks, seqlen if seq_idx is not None else 0, chunk_size if seq_idx is not None else 0,
261
- dout.stride(0), dout.stride(1), dout.stride(2), dout.stride(3),
262
- states.stride(0), states.stride(1), states.stride(2), states.stride(3),
263
- dA_chunk_cumsum.stride(0), dA_chunk_cumsum.stride(2), dA_chunk_cumsum.stride(1),
264
- *((dfinal_states.stride(0), dfinal_states.stride(1), dfinal_states.stride(2))
265
- if dfinal_states is not None else (0, 0, 0)),
266
- *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
267
- dstates.stride(0), dstates.stride(1), dstates.stride(2), dstates.stride(3),
268
- ddA_chunk_cumsum.stride(0), ddA_chunk_cumsum.stride(2), ddA_chunk_cumsum.stride(1),
269
- *((dinitstates.stride(0), dinitstates.stride(1), dinitstates.stride(2))
270
- if dinitstates is not None else (0, 0, 0)),
271
- CONVERT_STATES=states_converted is not None,
272
- HAS_DFINAL_STATES=dfinal_states is not None,
273
- HAS_DINITSTATES=dinitstates is not None,
274
- HAS_SEQ_IDX=seq_idx is not None,
275
- )
276
- BLOCK_SIZE_actual = _state_passing_bwd_kernel.best_config.kwargs["BLOCK_SIZE"]
277
- n_valid_blocks = (dim + BLOCK_SIZE_actual - 1) // BLOCK_SIZE_actual
278
- ddA_chunk_cumsum = ddA_chunk_cumsum[..., :n_valid_blocks].sum(dim=-1).to(dtype=dA_chunk_cumsum.dtype)
279
- if states_dtype is not None and states_dtype == states.dtype:
280
- states_converted = states
281
- return (dstates, ddA_chunk_cumsum, dinitstates) if states_dtype is None else (dstates, ddA_chunk_cumsum, dinitstates, states_converted)
282
-
283
-
284
- class StatePassingFn(torch.autograd.Function):
285
-
286
- @staticmethod
287
- def forward(ctx, states, dA_chunk_cumsum, initial_states=None):
288
- batch, nchunks, nheads, dim = states.shape
289
- assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
290
- if states.stride(-1) != 1:
291
- states = states.contiguous()
292
- out, final_states = _state_passing_fwd(states, dA_chunk_cumsum, initial_states)
293
- ctx.save_for_backward(out, dA_chunk_cumsum)
294
- ctx.has_initial_states = initial_states is not None
295
- return out, final_states
296
-
297
- @staticmethod
298
- def backward(ctx, dout, dfinal_states):
299
- out, dA_chunk_cumsum = ctx.saved_tensors
300
- batch, nchunks, nheads, dim = out.shape
301
- assert dout.shape == (batch, nchunks, nheads, dim)
302
- assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
303
- assert dfinal_states.shape == (batch, nheads, dim)
304
- if dout.stride(-1) != 1:
305
- dout = dout.contiguous()
306
- dstates, ddA_chunk_cumsum, dinitstates = _state_passing_bwd(
307
- out, dA_chunk_cumsum, dout, dfinal_states=dfinal_states , has_initial_states=ctx.has_initial_states
308
- )
309
- return dstates, ddA_chunk_cumsum, dinitstates
310
-
311
-
312
- def state_passing(states, dA_chunk_cumsum, initial_states=None):
313
- """
314
- Argument:
315
- states: (batch, nchunks, nheads, dim)
316
- dA_chunk_cumsum: (batch, nheads, nchunks)
317
- initial_states: (batch, nheads, dim)
318
- Return:
319
- out: (batch, nchunks, nheads, dim)
320
- final_states: (batch, nheads, dim)
321
- """
322
- return StatePassingFn.apply(states, dA_chunk_cumsum, initial_states)
323
-
324
-
325
- def state_passing_ref(states, dA_chunk_cumsum, initial_states=None):
326
- """
327
- Argument:
328
- states: (batch, nchunks, nheads, dim)
329
- dA_chunk_cumsum: (batch, nheads, nchunks)
330
- initial_states: (batch, nheads, dim)
331
- Return:
332
- out: (batch, nchunks, nheads, dim)
333
- final_states: (batch, nheads, dim)
334
- """
335
- if initial_states is None:
336
- initial_states = torch.zeros_like(states[:, 0])
337
- states = torch.cat([rearrange(initial_states, "b h d -> b 1 h d"), states], dim=1)
338
- dA_chunk_cumsum = F.pad(dA_chunk_cumsum, (1, 0))
339
- dA_chunk_cumsum = torch.cumsum(dA_chunk_cumsum, dim=-1)
340
- nchunks = dA_chunk_cumsum.shape[-1]
341
- # (batch, nheads, nchunks, nchunks)
342
- dt_chunk_segment_sum = dA_chunk_cumsum[:, :, :, None] - dA_chunk_cumsum[:, :, None, :]
343
- # (batch, nheads, nchunks, nchunks)
344
- decay_chunk = torch.exp(dt_chunk_segment_sum)
345
- causal_mask = torch.tril(torch.ones(nchunks, nchunks, device=states.device, dtype=bool), diagonal=0)
346
- decay_chunk = decay_chunk.masked_fill(~causal_mask, 0)
347
- out = torch.einsum("bhzc,bchd->bzhd", decay_chunk.to(dtype=states.dtype), states)
348
- return out[:, :-1], out[:, -1]
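# --- Added sanity check (not part of the deleted file): state_passing_ref realizes the
# chunk-level recurrence  running <- exp(dA_chunk_cumsum[..., c]) * running + states[:, c],
# where out[:, c] is the state entering chunk c. Small CPU check against a direct loop:
import torch

batch, nchunks, nheads, dim = 2, 5, 3, 7
states = torch.randn(batch, nchunks, nheads, dim)
dA_cs = -torch.rand(batch, nheads, nchunks)          # negative so exp(.) <= 1
init = torch.randn(batch, nheads, dim)

out_ref, final_ref = state_passing_ref(states, dA_cs, init)

running, outs = init.clone(), []
for c in range(nchunks):
    outs.append(running.clone())
    running = torch.exp(dA_cs[:, :, c])[..., None] * running + states[:, c]

assert torch.allclose(out_ref, torch.stack(outs, dim=1), atol=1e-5)
assert torch.allclose(final_ref, running, atol=1e-5)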
build/torch210-cxx11-cu126-x86_64-linux/utils/__init__.py DELETED
File without changes
build/torch210-cxx11-cu126-x86_64-linux/utils/generation.py DELETED
@@ -1,390 +0,0 @@
1
- # Copyright (c) 2023, Albert Gu, Tri Dao.
2
- import gc
3
- import time
4
- from collections import namedtuple
5
- from dataclasses import dataclass, field
6
- from functools import partial
7
- from typing import Callable, Optional, Sequence, Union
8
-
9
- import torch
10
- import torch.nn.functional as F
11
- from einops import rearrange, repeat
12
- from torch import Tensor
13
- from torch.profiler import ProfilerActivity, profile, record_function
14
- from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput, TextStreamer
15
-
16
-
17
- @dataclass
18
- class InferenceParams:
19
- """Inference parameters that are passed to the main model in order
20
- to efficiently calculate and store the context during inference."""
21
-
22
- max_seqlen: int
23
- max_batch_size: int
24
- seqlen_offset: int = 0
25
- batch_size_offset: int = 0
26
- key_value_memory_dict: dict = field(default_factory=dict)
27
- lengths_per_sample: Optional[Tensor] = None
28
-
29
- def reset(self, max_seqlen, max_batch_size):
30
- self.max_seqlen = max_seqlen
31
- self.max_batch_size = max_batch_size
32
- self.seqlen_offset = 0
33
- if self.lengths_per_sample is not None:
34
- self.lengths_per_sample.zero_()
35
-
36
-
37
- def modify_logits_for_min_p_filtering(logits, min_p):
38
- """Set the logits below the min_p threshold to -inf. Done in-place."""
39
- if min_p <= 0.0 or min_p >= 1.0:
40
- return
41
- indices_to_remove = logits < min_p
42
- logits.masked_fill_(indices_to_remove, float("-Inf"))
43
- # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
44
- # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231
45
- def modify_logits_for_top_k_filtering(logits, top_k):
46
- """Set the logits outside the top-k to -inf. Done in-place."""
47
- indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
48
- logits.masked_fill_(indices_to_remove, float("-Inf"))
49
-
50
-
51
- # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
52
- # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170
53
- def modify_logits_for_top_p_filtering(logits, top_p):
54
- """Set the logits outside the top-p nucleus to -inf. Done in-place."""
55
- if top_p <= 0.0 or top_p >= 1.0:
56
- return
57
- # First sort and calculate cumulative sum of probabilities.
58
- sorted_logits, sorted_indices = torch.sort(logits, descending=False)
59
- cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
60
- # Remove tokens whose cumulative probability (summed from the smallest logits up) is <= 1 - top_p, keeping the top-p nucleus
61
- sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
62
- # scatter sorted tensors to original indexing
63
- indices_to_remove = sorted_indices_to_remove.scatter(
64
- 1, sorted_indices, sorted_indices_to_remove
65
- )
66
- logits.masked_fill_(indices_to_remove, float("-inf"))
67
-
68
-
69
- def modify_logit_for_repetition_penalty(logits, prev_output_tokens, repetition_penalty=1.0):
70
- """Apply repetition penalty. See https://arxiv.org/abs/1909.05858
71
- logits: (batch_size, vocab_size)
72
- prev_output_tokens: (batch_size, seq_len)
73
- """
74
- if repetition_penalty == 1.0:
75
- return logits
76
- score = torch.gather(logits, 1, prev_output_tokens)
77
- # if score < 0, multiply by the repetition penalty (rather than divide) so the previous token's probability is reduced
78
- score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty)
79
- logits.scatter_(1, prev_output_tokens, score)
80
- return logits
81
-
82
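# --- Added worked example (not part of the deleted file): with repetition_penalty > 1,
# logits of previously generated tokens are pushed down (divided when positive,
# multiplied when negative) while all other logits are left untouched.
import torch

logits = torch.tensor([[2.0, -1.0, 0.5]])
prev = torch.tensor([[0, 1]])  # tokens 0 and 1 were already generated
out = modify_logit_for_repetition_penalty(logits.clone(), prev, repetition_penalty=2.0)
# token 0: 2.0 / 2.0 = 1.0; token 1: -1.0 * 2.0 = -2.0; token 2 unchanged
assert torch.allclose(out, torch.tensor([[1.0, -2.0, 0.5]]))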
-
83
- def sample(logits, top_k=1, top_p=0.0, min_p=0.0, temperature=1.0):
84
- """Sample from top-k logits.
85
- Arguments:
86
- logits: Tensor of shape (batch_size, vocab_size)
87
- """
88
- if top_k == 1: # Short-circuit for greedy decoding
89
- return logits.argmax(dim=-1)
90
- else:
91
- if top_p > 0.0:
92
- assert top_p <= 1.0, "top-p should be in (0, 1]."
93
- if top_k > 0:
94
- top_k = min(top_k, logits.size(-1)) # Safety check
95
- logits_top, indices = torch.topk(logits, top_k, dim=-1)
96
- if temperature != 1.0:
97
- logits_top /= temperature
98
- modify_logits_for_top_p_filtering(logits_top, top_p)
99
- return indices[
100
- torch.arange(indices.shape[0], device=indices.device),
101
- torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1),
102
- ]
103
- else:
104
- if min_p > 0.0:
105
- logits_top = logits.clone()
106
- max_prob = logits_top[..., 0].item()
107
- min_prob = max_prob * min_p
108
- modify_logits_for_min_p_filtering(logits_top, min_prob)
109
- if temperature != 1.0:
110
- logits_top /= temperature
111
- return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1)
112
- # Clone so that when we modify for top_p we don't change the original logits
113
- logits_top = logits / temperature if temperature != 1.0 else logits.clone()
114
- modify_logits_for_top_p_filtering(logits_top, top_p)
115
- return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(
116
- dim=-1
117
- )
118
-
119
-
120
- @torch.inference_mode()
121
- def decode(
122
- input_ids,
123
- model,
124
- max_length,
125
- top_k=1,
126
- top_p=0.0,
127
- min_p=0.0,
128
- temperature=1.0,
129
- repetition_penalty=1.0,
130
- eos_token_id=None,
131
- teacher_outputs=None,
132
- vocab_size=None,
133
- cg=False,
134
- enable_timing=False,
135
- output_scores=False,
136
- streamer: Optional[TextStreamer] = None
137
- ):
138
- """Decoding, either greedy or with top-k or top-p sampling.
139
- If top-k = 0, don't limit the number of candidates (pure sampling).
140
- Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
141
- then top-p.
142
- We assume that all sequences in the same batch have the same length.
143
-
144
- Arguments:
145
- input_ids: (batch, seq_len)
146
- max_length: int
147
- teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the
148
- logits, the next token is taken from the teacher_outputs. Useful for testing.
149
- Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
150
- sequences: (batch, max_length)
151
- scores: tuple of tensors, each of shape (batch, vocab_size)
152
- """
153
- if streamer is not None:
154
- streamer.put(input_ids.cpu())
155
-
156
- batch_size, seqlen_og = input_ids.shape
157
- teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0
158
- if cg:
159
- if not hasattr(model, "_decoding_cache"):
160
- model._decoding_cache = None
161
- model._decoding_cache = update_graph_cache(
162
- model,
163
- model._decoding_cache,
164
- batch_size,
165
- seqlen_og,
166
- max_length,
167
- )
168
- inference_params = model._decoding_cache.inference_params
169
- inference_params.reset(max_length, batch_size)
170
- else:
171
- inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size)
172
-
173
- def get_logits(input_ids, inference_params):
174
- decoding = inference_params.seqlen_offset > 0
175
- if decoding:
176
- position_ids = torch.full(
177
- (batch_size, 1),
178
- inference_params.seqlen_offset,
179
- dtype=torch.long,
180
- device=input_ids.device,
181
- )
182
- else:
183
- position_ids = None
184
- if not cg or not decoding:
185
- logits = model(
186
- input_ids,
187
- position_ids=position_ids,
188
- inference_params=inference_params,
189
- num_last_tokens=1,
190
- ).logits.squeeze(dim=1)
191
- else:
192
- logits = model._decoding_cache.run(
193
- input_ids, position_ids, inference_params.seqlen_offset
194
- ).squeeze(dim=1)
195
- return logits[..., :vocab_size] if vocab_size is not None else logits
196
-
197
- def sample_tokens(logits, inference_params):
198
- if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset:
199
- token = sample(logits, top_k=top_k, top_p=top_p, min_p=min_p, temperature=temperature)
200
- else:
201
- token = teacher_outputs[:, inference_params.seqlen_offset]
202
- # return rearrange(token, "b -> b 1")
203
- return token.unsqueeze(1)
204
-
205
- def should_stop(current_token, inference_params):
206
- if inference_params.seqlen_offset == 0:
207
- return False
208
- if eos_token_id is not None and (current_token == eos_token_id).all():
209
- return True
210
- if inference_params.seqlen_offset >= max_length - 1:
211
- return True
212
- return False
213
-
214
- start = torch.cuda.Event(enable_timing=enable_timing)
215
- end = torch.cuda.Event(enable_timing=enable_timing)
216
-
217
- if enable_timing:
218
- start.record()
219
- scores, sequences = [], [input_ids]
220
- sequences_cat = input_ids
221
- while not should_stop(sequences[-1], inference_params):
222
- logits = get_logits(sequences[-1], inference_params)
223
- if output_scores:
224
- scores.append(logits.clone())
225
- inference_params.seqlen_offset += sequences[-1].shape[1]
226
- if repetition_penalty == 1.0:
227
- sampled_tokens = sample_tokens(logits, inference_params)
228
- else:
229
- logits = modify_logit_for_repetition_penalty(
230
- logits, sequences_cat, repetition_penalty
231
- )
232
- sampled_tokens = sample_tokens(logits, inference_params)
233
- sequences_cat = torch.cat([sequences_cat, sampled_tokens], dim=1)
234
- sequences.append(sampled_tokens)
235
- if streamer is not None:
236
- streamer.put(sampled_tokens.cpu())
237
- if streamer is not None:
238
- streamer.end()
239
- if enable_timing:
240
- end.record()
241
- torch.cuda.synchronize()
242
- print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms")
243
- output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput
244
- return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores))
245
-
246
-
247
- class GenerationMixin:
248
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
249
- raise NotImplementedError
250
-
251
- def generate(
252
- self,
253
- input_ids,
254
- max_length,
255
- top_k=1,
256
- top_p=0.0,
257
- min_p=0.0,
258
- temperature=1.0,
259
- return_dict_in_generate=False,
260
- output_scores=False,
261
- **kwargs,
262
- ):
263
- output = decode(
264
- input_ids, self, max_length, top_k=top_k, top_p=top_p, min_p = min_p, temperature=temperature, output_scores=output_scores, **kwargs
265
- )
266
- if not output_scores:
267
- output.scores = None
268
- return output if return_dict_in_generate else output.sequences
269
-
270
-
271
- @dataclass
272
- class DecodingCGCache:
273
- max_batch_size: int = 0
274
- max_seqlen: int = 0
275
- device = None
276
- dtype = None
277
- callables: dict = field(default_factory=dict)
278
- mempool = None
279
- inference_params: Optional[InferenceParams] = None
280
- run: Optional[Callable] = None
281
-
282
-
283
- @torch.inference_mode()
284
- def update_graph_cache(
285
- model,
286
- cache,
287
- batch_size,
288
- seqlen_og,
289
- max_seqlen,
290
- decoding_seqlens=(1,),
291
- dtype=None,
292
- n_warmups=2,
293
- ):
294
- if cache is None:
295
- cache = DecodingCGCache()
296
- param_example = next(iter(model.parameters()))
297
- device = param_example.device
298
- if dtype is None:
299
- dtype = param_example.dtype
300
- if (
301
- (device, dtype) != (cache.device, cache.dtype)
302
- or batch_size > cache.max_batch_size
303
- or max_seqlen > cache.max_seqlen
304
- ): # Invalidate the cache
305
- cache.callables = {}
306
- cache.mempool = None
307
- cache.inference_params = None
308
- gc.collect()
309
- cache.device, cache.dtype = device, dtype
310
- cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen
311
- assert hasattr(model, "allocate_inference_cache"), "CUDA graph decoding requires that the model has a method allocate_inference_cache"
312
- inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype)
313
- lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device)
314
- cache.inference_params = InferenceParams(
315
- max_seqlen=max_seqlen,
316
- max_batch_size=batch_size,
317
- seqlen_offset=seqlen_og,
318
- key_value_memory_dict=inf_cache,
319
- lengths_per_sample=lengths_per_sample,
320
- )
321
- cache.mempool = torch.cuda.graphs.graph_pool_handle()
322
- for decoding_seqlen in decoding_seqlens:
323
- if (batch_size, decoding_seqlen) not in cache.callables:
324
- cache.callables[batch_size, decoding_seqlen] = capture_graph(
325
- model,
326
- cache.inference_params,
327
- batch_size,
328
- max_seqlen,
329
- decoding_seqlen=decoding_seqlen,
330
- mempool=cache.mempool,
331
- n_warmups=n_warmups,
332
- )
333
-
334
- def dispatch(input_ids, position_ids, seqlen):
335
- batch_size, decoding_seqlen = input_ids.shape[:2]
336
- return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen)
337
-
338
- cache.run = dispatch
339
- cache.inference_params.seqlen_offset = 0 # Reset so it's not confusing
340
- return cache
341
-
342
-
343
- def capture_graph(
344
- model, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2
345
- ):
346
- device = next(iter(model.parameters())).device
347
- input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
348
- position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
349
- seqlen_offset_og = inference_params.seqlen_offset
350
- inference_params.seqlen_offset = max_seqlen - decoding_seqlen
351
- inference_params.lengths_per_sample[:] = inference_params.seqlen_offset
352
-
353
- # Warmup before capture
354
- s = torch.cuda.Stream()
355
- s.wait_stream(torch.cuda.current_stream())
356
- with torch.cuda.stream(s):
357
- for _ in range(n_warmups):
358
- logits = model(
359
- input_ids,
360
- position_ids=position_ids,
361
- inference_params=inference_params,
362
- num_last_tokens=decoding_seqlen,
363
- ).logits
364
- s.synchronize()
365
- # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0,
366
- # which requires that graph launches and non-captured launches do not overlap (I think,
367
- # that's how I interpret the documentation). I'm not sure if this is required.
368
- if torch.distributed.is_initialized():
369
- torch.distributed.barrier()
370
- torch.cuda.current_stream().wait_stream(s)
371
- # Captures the graph
372
- # To allow capture, automatically sets a side stream as the current stream in the context
373
- graph = torch.cuda.CUDAGraph()
374
- with torch.cuda.graph(graph, pool=mempool):
375
- logits = model(
376
- input_ids,
377
- position_ids=position_ids,
378
- inference_params=inference_params,
379
- num_last_tokens=decoding_seqlen,
380
- ).logits
381
-
382
- def run(new_input_ids, new_position_ids, seqlen):
383
- inference_params.lengths_per_sample[:] = seqlen
384
- input_ids.copy_(new_input_ids)
385
- position_ids.copy_(new_position_ids)
386
- graph.replay()
387
- return logits.clone()
388
-
389
- inference_params.seqlen_offset = seqlen_offset_og
390
- return run
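Editor's note (not part of the diff): a minimal sketch of how the decoding utilities above are typically driven through `GenerationMixin.generate`. It assumes the package exposes `MambaLMHeadModel` (as in the `__init__.py` files elsewhere in this diff) and uses a public checkpoint / tokenizer pair as placeholders.

```python
import torch
from transformers import AutoTokenizer

from mamba_ssm import MambaLMHeadModel  # assumed export, mirroring the package __init__.py in this diff

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")  # placeholder tokenizer
model = MambaLMHeadModel.from_pretrained(
    "state-spaces/mamba-130m", device="cuda", dtype=torch.float16     # placeholder checkpoint
)

input_ids = tokenizer("Structured state space models", return_tensors="pt").input_ids.to("cuda")
out = model.generate(
    input_ids,
    max_length=64,
    top_k=0,                  # 0 disables top-k, leaving nucleus (top-p) filtering only
    top_p=0.9,
    temperature=0.8,
    repetition_penalty=1.1,   # forwarded to decode() through **kwargs
    cg=True,                  # capture the per-token decoding step in a CUDA graph
    return_dict_in_generate=True,
    output_scores=True,
)
print(tokenizer.decode(out.sequences[0]))
```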
build/torch210-cxx11-cu126-x86_64-linux/utils/hf.py DELETED
@@ -1,23 +0,0 @@
1
- import json
2
-
3
- import torch
4
-
5
- from transformers.utils import WEIGHTS_NAME, CONFIG_NAME
6
- from transformers.utils.hub import cached_file
7
-
8
-
9
- def load_config_hf(model_name):
10
- resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False)
11
- return json.load(open(resolved_archive_file))
12
-
13
-
14
- def load_state_dict_hf(model_name, device=None, dtype=None):
15
- # If not fp32, then we don't want to load directly to the GPU
16
- mapped_device = "cpu" if dtype not in [torch.float32, None] else device
17
- resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
18
- state_dict = torch.load(resolved_archive_file, map_location=mapped_device)
19
- # Convert dtype before moving to GPU to save memory
20
- if dtype is not None:
21
- state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
22
- state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
23
- return state_dict
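Editor's note: a short sketch of how these two helpers combine (this is what `MambaLMHeadModel.from_pretrained` in `models/mixer_seq_simple.py` does internally); the model id is a placeholder.

```python
import torch

config_data = load_config_hf("state-spaces/mamba-130m")   # plain dict parsed from config.json
state_dict = load_state_dict_hf(
    "state-spaces/mamba-130m", device="cuda", dtype=torch.float16
)  # loaded to CPU first for non-fp32 dtypes, then cast and moved
```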
build/torch210-cxx11-cu126-x86_64-linux/utils/torch.py DELETED
@@ -1,21 +0,0 @@
1
- import torch
2
- from functools import partial
3
- from typing import Callable
4
-
5
- def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
6
- def decorator(*args, **kwargs):
7
- if cuda_amp_deprecated:
8
- kwargs["device_type"] = "cuda"
9
- return dec(*args, **kwargs)
10
- return decorator
11
-
12
-
13
- if hasattr(torch.amp, "custom_fwd"): # type: ignore[attr-defined]
14
- deprecated = True
15
- from torch.amp import custom_fwd, custom_bwd # type: ignore[attr-defined]
16
- else:
17
- deprecated = False
18
- from torch.cuda.amp import custom_fwd, custom_bwd
19
-
20
- custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
21
- custom_bwd = custom_amp_decorator(custom_bwd, deprecated)
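Editor's note: a small sketch of what this shim buys. The same decorators work whether PyTorch provides the new `torch.amp` API (which needs `device_type`, injected by the wrapper) or the older `torch.cuda.amp` API; `_Double` is purely illustrative.

```python
import torch


class _Double(torch.autograd.Function):
    @staticmethod
    @custom_fwd            # wrapper adds device_type="cuda" when the new torch.amp API is in use
    def forward(ctx, x):
        return 2 * x

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_out):
        return 2 * grad_out
```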
build/torch210-cxx11-cu128-x86_64-linux/__init__.py DELETED
@@ -1,14 +0,0 @@
1
- __version__ = "2.2.4"
2
-
3
- from .ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
4
- from .modules.mamba_simple import Mamba
5
- from .modules.mamba2 import Mamba2
6
- from .models.mixer_seq_simple import MambaLMHeadModel
7
-
8
- __all__ = [
9
- "selective_scan_fn",
10
- "mamba_inner_fn",
11
- "Mamba",
12
- "Mamba2",
13
- "MambaLMHeadModel",
14
- ]
build/torch210-cxx11-cu128-x86_64-linux/_mamba_ssm_b2a7fd5.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2cebad781003a612eea29f35ebaf4a1905057ac6e20cdd12a216e4e403b34095
3
- size 610662240
build/torch210-cxx11-cu128-x86_64-linux/_ops.py DELETED
@@ -1,9 +0,0 @@
1
- import torch
2
- from . import _mamba_ssm_b2a7fd5
3
- ops = torch.ops._mamba_ssm_b2a7fd5
4
-
5
- def add_op_namespace_prefix(op_name: str):
6
- """
7
- Prefix op by namespace.
8
- """
9
- return f"_mamba_ssm_b2a7fd5::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/distributed/__init__.py DELETED
File without changes
build/torch210-cxx11-cu128-x86_64-linux/distributed/distributed_utils.py DELETED
@@ -1,144 +0,0 @@
1
- from typing import Optional
2
-
3
- import torch
4
- from torch import Tensor
5
- from torch.distributed import ProcessGroup
6
-
7
- # `all_gather_into_tensor` and `reduce_scatter_tensor` are new placeholders for
8
- # `_all_gather_base` and `_reduce_scatter_base`. They require the most recent
9
- # version of PyTorch. The following 4 lines are for backward compatibility with
10
- # older PyTorch.
11
- if "all_gather_into_tensor" not in dir(torch.distributed):
12
- torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base
13
- if "reduce_scatter_tensor" not in dir(torch.distributed):
14
- torch.distributed.reduce_scatter_tensor = torch.distributed._reduce_scatter_base
15
-
16
-
17
- # Raw operation, does not support autograd, but does support async
18
- def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
19
- world_size = torch.distributed.get_world_size(process_group)
20
- output = torch.empty(
21
- world_size * input_.shape[0], *input_.shape[1:], dtype=input_.dtype, device=input_.device
22
- )
23
- handle = torch.distributed.all_gather_into_tensor(
24
- output, input_.contiguous(), group=process_group, async_op=async_op
25
- )
26
- return output, handle
27
-
28
-
29
- # Raw operation, does not support autograd, but does support async
30
- def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
31
- world_size = torch.distributed.get_world_size(process_group)
32
- assert input_.shape[0] % world_size == 0
33
- output = torch.empty(
34
- input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device
35
- )
36
- handle = torch.distributed.reduce_scatter_tensor(
37
- output, input_.contiguous(), group=process_group, async_op=async_op
38
- )
39
- return output, handle
40
-
41
-
42
- # Raw operation, does not support autograd, but does support async
43
- def all_reduce_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
44
- input_ = input_.contiguous()
45
- handle = torch.distributed.all_reduce(input_, group=process_group, async_op=async_op)
46
- return input_, handle
47
-
48
-
49
- class AllGatherFunc(torch.autograd.Function):
50
- """Gather the input from sequence parallel region and concatenate."""
51
-
52
- @staticmethod
53
- def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
54
- ctx.process_group = process_group
55
- output, _ = all_gather_raw(input_, process_group)
56
- return output
57
-
58
- @staticmethod
59
- def backward(ctx, grad_output: Tensor):
60
- grad_input, _ = reduce_scatter_raw(grad_output, ctx.process_group)
61
- return grad_input, None
62
-
63
-
64
- # Supports autograd, but does not support async
65
- all_gather = AllGatherFunc.apply
66
-
67
-
68
- class ReduceScatterFunc(torch.autograd.Function):
69
- """Reduce scatter the input from the sequence parallel region and concatenate."""
70
-
71
- @staticmethod
72
- def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
73
- ctx.process_group = process_group
74
- output, _ = reduce_scatter_raw(input_, process_group)
75
- return output
76
-
77
- @staticmethod
78
- def backward(ctx, grad_output: Tensor):
79
- grad_input, _ = all_gather_raw(grad_output, ctx.process_group)
80
- return grad_input, None
81
-
82
-
83
- # Supports autograd, but does not support async
84
- reduce_scatter = ReduceScatterFunc.apply
85
-
86
-
87
- class AllReduceFunc(torch.autograd.Function):
88
- """Gather the input from sequence parallel region and concatenate."""
89
-
90
- @staticmethod
91
- def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
92
- ctx.process_group = process_group
93
- output, _ = all_reduce_raw(input_, process_group)
94
- return output
95
-
96
- @staticmethod
97
- def backward(ctx, grad_output: Tensor):
98
- return grad_output, None
99
-
100
-
101
- # Supports autograd, but does not support async
102
- all_reduce = AllReduceFunc.apply
103
-
104
-
105
- def sync_shared_params(model: torch.nn.Module, process_group: ProcessGroup):
106
- # We want to iterate over parameters with _shared_params=True in the same order,
107
- # as different ranks might have different number of parameters (e.g., only rank 0 has bias).
108
- params_shared = {
109
- name: p for name, p in model.named_parameters() if getattr(p, "_shared_params", False)
110
- }
111
- for _, p in sorted(params_shared.items()):
112
- with torch.no_grad():
113
- # Broadcast needs src to be global rank, not group rank
114
- torch.distributed.broadcast(
115
- p, src=torch.distributed.get_global_rank(process_group, 0), group=process_group
116
- )
117
-
118
-
119
- # Ref: https://github.com/NVIDIA/Megatron-LM/blob/52e636888cccc41e931251c417a7181fc36de926/megatron/optimizer/optimizer.py#L256
120
- def allreduce_sequence_parallel_grad(model: torch.nn.Module, process_group: ProcessGroup):
121
- # We want to iterate over parameters with _sequence_parallel=True in the same order,
122
- # as different ranks might have different number of parameters (e.g., only rank 0 has bias).
123
- params_seqparallel = {
124
- name: p for name, p in model.named_parameters() if getattr(p, "_sequence_parallel", False)
125
- }
126
- grads = [p.grad for _, p in sorted(params_seqparallel.items())]
127
- if grads:
128
- with torch.no_grad():
129
- coalesced = torch._utils._flatten_dense_tensors(grads)
130
- torch.distributed.all_reduce(coalesced, group=process_group)
131
- for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)):
132
- buf.copy_(synced)
133
-
134
-
135
- def get_dim_for_local_rank(dim: int, world_size: int, local_rank: int, multiple_of: int = 1) -> int:
136
- """Get the dim for the local rank derived from splitting dim on world_size processes.
137
-
138
- The split may not be even across the world_size processes.
139
- """
140
- multiple = dim // multiple_of
141
- div = multiple // world_size
142
- mod = multiple % world_size
143
- local_multiple = div + int(local_rank < mod)
144
- return local_multiple * multiple_of
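Editor's note: a worked example of the uneven split handled by `get_dim_for_local_rank` (numbers chosen only for illustration).

```python
# Splitting dim=1280 with multiple_of=64 across 3 ranks:
# multiple = 1280 // 64 = 20, div = 20 // 3 = 6, mod = 20 % 3 = 2,
# so ranks 0 and 1 get (6 + 1) * 64 = 448 and rank 2 gets 6 * 64 = 384.
assert get_dim_for_local_rank(1280, world_size=3, local_rank=0, multiple_of=64) == 448
assert get_dim_for_local_rank(1280, world_size=3, local_rank=2, multiple_of=64) == 384
```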
build/torch210-cxx11-cu128-x86_64-linux/distributed/tensor_parallel.py DELETED
@@ -1,296 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
- # The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
3
- from typing import Optional
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from torch import Tensor
9
- from torch.distributed import ProcessGroup
10
- from ..utils.torch import custom_bwd, custom_fwd
11
-
12
- from einops import rearrange
13
-
14
- from ..distributed.distributed_utils import (
15
- all_gather_raw,
16
- all_reduce,
17
- all_reduce_raw,
18
- reduce_scatter,
19
- reduce_scatter_raw,
20
- )
21
-
22
-
23
- class ParallelLinearFunc(torch.autograd.Function):
24
- @staticmethod
25
- @custom_fwd
26
- def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
27
- """
28
- If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
29
- with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
30
- """
31
- ctx.compute_weight_gradient = weight.requires_grad
32
- ctx.process_group = process_group
33
- ctx.sequence_parallel = sequence_parallel
34
-
35
- if torch.is_autocast_enabled():
36
- x = x.to(dtype=torch.get_autocast_gpu_dtype())
37
- x = x.contiguous()
38
- if process_group is not None and sequence_parallel:
39
- # We want to kick off the all_gather early, before weight dtype conversion
40
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
41
- else:
42
- total_x = x
43
-
44
- if torch.is_autocast_enabled():
45
- weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
46
- bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None
47
- weight = weight.contiguous()
48
- if process_group is not None and sequence_parallel:
49
- handle_x.wait()
50
- batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
51
- batch_dim = batch_shape.numel()
52
- # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
53
- output = F.linear(total_x, weight, bias)
54
- if ctx.compute_weight_gradient:
55
- ctx.save_for_backward(x, weight)
56
- else:
57
- ctx.save_for_backward(weight)
58
- return output
59
-
60
- @staticmethod
61
- @custom_bwd
62
- def backward(ctx, grad_output):
63
- grad_output = grad_output.contiguous()
64
- process_group = ctx.process_group
65
- sequence_parallel = ctx.sequence_parallel
66
- if ctx.compute_weight_gradient:
67
- x, weight = ctx.saved_tensors
68
- if process_group is not None and sequence_parallel:
69
- total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
70
- else:
71
- total_x = x
72
- else:
73
- (weight,) = ctx.saved_tensors
74
- total_x = None
75
- batch_shape = grad_output.shape[:-1]
76
- batch_dim = batch_shape.numel()
77
- grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
78
- if ctx.needs_input_grad[0]:
79
- grad_input = F.linear(grad_output, weight.t())
80
- grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
81
- if process_group is not None:
82
- reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
83
- grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
84
- else:
85
- grad_input = None
86
- if ctx.needs_input_grad[1]:
87
- assert ctx.compute_weight_gradient
88
- if process_group is not None and sequence_parallel:
89
- handle_x.wait()
90
- grad_weight = torch.einsum(
91
- "bo,bi->oi", grad_output, total_x.reshape(batch_dim, total_x.shape[-1])
92
- )
93
- else:
94
- grad_weight = None
95
- grad_bias = grad_output.sum(dim=0) if ctx.needs_input_grad[2] else None
96
- if process_group is not None and ctx.needs_input_grad[0]:
97
- handle_grad_input.wait()
98
- return grad_input, grad_weight, grad_bias, None, None
99
-
100
-
101
- def parallel_linear_func(
102
- x: Tensor,
103
- weight: Tensor,
104
- bias: Optional[Tensor] = None,
105
- process_group: Optional[ProcessGroup] = None,
106
- sequence_parallel: bool = True,
107
- ):
108
- return ParallelLinearFunc.apply(x, weight, bias, process_group, sequence_parallel)
109
-
110
-
111
- class ColumnParallelLinear(nn.Linear):
112
- def __init__(
113
- self,
114
- in_features: int,
115
- out_features: int,
116
- process_group: ProcessGroup,
117
- bias: bool = True,
118
- sequence_parallel=True,
119
- multiple_of=1,
120
- device=None,
121
- dtype=None,
122
- ) -> None:
123
- world_size = torch.distributed.get_world_size(process_group)
124
- if out_features % multiple_of:
125
- raise ValueError(f"out_features ({out_features}) must be a multiple of {multiple_of}")
126
- multiple = out_features // multiple_of
127
- # We want to split @multiple across world_size, but it could be an uneven split
128
- div = multiple // world_size
129
- mod = multiple % world_size
130
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
131
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
132
- super().__init__(
133
- in_features, local_multiple * multiple_of, bias=bias, device=device, dtype=dtype
134
- )
135
- self.process_group = process_group
136
- self.sequence_parallel = sequence_parallel
137
-
138
- def forward(self, x):
139
- # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
140
- # we do an all_gather of x before doing the matmul.
141
- # If not, then the input is already gathered.
142
- return parallel_linear_func(
143
- x,
144
- self.weight,
145
- self.bias,
146
- process_group=self.process_group,
147
- sequence_parallel=self.sequence_parallel,
148
- )
149
-
150
-
151
- class RowParallelLinear(nn.Linear):
152
- def __init__(
153
- self,
154
- in_features: int,
155
- out_features: int,
156
- process_group: ProcessGroup,
157
- bias: bool = True,
158
- sequence_parallel=True,
159
- multiple_of=1,
160
- device=None,
161
- dtype=None,
162
- ) -> None:
163
- world_size = torch.distributed.get_world_size(process_group)
164
- rank = torch.distributed.get_rank(process_group)
165
- if in_features % multiple_of:
166
- raise ValueError(f"in_features ({in_features}) must be a multiple of {multiple_of}")
167
- multiple = in_features // multiple_of
168
- # We want to split @multiple across world_size, but it could be an uneven split
169
- div = multiple // world_size
170
- mod = multiple % world_size
171
- # The first @mod ranks get @div + 1 copies, the rest get @div copies
172
- local_multiple = div + int(torch.distributed.get_rank(process_group) < mod)
173
- # Only rank 0 will have bias
174
- super().__init__(
175
- local_multiple * multiple_of,
176
- out_features,
177
- bias=bias and rank == 0,
178
- device=device,
179
- dtype=dtype,
180
- )
181
- self.process_group = process_group
182
- self.sequence_parallel = sequence_parallel
183
-
184
- def forward(self, x):
185
- """
186
- We're doing Tensor Parallel with sequence parallelism: we do the matmul and then
187
- a reduce_scatter of the result.
188
- """
189
- out = parallel_linear_func(x, self.weight, self.bias)
190
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
191
- return reduce_fn(out, self.process_group)
192
-
193
-
194
- class VocabParallelEmbedding(nn.Embedding):
195
- def __init__(self, num_embeddings, *args, process_group=None, padding_idx=None, **kwargs):
196
- self.process_group = process_group
197
- if process_group is not None:
198
- world_size = torch.distributed.get_world_size(process_group)
199
- if num_embeddings % world_size != 0:
200
- raise ValueError(
201
- f"num_embeddings ({num_embeddings}) must be divisible by "
202
- f"world_size ({world_size})"
203
- )
204
- if world_size > 1 and padding_idx is not None:
205
- raise RuntimeError("ParallelEmbedding does not support padding_idx")
206
- else:
207
- world_size = 1
208
- super().__init__(num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs)
209
-
210
- def forward(self, input: Tensor) -> Tensor:
211
- if self.process_group is None:
212
- return super().forward(input)
213
- else:
214
- rank = torch.distributed.get_rank(self.process_group)
215
- vocab_size = self.num_embeddings
216
- vocab_start_index, vocab_end_index = rank * vocab_size, (rank + 1) * vocab_size
217
- # Mask out-of-range vocab ids (True means the id belongs to another rank and must be masked).
218
- input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
219
- input = input - vocab_start_index
220
- input[input_ids_mask] = 0
221
- embeddings = super().forward(input)
222
- embeddings[input_ids_mask] = 0.0
223
- return embeddings
224
-
225
-
226
- class ColumnParallelEmbedding(nn.Embedding):
227
- def __init__(self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs):
228
- self.process_group = process_group
229
- if process_group is not None:
230
- world_size = torch.distributed.get_world_size(process_group)
231
- if embedding_dim % world_size != 0:
232
- raise ValueError(
233
- f"embedding_dim ({embedding_dim}) must be divisible by "
234
- f"world_size ({world_size})"
235
- )
236
- else:
237
- world_size = 1
238
- super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs)
239
-
240
-
241
- class ParallelEmbeddings(nn.Module):
242
- def __init__(
243
- self,
244
- embed_dim,
245
- vocab_size,
246
- max_position_embeddings,
247
- process_group,
248
- padding_idx=None,
249
- sequence_parallel=True,
250
- device=None,
251
- dtype=None,
252
- ):
253
- """
254
- If max_position_embeddings <= 0, there's no position embeddings
255
- """
256
- factory_kwargs = {"device": device, "dtype": dtype}
257
- super().__init__()
258
- self.process_group = process_group
259
- self.sequence_parallel = sequence_parallel
260
- self.word_embeddings = VocabParallelEmbedding(
261
- vocab_size,
262
- embed_dim,
263
- padding_idx=padding_idx,
264
- process_group=process_group,
265
- **factory_kwargs,
266
- )
267
- self.max_position_embeddings = max_position_embeddings
268
- if self.max_position_embeddings > 0:
269
- self.position_embeddings = ColumnParallelEmbedding(
270
- max_position_embeddings, embed_dim, process_group=process_group, **factory_kwargs
271
- )
272
-
273
- def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False):
274
- """
275
- input_ids: (batch, seqlen)
276
- position_ids: (batch, seqlen)
277
- """
278
- batch_size, seqlen = input_ids.shape
279
- world_size = torch.distributed.get_world_size(self.process_group)
280
- embeddings = self.word_embeddings(input_ids)
281
- if self.max_position_embeddings > 0:
282
- if position_ids is None:
283
- position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
284
- position_embeddings = self.position_embeddings(position_ids)
285
- if world_size <= 1:
286
- embeddings = embeddings + position_embeddings
287
- else:
288
- partition_dim = self.position_embeddings.embedding_dim
289
- rank = torch.distributed.get_rank(self.process_group)
290
- embeddings[
291
- ..., rank * partition_dim : (rank + 1) * partition_dim
292
- ] += position_embeddings
293
- if combine_batch_seqlen_dim:
294
- embeddings = rearrange(embeddings, "b s d -> (b s) d")
295
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
296
- return embeddings if world_size <= 1 else reduce_fn(embeddings, self.process_group)
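Editor's note: a minimal sketch of the usual column-then-row pairing, assuming `torch.distributed` is already initialized and `tp_group` is the tensor-parallel process group; sizes and the SiLU activation are illustrative.

```python
import torch
import torch.distributed as dist
import torch.nn.functional as F

tp_group = dist.group.WORLD  # placeholder: use your tensor-parallel group
fc1 = ColumnParallelLinear(1024, 4096, process_group=tp_group, sequence_parallel=True, device="cuda")
fc2 = RowParallelLinear(4096, 1024, process_group=tp_group, sequence_parallel=True, device="cuda")

# With sequence_parallel=True the input is sharded along the token dimension:
# fc1 all-gathers it before the matmul, fc2 reduce-scatters the result back.
x = torch.randn(512, 1024, device="cuda", dtype=fc1.weight.dtype)
y = fc2(F.silu(fc1(x)))
```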
build/torch210-cxx11-cu128-x86_64-linux/mamba_ssm/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import sys
3
-
4
- import importlib
5
- from pathlib import Path
6
- from types import ModuleType
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-x86_64-linux/metadata.json DELETED
@@ -1 +0,0 @@
1
- {"python-depends":[]}
build/torch210-cxx11-cu128-x86_64-linux/models/__init__.py DELETED
File without changes
build/torch210-cxx11-cu128-x86_64-linux/models/config_mamba.py DELETED
@@ -1,18 +0,0 @@
1
- from dataclasses import dataclass, field
2
-
3
-
4
- @dataclass
5
- class MambaConfig:
6
-
7
- d_model: int = 2560
8
- d_intermediate: int = 0
9
- n_layer: int = 64
10
- vocab_size: int = 50277
11
- ssm_cfg: dict = field(default_factory=dict)
12
- attn_layer_idx: list = field(default_factory=list)
13
- attn_cfg: dict = field(default_factory=dict)
14
- rms_norm: bool = True
15
- residual_in_fp32: bool = True
16
- fused_add_norm: bool = True
17
- pad_vocab_size_multiple: int = 8
18
- tie_embeddings: bool = True
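Editor's note: an illustrative config, following the `ssm_cfg` / `attn_layer_idx` conventions consumed by `create_block` in `models/mixer_seq_simple.py` later in this diff; the `attn_cfg` keyword is an assumption about the `MHA` module's signature.

```python
cfg = MambaConfig(
    d_model=768,
    n_layer=24,
    vocab_size=50280,
    ssm_cfg={"layer": "Mamba2"},  # "layer" is popped by create_block(); the default is "Mamba1"
    attn_layer_idx=[8, 16],       # these layer indices get an MHA mixer instead of an SSM
    attn_cfg={"num_heads": 12},   # assumed MHA keyword argument
    tie_embeddings=True,
)
```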
build/torch210-cxx11-cu128-x86_64-linux/models/mixer_seq_simple.py DELETED
@@ -1,309 +0,0 @@
1
- # Copyright (c) 2023, Albert Gu, Tri Dao.
2
-
3
- import math
4
- from functools import partial
5
- import json
6
- import os
7
- import copy
8
-
9
- from collections import namedtuple
10
-
11
- import torch
12
- import torch.nn as nn
13
-
14
- from .config_mamba import MambaConfig
15
- from ..modules.mamba_simple import Mamba
16
- from ..modules.mamba2 import Mamba2
17
- from ..modules.mha import MHA
18
- from ..modules.mlp import GatedMLP
19
- from ..modules.block import Block
20
- from ..utils.generation import GenerationMixin
21
- from ..utils.hf import load_config_hf, load_state_dict_hf
22
-
23
- try:
24
- from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
25
- except ImportError:
26
- RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
27
-
28
-
29
- def create_block(
30
- d_model,
31
- d_intermediate,
32
- ssm_cfg=None,
33
- attn_layer_idx=None,
34
- attn_cfg=None,
35
- norm_epsilon=1e-5,
36
- rms_norm=False,
37
- residual_in_fp32=False,
38
- fused_add_norm=False,
39
- layer_idx=None,
40
- device=None,
41
- dtype=None,
42
- ):
43
- if ssm_cfg is None:
44
- ssm_cfg = {}
45
- if attn_layer_idx is None:
46
- attn_layer_idx = []
47
- if attn_cfg is None:
48
- attn_cfg = {}
49
- factory_kwargs = {"device": device, "dtype": dtype}
50
- if layer_idx not in attn_layer_idx:
51
- # Create a copy of the config to modify
52
- ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
53
- ssm_layer = ssm_cfg.pop("layer", "Mamba1")
54
- if ssm_layer not in ["Mamba1", "Mamba2"]:
55
- raise ValueError(f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2")
56
- mixer_cls = partial(
57
- Mamba2 if ssm_layer == "Mamba2" else Mamba,
58
- layer_idx=layer_idx,
59
- **ssm_cfg,
60
- **factory_kwargs
61
- )
62
- else:
63
- mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
64
- norm_cls = partial(
65
- nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
66
- )
67
- if d_intermediate == 0:
68
- mlp_cls = nn.Identity
69
- else:
70
- mlp_cls = partial(
71
- GatedMLP, hidden_features=d_intermediate, out_features=d_model, **factory_kwargs
72
- )
73
- block = Block(
74
- d_model,
75
- mixer_cls,
76
- mlp_cls,
77
- norm_cls=norm_cls,
78
- fused_add_norm=fused_add_norm,
79
- residual_in_fp32=residual_in_fp32,
80
- )
81
- block.layer_idx = layer_idx
82
- return block
83
-
84
-
85
- # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
86
- def _init_weights(
87
- module,
88
- n_layer,
89
- initializer_range=0.02, # Now only used for embedding layer.
90
- rescale_prenorm_residual=True,
91
- n_residuals_per_layer=1, # Change to 2 if we have MLP
92
- ):
93
- if isinstance(module, nn.Linear):
94
- if module.bias is not None:
95
- if not getattr(module.bias, "_no_reinit", False):
96
- nn.init.zeros_(module.bias)
97
- elif isinstance(module, nn.Embedding):
98
- nn.init.normal_(module.weight, std=initializer_range)
99
-
100
- if rescale_prenorm_residual:
101
- # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
102
- # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
103
- # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
104
- # > -- GPT-2 :: https://openai.com/blog/better-language-models/
105
- #
106
- # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
107
- for name, p in module.named_parameters():
108
- if name in ["out_proj.weight", "fc2.weight"]:
109
- # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
110
- # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
111
- # We need to reinit p since this code could be called multiple times
112
- # Having just p *= scale would repeatedly scale it down
113
- nn.init.kaiming_uniform_(p, a=math.sqrt(5))
114
- with torch.no_grad():
115
- p /= math.sqrt(n_residuals_per_layer * n_layer)
116
-
117
-
118
- class MixerModel(nn.Module):
119
- def __init__(
120
- self,
121
- d_model: int,
122
- n_layer: int,
123
- d_intermediate: int,
124
- vocab_size: int,
125
- ssm_cfg=None,
126
- attn_layer_idx=None,
127
- attn_cfg=None,
128
- norm_epsilon: float = 1e-5,
129
- rms_norm: bool = False,
130
- initializer_cfg=None,
131
- fused_add_norm=False,
132
- residual_in_fp32=False,
133
- device=None,
134
- dtype=None,
135
- ) -> None:
136
- factory_kwargs = {"device": device, "dtype": dtype}
137
- super().__init__()
138
- self.residual_in_fp32 = residual_in_fp32
139
-
140
- self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
141
-
142
- # We change the order of residual and layer norm:
143
- # Instead of LN -> Attn / MLP -> Add, we do:
144
- # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
145
- # the main branch (output of MLP / Mixer). The model definition is unchanged.
146
- # This is for performance reason: we can fuse add + layer_norm.
147
- self.fused_add_norm = fused_add_norm
148
- if self.fused_add_norm:
149
- if layer_norm_fn is None or rms_norm_fn is None:
150
- raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
151
-
152
- self.layers = nn.ModuleList(
153
- [
154
- create_block(
155
- d_model,
156
- d_intermediate=d_intermediate,
157
- ssm_cfg=ssm_cfg,
158
- attn_layer_idx=attn_layer_idx,
159
- attn_cfg=attn_cfg,
160
- norm_epsilon=norm_epsilon,
161
- rms_norm=rms_norm,
162
- residual_in_fp32=residual_in_fp32,
163
- fused_add_norm=fused_add_norm,
164
- layer_idx=i,
165
- **factory_kwargs,
166
- )
167
- for i in range(n_layer)
168
- ]
169
- )
170
-
171
- self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
172
- d_model, eps=norm_epsilon, **factory_kwargs
173
- )
174
-
175
- self.apply(
176
- partial(
177
- _init_weights,
178
- n_layer=n_layer,
179
- **(initializer_cfg if initializer_cfg is not None else {}),
180
- n_residuals_per_layer=1 if d_intermediate == 0 else 2, # 2 if we have MLP
181
- )
182
- )
183
-
184
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
185
- return {
186
- i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
187
- for i, layer in enumerate(self.layers)
188
- }
189
-
190
- def forward(self, input_ids, inference_params=None, **mixer_kwargs):
191
- hidden_states = self.embedding(input_ids)
192
- residual = None
193
- for layer in self.layers:
194
- hidden_states, residual = layer(
195
- hidden_states, residual, inference_params=inference_params, **mixer_kwargs
196
- )
197
- if not self.fused_add_norm:
198
- residual = (hidden_states + residual) if residual is not None else hidden_states
199
- hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
200
- else:
201
- # Set prenorm=False here since we don't need the residual
202
- hidden_states = layer_norm_fn(
203
- hidden_states,
204
- self.norm_f.weight,
205
- self.norm_f.bias,
206
- eps=self.norm_f.eps,
207
- residual=residual,
208
- prenorm=False,
209
- residual_in_fp32=self.residual_in_fp32,
210
- is_rms_norm=isinstance(self.norm_f, RMSNorm)
211
- )
212
- return hidden_states
213
-
214
-
215
- class MambaLMHeadModel(nn.Module, GenerationMixin):
216
-
217
- def __init__(
218
- self,
219
- config: MambaConfig,
220
- initializer_cfg=None,
221
- device=None,
222
- dtype=None,
223
- ) -> None:
224
- self.config = config
225
- d_model = config.d_model
226
- n_layer = config.n_layer
227
- d_intermediate = config.d_intermediate
228
- vocab_size = config.vocab_size
229
- ssm_cfg = config.ssm_cfg
230
- attn_layer_idx = config.attn_layer_idx
231
- attn_cfg = config.attn_cfg
232
- rms_norm = config.rms_norm
233
- residual_in_fp32 = config.residual_in_fp32
234
- fused_add_norm = config.fused_add_norm
235
- pad_vocab_size_multiple = config.pad_vocab_size_multiple
236
- factory_kwargs = {"device": device, "dtype": dtype}
237
-
238
- super().__init__()
239
- if vocab_size % pad_vocab_size_multiple != 0:
240
- vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple)
241
- self.backbone = MixerModel(
242
- d_model=d_model,
243
- n_layer=n_layer,
244
- d_intermediate=d_intermediate,
245
- vocab_size=vocab_size,
246
- ssm_cfg=ssm_cfg,
247
- attn_layer_idx=attn_layer_idx,
248
- attn_cfg=attn_cfg,
249
- rms_norm=rms_norm,
250
- initializer_cfg=initializer_cfg,
251
- fused_add_norm=fused_add_norm,
252
- residual_in_fp32=residual_in_fp32,
253
- **factory_kwargs,
254
- )
255
- self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
256
-
257
- # Initialize weights and apply final processing
258
- self.apply(
259
- partial(
260
- _init_weights,
261
- n_layer=n_layer,
262
- **(initializer_cfg if initializer_cfg is not None else {}),
263
- )
264
- )
265
- self.tie_weights()
266
-
267
- def tie_weights(self):
268
- if self.config.tie_embeddings:
269
- self.lm_head.weight = self.backbone.embedding.weight
270
-
271
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
272
- return self.backbone.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
273
-
274
- def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0, **mixer_kwargs):
275
- """
276
- "position_ids" is just to be compatible with Transformer generation. We don't use it.
277
- num_last_tokens: if > 0, only return the logits for the last n tokens
278
- """
279
- hidden_states = self.backbone(input_ids, inference_params=inference_params, **mixer_kwargs)
280
- if num_last_tokens > 0:
281
- hidden_states = hidden_states[:, -num_last_tokens:]
282
- lm_logits = self.lm_head(hidden_states)
283
- CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
284
- return CausalLMOutput(logits=lm_logits)
285
-
286
- @classmethod
287
- def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
288
- config_data = load_config_hf(pretrained_model_name)
289
- config = MambaConfig(**config_data)
290
- model = cls(config, device=device, dtype=dtype, **kwargs)
291
- model.load_state_dict(load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype))
292
- return model
293
-
294
- def save_pretrained(self, save_directory):
295
- """
296
- Minimal implementation of save_pretrained for MambaLMHeadModel.
297
- Save the model and its configuration file to a directory.
298
- """
299
- # Ensure save_directory exists
300
- os.makedirs(save_directory, exist_ok=True)
301
-
302
- # Save the model's state_dict
303
- model_path = os.path.join(save_directory, 'pytorch_model.bin')
304
- torch.save(self.state_dict(), model_path)
305
-
306
- # Save the configuration of the model
307
- config_path = os.path.join(save_directory, 'config.json')
308
- with open(config_path, 'w') as f:
309
- json.dump(self.config.__dict__, f, indent=4)
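Editor's note: a tiny end-to-end sketch of the model defined above, with random weights and toy sizes; `num_last_tokens=1` mirrors how the decoding loop in `utils/generation.py` requests only the final position's logits.

```python
import torch

config = MambaConfig(d_model=256, n_layer=4, vocab_size=1000)
model = MambaLMHeadModel(config, device="cuda", dtype=torch.float16)

tokens = torch.randint(0, 1000, (2, 16), device="cuda")
logits = model(tokens, num_last_tokens=1).logits  # (batch=2, 1, vocab_size)
print(logits.shape)
```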
build/torch210-cxx11-cu128-x86_64-linux/modules/__init__.py DELETED
File without changes
build/torch210-cxx11-cu128-x86_64-linux/modules/block.py DELETED
@@ -1,107 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
- from typing import Optional
3
-
4
- import torch
5
- from torch import nn, Tensor
6
-
7
- from ..ops.triton.layer_norm import RMSNorm, layer_norm_fn
8
-
9
-
10
- class Block(nn.Module):
11
- def __init__(
12
- self,
13
- dim,
14
- mixer_cls,
15
- mlp_cls,
16
- norm_cls=nn.LayerNorm,
17
- fused_add_norm=False,
18
- residual_in_fp32=False,
19
- ):
20
- """
21
- Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection.
22
-
23
- This Block has a slightly different structure compared to a regular
24
- prenorm Transformer block.
25
- The standard block is: LN -> MHA/MLP -> Add.
26
- [Ref: https://arxiv.org/abs/2002.04745]
27
- Here we have: Add -> LN -> Mixer, returning both
28
- the hidden_states (output of the mixer) and the residual.
29
- This is purely for performance reasons, as we can fuse add and LayerNorm.
30
- The residual needs to be provided (except for the very first block).
31
- """
32
- super().__init__()
33
- self.residual_in_fp32 = residual_in_fp32
34
- self.fused_add_norm = fused_add_norm
35
- self.norm = norm_cls(dim)
36
- self.mixer = mixer_cls(dim)
37
- if mlp_cls is not nn.Identity:
38
- self.norm2 = norm_cls(dim)
39
- self.mlp = mlp_cls(dim)
40
- else:
41
- self.mlp = None
42
- if self.fused_add_norm:
43
- assert RMSNorm is not None, "RMSNorm import fails"
44
- assert isinstance(
45
- self.norm, (nn.LayerNorm, RMSNorm)
46
- ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
47
-
48
- def forward(
49
- self,
50
- hidden_states: Tensor,
51
- residual: Optional[Tensor] = None,
52
- inference_params=None,
53
- **mixer_kwargs
54
- ):
55
- r"""Pass the input through the encoder layer.
56
-
57
- Args:
58
- hidden_states: the sequence to the encoder layer (required).
59
- residual: hidden_states = Mixer(LN(residual))
60
- """
61
- if not self.fused_add_norm:
62
- residual = (
63
- (hidden_states + residual) if residual is not None else hidden_states
64
- )
65
- hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
66
- if self.residual_in_fp32:
67
- residual = residual.to(torch.float32)
68
- else:
69
- hidden_states, residual = layer_norm_fn(
70
- hidden_states,
71
- self.norm.weight,
72
- self.norm.bias,
73
- residual=residual,
74
- prenorm=True,
75
- residual_in_fp32=self.residual_in_fp32,
76
- eps=self.norm.eps,
77
- is_rms_norm=isinstance(self.norm, RMSNorm),
78
- )
79
- hidden_states = self.mixer(
80
- hidden_states, inference_params=inference_params, **mixer_kwargs
81
- )
82
-
83
- if self.mlp is not None:
84
- if not self.fused_add_norm:
85
- residual = hidden_states + residual
86
- hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
87
- if self.residual_in_fp32:
88
- residual = residual.to(torch.float32)
89
- else:
90
- hidden_states, residual = layer_norm_fn(
91
- hidden_states,
92
- self.norm2.weight,
93
- self.norm2.bias,
94
- residual=residual,
95
- prenorm=True,
96
- residual_in_fp32=self.residual_in_fp32,
97
- eps=self.norm2.eps,
98
- is_rms_norm=isinstance(self.norm2, RMSNorm),
99
- )
100
- hidden_states = self.mlp(hidden_states)
101
-
102
- return hidden_states, residual
103
-
104
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
105
- return self.mixer.allocate_inference_cache(
106
- batch_size, max_seqlen, dtype=dtype, **kwargs
107
- )
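Editor's note: a schematic of how the Add -> LN -> Mixer restructuring reads at the call site, assuming `blocks` is an `nn.ModuleList` of the `Block` instances above and `embeddings` are the token embeddings.

```python
hidden_states, residual = embeddings, None  # residual is None for the very first block
for block in blocks:
    hidden_states, residual = block(hidden_states, residual)
# The caller then applies the final (optionally fused) add + norm, as MixerModel.forward does.
```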
build/torch210-cxx11-cu128-x86_64-linux/modules/mamba2.py DELETED
@@ -1,502 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao, Albert Gu.
2
-
3
- import math
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
-
9
- from einops import rearrange, repeat
10
-
11
- try:
12
- from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
13
- except ImportError:
14
- causal_conv1d_fn, causal_conv1d_update = None, None
15
-
16
- try:
17
- from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states
18
- except ImportError:
19
- causal_conv1d_varlen_states = None
20
-
21
- try:
22
- from ..ops.triton.selective_state_update import selective_state_update
23
- except ImportError:
24
- selective_state_update = None
25
-
26
- from ..ops.triton.layernorm_gated import RMSNorm as RMSNormGated
27
-
28
- from ..distributed.tensor_parallel import ColumnParallelLinear, RowParallelLinear
29
- from ..distributed.distributed_utils import all_reduce, reduce_scatter
30
-
31
- from ..ops.triton.ssd_combined import mamba_chunk_scan_combined
32
- from ..ops.triton.ssd_combined import mamba_split_conv1d_scan_combined
33
-
34
- from huggingface_hub import PyTorchModelHubMixin
35
-
36
-
37
- class Mamba2(nn.Module, PyTorchModelHubMixin):
38
- def __init__(
39
- self,
40
- d_model,
41
- d_state=128,
42
- d_conv=4,
43
- conv_init=None,
44
- expand=2,
45
- headdim=64,
46
- d_ssm=None, # If not None, we only apply SSM on this many dimensions, the rest uses gated MLP
47
- ngroups=1,
48
- A_init_range=(1, 16),
49
- D_has_hdim=False,
50
- rmsnorm=True,
51
- norm_before_gate=False,
52
- dt_min=0.001,
53
- dt_max=0.1,
54
- dt_init_floor=1e-4,
55
- dt_limit=(0.0, float("inf")),
56
- bias=False,
57
- conv_bias=True,
58
- # Fused kernel and sharding options
59
- chunk_size=256,
60
- use_mem_eff_path=True,
61
- layer_idx=None, # Absorb kwarg for general module
62
- process_group=None,
63
- sequence_parallel=True,
64
- device=None,
65
- dtype=None,
66
- ):
67
- factory_kwargs = {"device": device, "dtype": dtype}
68
- super().__init__()
69
- self.d_model = d_model
70
- self.d_state = d_state
71
- self.d_conv = d_conv
72
- self.conv_init = conv_init
73
- self.expand = expand
74
- self.process_group = process_group
75
- self.sequence_parallel = sequence_parallel
76
- self.world_size = 1 if process_group is None else process_group.size()
77
- self.local_rank = 0 if process_group is None else process_group.rank()
78
- self.d_inner = (self.expand * self.d_model) // self.world_size
79
- assert self.d_inner * self.world_size == self.expand * self.d_model
80
- self.headdim = headdim
81
- self.d_ssm = self.d_inner if d_ssm is None else d_ssm // self.world_size
82
- assert ngroups % self.world_size == 0
83
- self.ngroups = ngroups // self.world_size
84
- assert self.d_ssm % self.headdim == 0
85
- self.nheads = self.d_ssm // self.headdim
86
- self.D_has_hdim = D_has_hdim
87
- self.rmsnorm = rmsnorm
88
- self.norm_before_gate = norm_before_gate
89
- self.dt_limit = dt_limit
90
- self.activation = "silu"
91
- self.chunk_size = chunk_size
92
- self.use_mem_eff_path = use_mem_eff_path
93
- self.layer_idx = layer_idx
94
-
95
- # Order: [z, x, B, C, dt]
96
- d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
97
- if self.process_group is None:
98
- self.in_proj = nn.Linear(
99
- self.d_model, d_in_proj, bias=bias, **factory_kwargs
100
- )
101
- else:
102
- self.in_proj = ColumnParallelLinear(
103
- self.d_model,
104
- d_in_proj * self.world_size,
105
- bias=bias,
106
- process_group=self.process_group,
107
- sequence_parallel=self.sequence_parallel,
108
- **factory_kwargs,
109
- )
110
-
111
- conv_dim = self.d_ssm + 2 * self.ngroups * self.d_state
112
- self.conv1d = nn.Conv1d(
113
- in_channels=conv_dim,
114
- out_channels=conv_dim,
115
- bias=conv_bias,
116
- kernel_size=d_conv,
117
- groups=conv_dim,
118
- padding=d_conv - 1,
119
- **factory_kwargs,
120
- )
121
- if self.conv_init is not None:
122
- nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
123
-
124
- self.act = nn.SiLU()
125
-
126
- # Initialize log dt bias
127
- dt = torch.exp(
128
- torch.rand(self.nheads, **factory_kwargs)
129
- * (math.log(dt_max) - math.log(dt_min))
130
- + math.log(dt_min)
131
- )
132
- dt = torch.clamp(dt, min=dt_init_floor)
133
- # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
134
- inv_dt = dt + torch.log(-torch.expm1(-dt))
135
- self.dt_bias = nn.Parameter(inv_dt)
136
- # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
137
- # name.endswith("bias") in param_grouping.py
138
- self.dt_bias._no_weight_decay = True
139
-
140
- assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
141
- A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(
142
- *A_init_range
143
- )
144
- A_log = torch.log(A).to(dtype=dtype)
145
- self.A_log = nn.Parameter(A_log)
146
- self.A_log._no_weight_decay = True
147
-
148
- # D "skip" parameter
149
- self.D = nn.Parameter(
150
- torch.ones(self.d_ssm if self.D_has_hdim else self.nheads, device=device)
151
- )
152
- self.D._no_weight_decay = True
153
-
154
- if self.rmsnorm:
155
- assert RMSNormGated is not None
156
- self.norm = RMSNormGated(
157
- self.d_ssm,
158
- eps=1e-5,
159
- norm_before_gate=self.norm_before_gate,
160
- group_size=self.d_ssm // ngroups,
161
- **factory_kwargs,
162
- )
163
-
164
- if self.process_group is None:
165
- self.out_proj = nn.Linear(
166
- self.d_inner, self.d_model, bias=bias, **factory_kwargs
167
- )
168
- else:
169
- self.out_proj = RowParallelLinear(
170
- self.d_inner * self.world_size,
171
- self.d_model,
172
- bias=bias,
173
- process_group=self.process_group,
174
- sequence_parallel=self.sequence_parallel,
175
- **factory_kwargs,
176
- )
177
-
178
-    def forward(
-        self, u, seqlen=None, seq_idx=None, cu_seqlens=None, inference_params=None
-    ):
-        """
-        u: (batch, seqlen, hidden_dim) if seqlen=None.
-            If seqlen is not None, u is (batch * seqlen, hidden_dim). This is so that when we
-            split u during sequence parallel, we split the batch * seqlen dimension
-            (in case batch is small).
-        Returns: same shape as u
-        """
-        seqlen_og = seqlen
-        if seqlen is None:
-            batch, seqlen, dim = u.shape
-        else:
-            batch_seqlen, dim = u.shape
-            batch = batch_seqlen // seqlen
-
-        conv_state, ssm_state = None, None
-        if inference_params is not None:
-            inference_batch = (
-                cu_seqlens.shape[0] - 1 if cu_seqlens is not None else batch
-            )
-            conv_state, ssm_state = self._get_states_from_cache(
-                inference_params, inference_batch
-            )
-            if inference_params.seqlen_offset > 0:
-                # The states are updated inplace
-                out, _, _ = self.step(u, conv_state, ssm_state)
-                return out
-
-        zxbcdt = self.in_proj(u)  # (B, L, d_in_proj) or (B * L, d_in_proj)
-        if seqlen_og is not None:
-            zxbcdt = rearrange(zxbcdt, "(b l) d -> b l d", l=seqlen)
-        # If the model is loaded in fp16, without the .float() here, A might be -inf
-        A = -torch.exp(self.A_log.float())  # (nheads) or (d_inner, d_state)
-        dt_limit_kwargs = (
-            {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
-        )
-        if self.use_mem_eff_path and inference_params is None:
-            out = mamba_split_conv1d_scan_combined(
-                zxbcdt,
-                rearrange(self.conv1d.weight, "d 1 w -> d w"),
-                self.conv1d.bias,
-                self.dt_bias,
-                A,
-                D=(
-                    rearrange(self.D, "(h p) -> h p", p=self.headdim)
-                    if self.D_has_hdim
-                    else self.D
-                ),
-                chunk_size=self.chunk_size,
-                seq_idx=seq_idx,
-                activation=self.activation,
-                rmsnorm_weight=self.norm.weight if self.rmsnorm else None,
-                rmsnorm_eps=self.norm.eps if self.rmsnorm else 1e-6,
-                outproj_weight=self.out_proj.weight,
-                outproj_bias=self.out_proj.bias,
-                headdim=None if self.D_has_hdim else self.headdim,
-                ngroups=self.ngroups,
-                norm_before_gate=self.norm_before_gate,
-                **dt_limit_kwargs,
-            )
-            if seqlen_og is not None:
-                out = rearrange(out, "b l d -> (b l) d")
-            if self.process_group is not None:
-                reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
-                out = reduce_fn(out, self.process_group)
-        else:
-            d_mlp = (
-                zxbcdt.shape[-1]
-                - 2 * self.d_ssm
-                - 2 * self.ngroups * self.d_state
-                - self.nheads
-            ) // 2
-            z0, x0, z, xBC, dt = torch.split(
-                zxbcdt,
-                [
-                    d_mlp,
-                    d_mlp,
-                    self.d_ssm,
-                    self.d_ssm + 2 * self.ngroups * self.d_state,
-                    self.nheads,
-                ],
-                dim=-1,
-            )
-            if conv_state is not None:
-                if cu_seqlens is None:
-                    # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
-                    # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
-                    xBC_t = rearrange(xBC, "b l d -> b d l")
-                    conv_state.copy_(
-                        F.pad(xBC_t, (self.d_conv - xBC_t.shape[-1], 0))
-                    )  # Update state (B D W)
-                else:
-                    assert (
-                        causal_conv1d_varlen_states is not None
-                    ), "varlen inference requires causal_conv1d package"
-                    assert (
-                        batch == 1
-                    ), "varlen inference only supports batch dimension 1"
-                    conv_varlen_states = causal_conv1d_varlen_states(
-                        xBC.squeeze(0), cu_seqlens, state_len=conv_state.shape[-1]
-                    )
-                    conv_state.copy_(conv_varlen_states)
-            assert self.activation in ["silu", "swish"]
-            if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
-                assert (
-                    seq_idx is None
-                ), "varlen conv1d requires the causal_conv1d package"
-                xBC = self.act(
-                    self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)[
-                        :, : -(self.d_conv - 1)
-                    ]
-                )  # (B, L, self.d_ssm + 2 * ngroups * d_state)
-            else:
-                xBC = causal_conv1d_fn(
-                    xBC.transpose(1, 2),
-                    rearrange(self.conv1d.weight, "d 1 w -> d w"),
-                    bias=self.conv1d.bias,
-                    activation=self.activation,
-                    seq_idx=seq_idx,
-                ).transpose(1, 2)
-            x, B, C = torch.split(
-                xBC,
-                [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state],
-                dim=-1,
-            )
-            y = mamba_chunk_scan_combined(
-                rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
-                dt,
-                A,
-                rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
-                rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
-                chunk_size=self.chunk_size,
-                D=(
-                    rearrange(self.D, "(h p) -> h p", p=self.headdim)
-                    if self.D_has_hdim
-                    else self.D
-                ),
-                z=(
-                    rearrange(z, "b l (h p) -> b l h p", p=self.headdim)
-                    if not self.rmsnorm
-                    else None
-                ),
-                dt_bias=self.dt_bias,
-                dt_softplus=True,
-                seq_idx=seq_idx,
-                cu_seqlens=cu_seqlens,
-                **dt_limit_kwargs,
-                return_final_states=ssm_state is not None,
-                return_varlen_states=cu_seqlens is not None
-                and inference_params is not None,
-            )
-            if ssm_state is not None:
-                y, last_state, *rest = y
-                if cu_seqlens is None:
-                    ssm_state.copy_(last_state)
-                else:
-                    varlen_states = rest[0]
-                    ssm_state.copy_(varlen_states)
-            y = rearrange(y, "b l h p -> b l (h p)")
-            if self.rmsnorm:
-                y = self.norm(y, z)
-            if d_mlp > 0:
-                y = torch.cat([F.silu(z0) * x0, y], dim=-1)
-            if seqlen_og is not None:
-                y = rearrange(y, "b l d -> (b l) d")
-            out = self.out_proj(y)
-        return out
-
-    def step(self, hidden_states, conv_state, ssm_state):
-        dtype = hidden_states.dtype
-        assert (
-            hidden_states.shape[1] == 1
-        ), "Only support decoding with 1 token at a time for now"
-        zxbcdt = self.in_proj(hidden_states.squeeze(1))  # (B 2D)
-        d_mlp = (
-            zxbcdt.shape[-1]
-            - 2 * self.d_ssm
-            - 2 * self.ngroups * self.d_state
-            - self.nheads
-        ) // 2
-        z0, x0, z, xBC, dt = torch.split(
-            zxbcdt,
-            [
-                d_mlp,
-                d_mlp,
-                self.d_ssm,
-                self.d_ssm + 2 * self.ngroups * self.d_state,
-                self.nheads,
-            ],
-            dim=-1,
-        )
-
-        # Conv step
-        if causal_conv1d_update is None:
-            conv_state.copy_(
-                torch.roll(conv_state, shifts=-1, dims=-1)
-            )  # Update state (B D W)
-            conv_state[:, :, -1] = xBC
-            xBC = torch.sum(
-                conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1
-            )  # (B D)
-            if self.conv1d.bias is not None:
-                xBC = xBC + self.conv1d.bias
-            xBC = self.act(xBC).to(dtype=dtype)
-        else:
-            xBC = causal_conv1d_update(
-                xBC,
-                conv_state,
-                rearrange(self.conv1d.weight, "d 1 w -> d w"),
-                self.conv1d.bias,
-                self.activation,
-            )
-
-        x, B, C = torch.split(
-            xBC,
-            [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state],
-            dim=-1,
-        )
-        A = -torch.exp(self.A_log.float())  # (nheads,)
-
-        # SSM step
-        if selective_state_update is None:
-            assert (
-                self.ngroups == 1
-            ), "Only support ngroups=1 for this inference code path"
-            # Discretize A and B
-            dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype))  # (batch, nheads)
-            dA = torch.exp(dt * A)  # (batch, nheads)
-            x = rearrange(x, "b (h p) -> b h p", p=self.headdim)
-            dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)
-            ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx)
-            y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C)
-            y = y + rearrange(self.D.to(dtype), "h -> h 1") * x
-            y = rearrange(y, "b h p -> b (h p)")
-            if not self.rmsnorm:
-                y = y * self.act(z)  # (B D)
-        else:
-            A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(
-                dtype=torch.float32
-            )
-            dt = repeat(dt, "b h -> b h p", p=self.headdim)
-            dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim)
-            D = repeat(self.D, "h -> h p", p=self.headdim)
-            B = rearrange(B, "b (g n) -> b g n", g=self.ngroups)
-            C = rearrange(C, "b (g n) -> b g n", g=self.ngroups)
-            x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim)
-            if not self.rmsnorm:
-                z = rearrange(z, "b (h p) -> b h p", p=self.headdim)
-            y = selective_state_update(
-                ssm_state,
-                x_reshaped,
-                dt,
-                A,
-                B,
-                C,
-                D,
-                z=z if not self.rmsnorm else None,
-                dt_bias=dt_bias,
-                dt_softplus=True,
-            )
-            y = rearrange(y, "b h p -> b (h p)")
-        if self.rmsnorm:
-            y = self.norm(y, z)
-        if d_mlp > 0:
-            y = torch.cat([F.silu(z0) * x0, y], dim=-1)
-        out = self.out_proj(y)
-        return out.unsqueeze(1), conv_state, ssm_state
-
-    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
-        device = self.out_proj.weight.device
-        conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
-        conv_state = torch.zeros(
-            batch_size,
-            self.d_conv,
-            self.conv1d.weight.shape[0],
-            device=device,
-            dtype=conv_dtype,
-        ).transpose(1, 2)
-        ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype
-        ssm_state = torch.zeros(
-            batch_size,
-            self.nheads,
-            self.headdim,
-            self.d_state,
-            device=device,
-            dtype=ssm_dtype,
-        )
-        return conv_state, ssm_state
-
-    def _get_states_from_cache(
-        self, inference_params, batch_size, initialize_states=False
-    ):
-        assert self.layer_idx is not None
-        if self.layer_idx not in inference_params.key_value_memory_dict:
-            batch_shape = (batch_size,)
-            conv_state = torch.zeros(
-                batch_size,
-                self.d_conv,
-                self.conv1d.weight.shape[0],
-                device=self.conv1d.weight.device,
-                dtype=self.conv1d.weight.dtype,
-            ).transpose(1, 2)
-            ssm_state = torch.zeros(
-                batch_size,
-                self.nheads,
-                self.headdim,
-                self.d_state,
-                device=self.in_proj.weight.device,
-                dtype=self.in_proj.weight.dtype,
-            )
-            inference_params.key_value_memory_dict[self.layer_idx] = (
-                conv_state,
-                ssm_state,
-            )
-        else:
-            conv_state, ssm_state = inference_params.key_value_memory_dict[
-                self.layer_idx
-            ]
-            # TODO: What if batch size changes between generation, and we reuse the same states?
-            if initialize_states:
-                conv_state.zero_()
-                ssm_state.zero_()
-        return conv_state, ssm_state