yagizdevre commited on
Commit
b9b3e2d
·
1 Parent(s): b31e1c7

configs added

Browse files
__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .configuration_minimamba import MiniMambaConfig
2
+ from .modeling_minimamba import MiniMamba
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<|endofprompt|>": 200018
3
+ }
attn.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ try:
9
+ from flash_attn import flash_attn_func
10
+ except ImportError as e:
11
+ print(
12
+ f"Unable to import Triton-based flash attention: {e}. No alternative currently available."
13
+ )
14
+
15
+
16
def nearest_power_of_two(x: int, round_up: bool = False) -> int:
    """Return the power of two nearest to x: floor by default, ceil if round_up."""
    exponent = math.ceil(math.log2(x)) if round_up else math.floor(math.log2(x))
    return 1 << exponent
20
+
21
+ def _generate_slopes(self, n: int):
22
+ start = 2 ** (-(2 ** -(math.log2(n) - 3)))
23
+ return [start * (start**i) for i in range(n)]
24
+
25
+ def _get_alibi_slopes(self, n_heads: int, interpolation_factor: float = 0.25):
26
+ # If n_heads is a power of 2, generate slopes directly
27
+ if math.log2(n_heads).is_integer():
28
+ slopes = self._generate_slopes(n_heads)
29
+ else:
30
+ # Get slopes for the nearest power of two
31
+ n = nearest_power_of_two(n_heads, round_up=False)
32
+ slopes_power_of_two = self._generate_slopes(n)
33
+
34
+ # Generate extra slopes
35
+ extra_slopes = self._generate_slopes(2 * n)
36
+ extra_slopes_trunc = extra_slopes[0::2][: n_heads - n]
37
+ slopes = slopes_power_of_two + extra_slopes_trunc
38
+ slopes = torch.tensor(slopes, device=self.device)
39
+ slopes = slopes * interpolation_factor # https://arxiv.org/pdf/2310.13017
40
+ return slopes
41
+
42
+
43
def precompute_freqs_cis(head_dim: int, max_seq_len: int, theta: float = 10000.0):
    """Precompute the complex rotary-embedding table, shape [max_seq_len, head_dim // 2]."""
    # Inverse frequency for each even dimension index.
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))

    # Angle for every (position, frequency) pair.
    positions = torch.arange(max_seq_len, dtype=torch.float32)
    angles = torch.outer(positions, inv_freq)

    # Unit-magnitude complex exponentials e^{i * angle}.
    return torch.polar(torch.ones_like(angles), angles)
58
+
59
+
60
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """
    Slice freqs_cis ([max_seq_len, half_dim]) down to x's sequence length and
    view it as [1, 1, seq_len, half_dim] so it broadcasts against
    x of shape [B, n_heads, seq_len, head_dim_as_complex].
    """
    seq_len = x.size(2)
    return freqs_cis[:seq_len].view(1, 1, seq_len, -1)
69
+
70
+
71
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Rotate query/key tensors by the precomputed complex frequencies (RoPE)."""
    # Pair up the last dimension and reinterpret as complex:
    # [B, n_heads, seq_len, head_dim] -> [B, n_heads, seq_len, head_dim // 2] complex.
    q_complex = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    k_complex = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))

    # Broadcast the frequency table over batch and heads, then rotate by
    # complex multiplication.
    rotation = reshape_for_broadcast(freqs_cis, q_complex)
    q_rotated = torch.view_as_real(q_complex * rotation).reshape(*xq.shape)
    k_rotated = torch.view_as_real(k_complex * rotation).reshape(*xk.shape)

    # Restore the callers' dtypes.
    return q_rotated.type_as(xq), k_rotated.type_as(xk)
92
+
93
+
94
class Attention(nn.Module):
    """Multi-head causal attention via flash-attn, positioned with RoPE or ALiBi.

    Exactly one positional scheme is active: when `config.use_alibi` is set,
    ALiBi slopes are handed to flash-attn; otherwise rotary embeddings are
    applied to q/k before attention.
    """

    def __init__(self, config):
        super(Attention, self).__init__()
        self.dim, self.num_heads = config.dim, config.num_heads
        assert config.dim % config.num_heads == 0, f"dim ({self.dim}) must be divisible num_heads ({self.num_heads})"
        self.head_dim = config.dim // config.num_heads

        # Fused QKV projection and output projection.
        self.c_attn = nn.Linear(self.dim, 3 * self.dim, bias=config.bias)
        self.c_proj = nn.Linear(config.dim, config.dim, bias=config.bias)
        self.c_proj.SCALE_INIT = 1

        self.alibi_slopes = self._get_alibi_slopes(self.num_heads) if config.use_alibi else None
        self.window_size = config.window_size
        self.softcap = config.softcap

        self.dropout = config.dropout
        self.resid_dropout = nn.Dropout(self.dropout)

    def _generate_slopes(self, n: int):
        """Geometric ALiBi slope sequence for n heads (n assumed a power of two)."""
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        return [start * (start**i) for i in range(n)]

    def _get_alibi_slopes(self, num_heads: int, interpolation_factor: float = 0.25):
        """Per-head ALiBi slopes, interpolated per https://arxiv.org/pdf/2310.13017."""
        # If num_heads is a power of 2, generate slopes directly.
        if math.log2(num_heads).is_integer():
            slopes = self._generate_slopes(num_heads)
        else:
            # Slopes for the nearest power of two, plus extra slopes at twice
            # that resolution to cover the remaining heads.
            n = nearest_power_of_two(num_heads, round_up=False)
            slopes_power_of_two = self._generate_slopes(n)
            extra_slopes = self._generate_slopes(2 * n)
            extra_slopes_trunc = extra_slopes[0::2][: num_heads - n]
            slopes = slopes_power_of_two + extra_slopes_trunc
        # Bug fix: this tensor was previously created directly on
        # torch.device("cuda"), which broke CPU-only construction. It is now
        # created on the default device and moved next to q in forward().
        slopes = torch.tensor(slopes)
        slopes = slopes * interpolation_factor  # https://arxiv.org/pdf/2310.13017
        return slopes

    def forward(
        self,
        x: torch.Tensor = None,
        q: torch.Tensor = None,
        k: torch.Tensor = None,
        v: torch.Tensor = None,
        freqs_cis: torch.Tensor = None,
    ) -> torch.Tensor:
        """Self-attention over x; returns a tensor of shape [bsz, q_len, dim].

        Raises:
            ValueError: if neither x nor all of q/k/v are provided.
            NotImplementedError: if distinct q/k/v are given without x — the
                fused c_attn projection only supports a single input stream.
        """
        if x is not None:
            q = k = v = x
        if any(t is None for t in [q, k, v]):
            raise ValueError("Must provide either x for self-attention or q/k/v for cross-attention.")
        if x is None:
            # Bug fix: the fused projection below consumed `x` even on the
            # q/k/v path and crashed on x=None. Fail loudly instead of
            # silently computing self-attention on the wrong tensor.
            raise NotImplementedError(
                "Cross-attention with distinct q/k/v is not supported by the fused c_attn projection."
            )

        bsz, q_len, dim = q.shape
        _, k_len, _ = k.shape
        _, v_len, _ = v.shape

        qkv = self.c_attn(x)
        q, k, v = torch.chunk(qkv, 3, dim=2)

        # flash-attn expects [bsz, seq_len, num_heads, head_dim].
        q = q.view(bsz, q_len, self.num_heads, self.head_dim)
        k = k.view(bsz, k_len, self.num_heads, self.head_dim)
        v = v.view(bsz, v_len, self.num_heads, self.head_dim)

        if self.alibi_slopes is None:  # Use either ALiBi or RoPE
            q, k = apply_rotary_emb(q, k, freqs_cis=freqs_cis)

        # Move slopes to the input's device lazily (see _get_alibi_slopes).
        alibi_slopes = self.alibi_slopes.to(q.device) if self.alibi_slopes is not None else None
        y = flash_attn_func(  # https://arxiv.org/pdf/2307.08691
            q=q, k=k, v=v,
            dropout_p=self.dropout if self.training else 0.0,
            causal=True,
            window_size=(self.window_size, 0),  # Set to config.seq_len if full attention
            alibi_slopes=alibi_slopes,  # https://arxiv.org/pdf/2108.12409
            softcap=self.softcap,  # https://arxiv.org/pdf/2408.00118
        )

        y = y.contiguous().view(bsz, q_len, -1)
        y = self.resid_dropout(self.c_proj(y))
        return y
172
+
173
+
174
class MLP(nn.Module):
    """Gated feed-forward block (GELU-gated; https://arxiv.org/pdf/2002.05202)."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.dim
        self.intermediate_size = config.dim * config.mlp_scale
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        # Gate branch (tanh-approximated GELU) modulates the up-projection.
        activated_gate = F.gelu(self.gate_proj(x), approximate="tanh")
        fused = activated_gate * self.up_proj(x)
        # Project back down, then apply dropout.
        return self.dropout(self.down_proj(fused))
193
+
194
+
195
class AttentionLayer(nn.Module):
    """Pre-norm transformer block: attention then gated MLP, each with a residual."""

    def __init__(self, config) -> None:
        super(AttentionLayer, self).__init__()
        self.attn_norm = nn.RMSNorm(config.dim)
        self.attn = Attention(config=config)
        self.mlp_norm = nn.RMSNorm(config.dim)
        self.mlp = MLP(config)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor = None) -> torch.Tensor:
        # Residual around attention, then around the MLP (pre-norm ordering).
        attn_out = self.attn(x=self.attn_norm(x), freqs_cis=freqs_cis)
        x = x + attn_out
        mlp_out = self.mlp(self.mlp_norm(x))
        return x + mlp_out
casual_conv1d_compilable.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+ import causal_conv1d_cuda
4
+
5
# Causal Conv1D forward: registered as a torch custom op so it traces and
# compiles cleanly while dispatching to the CUDA kernel.
@torch.library.custom_op(
    "mamba_causal_conv1d::causal_conv1d_fwd",
    mutates_args=(),
    device_types="cuda",
)
def causal_conv1d_fwd(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    seq_idx: Optional[torch.Tensor] = None,
    activation: Optional[str] = None,
) -> torch.Tensor:
    """Run the causal depthwise conv1d CUDA kernel on x, optionally with SiLU."""
    if activation not in [None, "silu", "swish"]:
        raise NotImplementedError("activation must be None, silu, or swish")

    # The kernel needs x contiguous along at least one of the last two dims.
    if x.stride(2) != 1 and x.stride(1) != 1:
        x = x.contiguous()

    # Optional tensors must also be contiguous for the kernel.
    bias = bias.contiguous() if bias is not None else None
    seq_idx = seq_idx.contiguous() if seq_idx is not None else None

    # The CUDA kernel takes a boolean activation flag.
    apply_silu = activation in ["silu", "swish"]

    return causal_conv1d_cuda.causal_conv1d_fwd(
        x, weight, bias, seq_idx, None, None, apply_silu
    )
38
+
39
# FakeTensor implementation used during tracing/compile: shape propagation only.
@causal_conv1d_fwd.register_fake
def _causal_conv1d_fwd_fake(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    seq_idx: Optional[torch.Tensor] = None,
    activation: Optional[str] = None,
) -> torch.Tensor:
    # The channel count of x must match the depthwise filter count.
    torch._check(x.shape[-2] == weight.shape[0])
    # Output has the same shape/dtype/device as the input.
    return torch.empty_like(x)
50
+
51
# Causal Conv1D backward: a separate custom op that wraps the CUDA kernel.
@torch.library.custom_op(
    "mamba_causal_conv1d::causal_conv1d_bwd",
    mutates_args=(),
    device_types="cuda",
)
def causal_conv1d_bwd(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor],
    dout: torch.Tensor,
    seq_idx: Optional[torch.Tensor],
    activation: bool,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Gradients of causal_conv1d_fwd w.r.t. (x, weight, bias)."""
    # The kernel needs dout contiguous along at least one of the last two dims.
    if dout.stride(2) != 1 and dout.stride(1) != 1:
        dout = dout.contiguous()

    dx, dweight, dbias, _ = causal_conv1d_cuda.causal_conv1d_bwd(
        x, weight, bias, dout, seq_idx, None, None, None, False, activation
    )

    # Custom ops must return tensors, so stand in an empty one when bias is
    # absent; the autograd bridge converts it back to None.
    dbias = dbias if bias is not None else torch.empty((0,), device=dout.device)

    return dx, dweight, dbias
78
+
79
# FakeTensor implementation of the backward op.
# Bug fix: the real op always returns a Tensor for dbias (an empty (0,) tensor
# when bias is None) and is annotated as Tuple[Tensor, Tensor, Tensor]; this
# fake previously returned None in that case, desynchronizing it from the op
# schema. It now mirrors the real op exactly.
@causal_conv1d_bwd.register_fake
def _causal_conv1d_bwd_fake(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor],
    dout: torch.Tensor,
    seq_idx: Optional[torch.Tensor],
    activation: bool,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    dbias = (
        torch.empty_like(bias)
        if bias is not None
        else torch.empty((0,), device=dout.device)
    )
    return torch.empty_like(x), torch.empty_like(weight), dbias
94
+
95
# Stash everything backward needs on the autograd context.
def causal_conv1d_setup_context(ctx, inputs, output):
    x, weight, bias, seq_idx, activation = inputs
    # Translate the activation string into the boolean flag backward expects.
    ctx.activation = activation in ["silu", "swish"]
    ctx.save_for_backward(x, weight, bias, seq_idx)
100
+
101
# Autograd bridge: unpack saved tensors and dispatch to the backward op.
def causal_conv1d_bwd_bridge(ctx, dout):
    x, weight, bias, seq_idx = ctx.saved_tensors
    dx, dweight, dbias = causal_conv1d_bwd(x, weight, bias, dout, seq_idx, ctx.activation)

    # The op returns a placeholder tensor for dbias when bias is None;
    # autograd expects None for absent inputs.
    dbias = dbias if bias is not None else None
    # Gradients for (x, weight, bias, seq_idx, activation); the last two are
    # not differentiable.
    return dx, dweight, dbias, None, None


# Wire the forward op, context setup, and backward bridge into autograd.
torch.library.register_autograd(
    "mamba_causal_conv1d::causal_conv1d_fwd",
    causal_conv1d_bwd_bridge,
    setup_context=causal_conv1d_setup_context,
)
116
+
117
# Public entry point mirroring causal_conv1d's API on top of the custom op.
def causal_conv1d_fn(x, weight, bias=None, seq_idx=None, activation=None):
    return causal_conv1d_fwd(x, weight, bias, seq_idx, activation)
120
+
121
+
122
@torch.library.custom_op(
    "mamba_causal_conv1d::causal_conv1d_update",
    mutates_args=(),
    device_types="cuda",
)
def causal_conv1d_update_fwd(
    x: torch.Tensor,
    conv_state: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    activation: Optional[str] = None,
    cache_seqlens: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    x: (batch, dim) or (batch, dim, seqlen)
    conv_state: (batch, dim, state_len), where state_len >= width - 1
    weight: (dim, width)
    bias: (dim,)
    cache_seqlens: (batch,), dtype int32.
        If not None, the conv_state is treated as a circular buffer.
        The conv_state will be updated by copying x to the conv_state starting at the index
        @cache_seqlens % state_len.

    out: (batch, dim) or (batch, dim, seqlen)
    """
    if activation not in [None, "silu", "swish"]:
        raise NotImplementedError("activation must be None, silu, or swish")
    apply_silu = activation in ["silu", "swish"]

    # The kernel works on (batch, dim, seqlen); add a length-1 axis if needed
    # and strip it again afterwards.
    squeeze_back = x.dim() == 2
    if squeeze_back:
        x = x.unsqueeze(-1)
    out = causal_conv1d_cuda.causal_conv1d_update(
        x, conv_state, weight, bias, apply_silu, cache_seqlens
    )
    return out.squeeze(-1) if squeeze_back else out
159
+
160
# FakeTensor implementation for the update op: shape propagation only.
@causal_conv1d_update_fwd.register_fake
def _causal_conv1d_update_fwd(
    x: torch.Tensor,
    conv_state: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    activation: Optional[str] = None,
    cache_seqlens: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    # Output matches the input's shape/dtype/device.
    return torch.empty_like(x)
170
+
171
# Convenience wrapper matching causal_conv1d's incremental-update API.
def causal_conv1d_update(x, conv_state, weight, bias=None, activation=None, cache_seqlens=None):
    return causal_conv1d_update_fwd(x, conv_state, weight, bias, activation, cache_seqlens)
173
+
174
# Smoke test: compare the eager custom op, its compiled form, and the
# reference causal_conv1d implementation (requires a CUDA device).
if __name__ == "__main__":
    from causal_conv1d import causal_conv1d_fn as causal_conv1d_fn_ref

    torch.manual_seed(0)

    x = torch.randn(8, 32, 16, device="cuda", requires_grad=True)
    weight = torch.randn(32, 3, device="cuda", requires_grad=True)
    bias = None  # torch.randn(32, device="cuda", requires_grad=True)

    def _run_and_report(fn):
        # Forward + backward, then print summary statistics of out and grads.
        out = fn(x, weight, bias, activation="silu")
        out.sum().backward()
        print(out.min(), out.max(), out.mean(), out.std())
        print(x.grad.min(), x.grad.max(), x.grad.mean(), x.grad.std())
        print(weight.grad.min(), weight.grad.max(), weight.grad.mean(), weight.grad.std())

    print("Custom Implementation")
    _run_and_report(causal_conv1d_fn)

    # Try compiling the function using torch.compile.
    x.grad.zero_(), weight.grad.zero_()
    compiled_conv1d = torch.compile(causal_conv1d_fn)
    print(compiled_conv1d)

    print("Compiled Implementation")
    _run_and_report(compiled_conv1d)

    print("Reference Implementation")
    x.grad.zero_(), weight.grad.zero_()
    _run_and_report(causal_conv1d_fn_ref)
config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "minimamba",
3
+ "_name_or_path": "Mamba_546M",
4
+ "architectures": ["MiniMamba"],
5
+ "dim": 896,
6
+ "num_layers": 56,
7
+ "num_heads": 32,
8
+ "state_dim": 128,
9
+ "num_groups": 1,
10
+ "conv_size": 4,
11
+ "use_mem_eff_path": true,
12
+ "dt_bias": true,
13
+ "D_has_head_dim": true,
14
+ "learnable_init_states": false,
15
+ "ssm_chunk_size": 256,
16
+ "vocab_size": 200064,
17
+ "mlp_scale": 2,
18
+ "multiple_of": 256,
19
+ "norm_eps": 1e-5,
20
+ "init_use_depth": false,
21
+ "init_base_std": null,
22
+ "init_std_factor": "disabled",
23
+ "hidden_act": "silu",
24
+ "bias": false,
25
+ "torch_dtype": "bfloat16",
26
+ "seed": 1337,
27
+ "init_args": {
28
+ "dt_max": 0.1,
29
+ "dt_min": 0.001,
30
+ "dt_init_floor": 1e-4,
31
+ "A_init_min": 0.01,
32
+ "A_init_max": 16
33
+ },
34
+ "seq_len": 8192,
35
+ "window_size": 1024,
36
+ "weight_tying": true,
37
+ "dropout": 0.0,
38
+ "num_epochs": 1,
39
+ "global_bsz": 524288,
40
+ "bsz": 1,
41
+ "warmup_steps": 1907,
42
+ "eval_period": 50,
43
+ "save_period": 500,
44
+ "max_lr": 3.0e-4,
45
+ "min_lr": 3.0e-5,
46
+ "max_norm": 1.0,
47
+ "dilation": 1,
48
+ "fsdp": false,
49
+ "ddp": true,
50
+ "mixed_precision": true,
51
+ "cpu_offload": false,
52
+ "sharding_strategy": "full_shard",
53
+ "state_dict_type": "full",
54
+ "auto_wrap_policy": "partial",
55
+ "backward_prefetch": "backward_pre",
56
+ "forward_prefetch": false,
57
+ "sync_module_states": true,
58
+ "use_orig_params": true,
59
+ "device_id": null,
60
+ "precision": {
61
+ "param": "bfloat16",
62
+ "reduce": "bfloat16",
63
+ "buffer": "bfloat16"
64
+ },
65
+ "fsdp_modules": [
66
+ "MambaBlock",
67
+ "AttentionLayer"
68
+ ],
69
+ "use_activation_checkpointing": true,
70
+ "use_attn": true,
71
+ "use_alibi": true,
72
+ "softcap": 50.0,
73
+ "theta": 10000.0,
74
+ "torch_compile": false
75
+ }
configuration_minimamba.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class MiniMambaConfig(PretrainedConfig):
    """
    Minimal or extended config class for MiniMamba.
    Inherits from HF's PretrainedConfig so we can do:
        model = MiniMamba.from_pretrained(...)
    and it will load this config automatically.

    This config includes all fields from the provided config.json.

    Bug fixes in this revision:
      - `mlp_scale` was accepted as a parameter but never stored, so
        `config.mlp_scale` was missing for modules that read it; it is now
        assigned.
      - `architectures` used a mutable list default; it now defaults to None.
      - `window_size`, `use_alibi`, and `theta` (read by the attention layer,
        present in config.json) are now explicit parameters with the same
        defaults instead of relying on the kwargs passthrough.
    """
    model_type = "minimamba"

    def __init__(
        self,
        # Standard HF fields:
        model_type="minimamba",
        _name_or_path="Mamba_5460M",
        architectures=None,

        # Key Mamba architecture hyperparameters:
        dim=896,
        num_layers=56,
        num_heads=32,
        state_dim=128,
        num_groups=1,
        conv_size=4,
        use_mem_eff_path=True,
        dt_bias=True,
        D_has_head_dim=True,
        learnable_init_states=False,
        ssm_chunk_size=256,
        vocab_size=200064,
        mlp_scale=2,
        ffn_dim_multiplier=2.0,
        multiple_of=256,
        norm_eps=1e-5,
        init_use_depth=False,
        init_base_std=None,
        init_std_factor="disabled",
        hidden_act="silu",
        bias=False,

        # Torch / training:
        torch_dtype="bfloat16",
        seed=1337,

        # The init_config block nested in JSON:
        init_args=None,  # e.g. dict with dt_max, dt_min, dt_init_floor, ...

        # Additional Mamba or training fields:
        seq_len=8192,
        weight_tying=True,
        dropout=0.0,
        num_epochs=1,
        global_bsz=524288,
        bsz=1,
        warmup_steps=1907,
        eval_period=50,
        save_period=500,
        max_lr=0.0003,
        min_lr=3e-5,
        max_norm=1.0,
        dilation=1,
        fsdp=False,
        ddp=True,
        mixed_precision=True,
        cpu_offload=False,
        sharding_strategy="full_shard",
        state_dict_type="full",
        auto_wrap_policy="partial",
        backward_prefetch="backward_pre",
        forward_prefetch=False,
        sync_module_states=True,
        use_orig_params=True,
        device_id=None,
        precision=None,  # e.g. dict with param="bfloat16", reduce="bfloat16", buffer="bfloat16"
        fsdp_modules=None,  # e.g. ["MambaBlock"]
        use_activation_checkpointing=True,
        use_attn=True,
        softcap=50.0,
        torch_compile=True,
        # New explicit attention-related fields (appended to keep positional
        # compatibility with existing callers; defaults match config.json):
        window_size=1024,
        use_alibi=True,
        theta=10000.0,

        # Now accept arbitrary additional kwargs, to remain flexible:
        **kwargs
    ):
        # Avoid a shared mutable default for the architectures list.
        if architectures is None:
            architectures = ["MiniMamba"]
        super().__init__(
            # In HF, these common keys are typically passed to the parent:
            model_type=model_type,
            _name_or_path=_name_or_path,
            architectures=architectures,
            **kwargs
        )

        self.dim = dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.state_dim = state_dim
        self.num_groups = num_groups
        self.conv_size = conv_size
        self.use_mem_eff_path = use_mem_eff_path
        self.dt_bias = dt_bias
        self.D_has_head_dim = D_has_head_dim
        self.learnable_init_states = learnable_init_states
        self.ssm_chunk_size = ssm_chunk_size
        self.vocab_size = vocab_size
        # Bug fix: previously accepted but never stored.
        self.mlp_scale = mlp_scale
        self.ffn_dim_multiplier = ffn_dim_multiplier
        self.multiple_of = multiple_of
        self.norm_eps = norm_eps
        self.init_use_depth = init_use_depth
        self.init_base_std = init_base_std
        self.init_std_factor = init_std_factor
        self.hidden_act = hidden_act
        self.bias = bias

        self.torch_dtype = torch_dtype
        self.seed = seed

        # Nested init_args (dt_max, dt_min, etc.).
        # Could store it as a dict, or parse out the fields individually:
        self.init_args = init_args or {}

        self.seq_len = seq_len
        self.weight_tying = weight_tying
        self.dropout = dropout
        self.num_epochs = num_epochs
        self.global_bsz = global_bsz
        self.bsz = bsz
        self.warmup_steps = warmup_steps
        self.eval_period = eval_period
        self.save_period = save_period
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.max_norm = max_norm
        self.dilation = dilation
        self.fsdp = fsdp
        self.ddp = ddp
        self.mixed_precision = mixed_precision
        self.cpu_offload = cpu_offload
        self.sharding_strategy = sharding_strategy
        self.state_dict_type = state_dict_type
        self.auto_wrap_policy = auto_wrap_policy
        self.backward_prefetch = backward_prefetch
        self.forward_prefetch = forward_prefetch
        self.sync_module_states = sync_module_states
        self.use_orig_params = use_orig_params
        self.device_id = device_id
        self.precision = precision
        self.fsdp_modules = fsdp_modules
        self.use_activation_checkpointing = use_activation_checkpointing
        self.use_attn = use_attn
        self.softcap = softcap
        self.torch_compile = torch_compile
        self.window_size = window_size
        self.use_alibi = use_alibi
        self.theta = theta

        # If you want to store any leftover kwargs:
        self.extra_args = kwargs
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.py ADDED
@@ -0,0 +1,788 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Bug fix: the file previously began with a stray bare line `model.py`, which
# is evaluated as an attribute access on an undefined name and raises
# NameError the moment the module is imported. It has been removed.
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from enum import Enum
from dataclasses import dataclass, field
from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states
from mamba_ssm.ops.triton.selective_state_update import selective_state_update

# --- TODO: These two are always compiled even when kernel.compile is disabled. We should fix this. ---
from causal_conv1d_compilable import causal_conv1d_fn, causal_conv1d_update
from ssm_compilable import mamba_chunk_scan_combined
# -----------------------------------------------------------------------------------------------------

from .norms import build_norm
from .attn import AttentionLayer
from .attn import precompute_freqs_cis
from mamba_ssm.ops.triton.ssd_combined import mamba_split_conv1d_scan_combined
21
+
22
+
23
class InitStdFactor(Enum):
    """How the base initialization std is scaled before use."""

    DISABLED = "disabled"  # Init std is divided by 1.0
    GLOBAL_DEPTH = "global_depth"  # Init std is divided by sqrt(2*num_layers)
    CURRENT_DEPTH = "current_depth"  # Init std is divided by sqrt(2*depth)
    DIM_RATIO = "dim_ratio"  # Init std is divided by model_dim/4096
28
+
29
+
30
@dataclass
class InitConfig:
    """Initialization hyperparameters for the SSM dt and A parameters."""

    # dt (time-step) initialization range and floor.
    dt_max: float = 0.1
    dt_min: float = 0.001
    dt_init_floor: float = 1e-4

    # Range for the A parameter's initialization.
    A_init_min: float = 1
    A_init_max: float = 16


DEFAULT_INIT_CONFIG = InitConfig()
42
+
43
+
44
@dataclass
class BaseMambaConfig:
    """
    Configuration for the Mamba family of models.

    NOTE: field order is part of the dataclass-generated __init__ signature
    and must not be reordered.
    """
    dim: int = 512
    num_layers: int = 8
    num_heads: int = 8

    state_dim: int = 128
    num_groups: int = 1
    conv_size: int | None = 4

    bias: bool = False  # Linear bias
    conv_bias: bool = True  # Convolutional bias
    dt_bias: bool = False
    D_has_head_dim: bool = False
    learnable_init_states: bool = False

    mlp_scale: int = 2
    multiple_of: int = 256  # Enforce that MLP hidden layer size is multiple of a large power of 2

    norm_eps: float = 1e-6
    norm_type: str = "rmsnorm"

    # CUDA-related items
    ssm_chunk_size: int = 256
    use_mem_eff_path: bool = False

    # Initialization-related items
    init_use_depth: bool = False
    init_base_std: float | None = None
    init_std_factor: str = "disabled"  # e.g. "global_depth"
    init_config: InitConfig = field(default_factory=InitConfig)
78
+
79
+
80
+ class SSM(nn.Module):
81
+ """
82
+ State Space Model (SSM) implementation with selective state updates and convolution.
83
+
84
+ Implements the core SSM computation with support for both training and inference modes.
85
+ During inference, uses cached states for efficient token-by-token generation.
86
+ """
87
+ def __init__(self, config: BaseMambaConfig) -> None:
88
+ """Initialize SSM parameters and layers.
89
+ Args:
90
+ config: Configuration containing model hyperparameters
91
+ """
92
+ super().__init__()
93
+ self.config = config
94
+ vars(self).update(vars(config))
95
+
96
+ assert self.dim > 0, "Model dimension (config.dim) must be positive"
97
+ assert self.num_heads > 0, "Number of heads (config.num_heads) must be positive"
98
+ assert self.state_dim > 0, "State dimension (config.state_dim) must be positive"
99
+
100
+ if self.mlp_scale is None:
101
+ raise ValueError(
102
+ "mlp_scale must be set to a valid float (e.g. 2.0) "
103
+ "to determine hidden_dim in SSM."
104
+ )
105
+ assert self.mlp_scale > 0, "mlp_scale must be > 0"
106
+
107
+ self.hidden_dim = int(self.mlp_scale * self.dim)
108
+ self.hidden_dim = config.multiple_of * ( # Round up to multiple_of
109
+ (self.hidden_dim + self.multiple_of - 1) // self.multiple_of
110
+ )
111
+
112
+ assert self.hidden_dim % self.num_heads == 0, (
113
+ f"Hidden dim {self.hidden_dim} not divisible by num_heads={self.num_heads}."
114
+ )
115
+
116
+ self.head_dim = self.hidden_dim // self.num_heads
117
+
118
+ self.dt_limit_kwargs = {}
119
+ dt_limit = (self.init_config.dt_min, self.init_config.dt_max)
120
+ if dt_limit != (0.0, float("inf")):
121
+ self.dt_limit_kwargs = dict(dt_limit=dt_limit)
122
+
123
+ # Order: [z, x, B, C, dt]
124
+ d_input = (
125
+ 2 * self.hidden_dim
126
+ + 2 * self.num_groups * self.state_dim
127
+ + self.num_heads
128
+ )
129
+
130
+ self.input = nn.Linear(self.dim, d_input, bias=self.bias)
131
+
132
+ # Only create Conv1d if self.conv_size is specified
133
+ if self.conv_size is not None:
134
+ conv_dim = self.hidden_dim + 2 * self.num_groups * self.state_dim
135
+
136
+ # Depthwise-ish conv (groups = out_channels)
137
+ # TODO: Check that this is used if causal_conv1d_fn and causal_conv1d_update cannot be imported
138
+ self.conv1d = nn.Conv1d(
139
+ in_channels=conv_dim,
140
+ out_channels=conv_dim,
141
+ kernel_size=self.conv_size,
142
+ groups=conv_dim,
143
+ bias=self.conv_bias, # <- This is a boolean in your config, so pass that or True/False
144
+ padding=self.conv_size - 1 # for "causal" style
145
+ )
146
+
147
+ if config.dt_bias:
148
+ self.dt_bias = nn.Parameter(torch.empty(self.num_heads))
149
+ else:
150
+ self.dt_bias = nn.Parameter(torch.zeros(self.num_heads), requires_grad=False)
151
+
152
+ self.A_log = nn.Parameter(torch.empty(self.num_heads))
153
+
154
+ if config.D_has_head_dim:
155
+ self.D = nn.Parameter(torch.ones(self.num_heads, self.head_dim))
156
+ else:
157
+ self.D = nn.Parameter(torch.ones(self.num_heads))
158
+
159
+ if self.learnable_init_states:
160
+ self.init_states = nn.Parameter(torch.zeros(self.num_heads, self.head_dim, self.state_dim))
161
+
162
+ self.norm = build_norm(config.norm_type, dim=self.hidden_dim, eps=self.norm_eps)
163
+ self.output = nn.Linear(self.hidden_dim, self.dim, bias=self.bias)
164
+
165
    def _causal_conv(
        self,
        zxbcdt: torch.Tensor,
        tok_idx: torch.Tensor | None = None,
        cu_seqlens: torch.Tensor | None = None,
        ssm_impl: str = "ssm"
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # TODO: Make slightly less verbose
        """Processes input through causal convolution path, handling both full sequence and incremental cases.

        This function implements two processing modes:
        1. Full sequence ("ssm"): Used during training and initial prompt processing.
        2. Incremental ("ssm_update"): Used during token-by-token generation.

        Args:
            zxbcdt: Input tensor containing concatenated [z, x, B, C, dt] components
            tok_idx: Token indices for sequence processing. Required for "ssm" mode.
                Defaults to None.
            cu_seqlens: Cumulative sequence lengths for variable length processing.
                Used only in "ssm" mode with caching. Defaults to None.
            ssm_impl: Implementation mode, either "ssm" for full sequence processing
                or "ssm_update" for incremental generation. Defaults to "ssm".

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
                Tuple containing separated components (z, x, B, C, dt), where:
                - z: Gating branch
                - x: Main branch
                - B, C: SSM state matrices (analogous to K, Q in attention)
                - dt: Time delta values

        Notes:
            - When using "ssm" mode during inference, a cache should be pre-initialized
              externally. This design allows for flexible caching strategies without
              modifying model code.
            - The "ssm_update" mode requires a cache to exist and will use it for
              incremental state updates during generation.
            - B, C components correspond to Key, Query in the SSM/attention duality.
        """
        # Split the fused projection: z (gate), xBC (conv input), dt (time deltas).
        z, xBC, dt = torch.split(
            zxbcdt,
            [
                self.hidden_dim,
                self.hidden_dim + 2 * self.num_groups * self.state_dim,
                self.num_heads,
            ],
            dim=-1,
        )

        if ssm_impl == "ssm":
            # Full-sequence path. If a cache is attached, snapshot the per-sequence
            # conv states so later "ssm_update" steps can continue from them.
            if hasattr(self, "cache"):
                conv_varlen_states = causal_conv1d_varlen_states(
                    xBC.squeeze(0),
                    cu_seqlens,
                    state_len=self.cache.conv_cache.shape[-1],
                )
                self.cache.conv_cache.copy_(conv_varlen_states)

            # causal_conv1d_fn expects channels-first; transpose in and out.
            xBC = causal_conv1d_fn(
                x=xBC.transpose(1, 2),
                weight=self.conv1d.weight.squeeze(1),
                bias=self.conv1d.bias,
                activation="silu",
                seq_idx=tok_idx,
            ).transpose(1, 2)
        elif ssm_impl == "ssm_update":
            # Single-step path: advance the cached conv state by one token.
            xBC = causal_conv1d_update(
                x=xBC.squeeze(0),
                conv_state=self.cache.conv_cache,
                weight=self.conv1d.weight.squeeze(1),
                bias=self.conv1d.bias,
                activation="silu",
            ).unsqueeze(0)
        else:
            raise NotImplementedError(f"SSM implementation {ssm_impl} not supported")

        # Split the convolved tensor into main branch and SSM state projections.
        x, B, C = torch.split(
            xBC,
            [
                self.hidden_dim,
                self.num_groups * self.state_dim,
                self.num_groups * self.state_dim,
            ],
            dim=-1,
        )

        return z, x, B, C, dt
254
+
255
+ def _non_causal_conv(self, zxbcdt: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
256
+ z, x, B, C, dt = torch.split(
257
+ zxbcdt,
258
+ [
259
+ self.hidden_dim,
260
+ self.hidden_dim,
261
+ self.num_groups * self.state_dim,
262
+ self.num_groups * self.state_dim,
263
+ self.num_heads,
264
+ ],
265
+ dim=-1,
266
+ )
267
+ return z, x, B, C, dt
268
+
269
    def _fwd(self, x, dt, A, B, C, tok_idx, cu_seqlens, initial_states):
        """Full-sequence chunked SSM scan (training and prompt processing).

        Args:
            x: Main-branch activations, (bsz, seq_len, num_heads, head_dim).
            dt: Per-head time deltas (softplus is applied inside the kernel).
            A: Per-head decay rates; caller passes -exp(A_log), so negative.
            B: SSM input projection (Key analogue).
            C: SSM output projection (Query analogue).
            tok_idx: Per-token sequence indices for packed batches, or None.
            cu_seqlens: Cumulative sequence lengths for varlen batches, or None.
            initial_states: Optional initial SSM states (learnable path).

        Returns:
            (bsz, seq_len, num_heads, head_dim)
        """
        y = mamba_chunk_scan_combined(
            x,
            dt,
            A,
            B,
            C,
            dt_bias=self.dt_bias,
            dt_softplus=True,
            chunk_size=self.ssm_chunk_size,
            D=self.D,
            z=None,  # gating z is applied outside the kernel (post-norm path)
            seq_idx=tok_idx,
            cu_seqlens=cu_seqlens,
            initial_states=initial_states,
            **self.dt_limit_kwargs,
        )

        # With a cache attached the kernel also returns the final varlen states;
        # stash them so incremental decoding ("ssm_update") can continue.
        if hasattr(self, "cache"):
            y, varlen_states = y
            self.cache.state_cache.copy_(varlen_states)

        return y
298
+
299
    def _step(self, x, seq_len, dt, A, B, C):
        """Single-step SSM state update for inference / token-by-token generation.

        Advances `self.cache.state_cache` in place via `selective_state_update`
        and returns the output for the current step.
        """
        x = x.squeeze(0)
        # Broadcast the scalar per-head decay over (head_dim, state_dim).
        A = A[..., None, None].expand(self.num_heads, self.head_dim, self.state_dim)
        # assumes dt arrives as (bsz, seq_len, num_heads) before permute — TODO confirm
        dt = dt.permute(1, 2, 0).expand(seq_len, self.num_heads, self.head_dim)
        D = self.D
        if D is not None and D.dim() == 1:
            # Per-head D: broadcast across head_dim for the kernel.
            D = D.unsqueeze(1).expand(self.num_heads, self.head_dim)
        B, C = B.squeeze(0), C.squeeze(0)
        y = selective_state_update(
            self.cache.state_cache,  # updated in place
            x,
            dt,
            A,
            B,
            C,
            D,
            z=None,  # gating applied by the caller
            dt_bias=(
                # Kernel requires a (num_heads, head_dim) bias; zeros when absent.
                torch.zeros(self.num_heads, self.head_dim).to(x)
                if self.dt_bias is None
                else self.dt_bias.unsqueeze(1).expand(self.num_heads, self.head_dim)
            ),
            dt_softplus=True,
        ).unsqueeze(0)

        return y
328
+
329
    def forward(
        self,
        x: torch.Tensor,
        tok_idx: torch.Tensor | None = None,
        cu_seqlens: torch.Tensor | None = None,
        ssm_impl: str = "ssm",
    ) -> torch.Tensor:
        """Apply the Mamba-2 mixer to a batch of hidden states.

        Args:
            x: Input hidden states, (bsz, seq_len, dim).
            tok_idx: Optional per-token sequence indices for packed batches.
            cu_seqlens: Optional cumulative sequence lengths for varlen batches.
            ssm_impl: "ssm" for full-sequence processing, "ssm_update" for
                single-step incremental decoding.

        Returns:
            Output hidden states, (bsz, seq_len, dim).
        """
        bsz, seq_len, _ = x.shape

        # Fused input projection producing concatenated [z, x, B, C, dt].
        zxbcdt = self.input(x)

        # A = -exp(A_log) keeps the SSM decay rates strictly negative.
        A = -torch.exp(self.A_log.float())
        initial_states = (
            self.init_states.expand(bsz, -1, -1, -1)
            if self.learnable_init_states else None
        )

        # Causal conv path
        if self.conv_size is not None:

            # Memory-efficient Triton kernel path: conv + scan + norm + out-proj fused.
            if self.use_mem_eff_path:
                out = mamba_split_conv1d_scan_combined(
                    zxbcdt,
                    self.conv1d.weight.squeeze(1),
                    self.conv1d.bias,
                    self.dt_bias,
                    A,
                    D=self.D,
                    chunk_size=self.ssm_chunk_size,
                    seq_idx=tok_idx,
                    activation="silu",
                    rmsnorm_weight=self.norm.weight,
                    rmsnorm_eps=self.norm.eps,
                    outproj_weight=self.output.weight,
                    outproj_bias=self.output.bias,
                    headdim=self.head_dim,
                    ngroups=self.num_groups,
                    norm_before_gate=False,  # Post-norm, y = self.norm(y * F.silu(z))
                    initial_states=initial_states,
                    **self.dt_limit_kwargs,
                )
                return out
            else:
                # CUDA kernel path
                # NOTE(review): tok_idx / cu_seqlens are not forwarded to
                # _causal_conv here, so they default to None — confirm intended.
                z, x, B, C, dt = self._causal_conv(zxbcdt)
        else:
            # Non-causal conv path
            z, x, B, C, dt = self._non_causal_conv(zxbcdt)

        # Reshape flat projections into per-head / per-group layouts.
        x = x.view(bsz, seq_len, self.num_heads, self.head_dim)
        B = B.view(bsz, seq_len, self.num_groups, self.state_dim)
        C = C.view(bsz, seq_len, self.num_groups, self.state_dim)

        # Chunked SSM scan
        if ssm_impl == "ssm":
            # (bsz, seq_len, num_heads, head_dim)
            y = self._fwd(x, dt, A, B, C, tok_idx, cu_seqlens, initial_states)
        elif ssm_impl == "ssm_update":
            y = self._step(x, seq_len, dt, A, B, C)
        else:
            raise NotImplementedError(f"SSM implementation {ssm_impl} not supported")

        y = y.view(bsz, seq_len, self.hidden_dim)

        # Could be different activation function, including None.
        # Mamba people post_norm here also (sometimes norm(z)*y or norm(z*y))
        # y = self.norm(y) * F.silu(z)
        y = self.norm(y * F.silu(z))
        out = self.output(y)

        return out
401
+
402
    @torch.inference_mode()
    def reset_parameters(self, init_std, factor) -> None:
        """Re-initialize all SSM parameters.

        Args:
            init_std: Base std for the linear projections; when falsy, falls
                back to dim**-0.5 (input) / hidden_dim**-0.5 (output).
            factor: Depth-dependent divisor applied to the output-projection std.
        """
        # NOTE(review): relies on self.config being set by __init__ (not visible
        # in this chunk) — confirm.
        config = self.config
        init_config = config.init_config
        if init_config is None:
            init_config = DEFAULT_INIT_CONFIG

        # Linear layers: truncated-normal at +/- 3 std.
        in_init_std = init_std or (self.dim ** (-0.5))
        out_init_std = init_std or (self.hidden_dim ** (-0.5))
        out_init_std = out_init_std / factor

        nn.init.trunc_normal_(
            self.input.weight,
            mean=0.0,
            std=in_init_std,
            a=-3 * in_init_std,
            b=3 * in_init_std,
        )

        nn.init.trunc_normal_(
            self.output.weight,
            mean=0.0,
            std=out_init_std,
            a=-3 * out_init_std,
            b=3 * out_init_std,
        )

        # SSM: sample dt_bias so that softplus(dt_bias) ~ Uniform[dt_min, dt_max].
        if self.dt_bias is not None and self.dt_bias.requires_grad:
            log_dt_min = math.log(init_config.dt_min)
            log_dt_max = math.log(init_config.dt_max)

            # Sample log_dt ~ Uniform[log_dt_min, log_dt_max]
            log_dt = torch.rand(self.num_heads, device=self.dt_bias.device) * (log_dt_max - log_dt_min) + log_dt_min
            dt = torch.exp(log_dt)
            dt = torch.clamp(dt, min=init_config.dt_init_floor)

            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            self.dt_bias.copy_(inv_dt)

        elif self.dt_bias is not None:
            # If dt_bias is not trainable, we can just keep it zero or set to any constant
            self.dt_bias.fill_(0.0)

        # Convolution
        if self.conv_size is not None:
            conv_std = init_std or (self.conv_size ** (-0.5))
            nn.init.trunc_normal_(
                self.conv1d.weight,
                mean=0.0,
                std=conv_std,
                a=-3 * conv_std,
                b=3 * conv_std,
            )
            if self.conv1d.bias is not None:
                nn.init.zeros_(self.conv1d.bias)

        # Learnable init states start at zero.
        if self.learnable_init_states:
            self.init_states.zero_()

        # Initialize A_log ~ log( Uniform(A_init_min, A_init_max) )
        self.A_log.uniform_(init_config.A_init_min, init_config.A_init_max)
        self.A_log.log_()

        # Skip connection starts as identity.
        if self.D is not None:
            self.D.data.fill_(1.0)

        # Reset norm parameters
        self.norm.reset_parameters()
474
+
475
+
476
class MambaBlock(nn.Module):
    """Pre-norm residual block wrapping a single SSM mixer.

    Computes ``x + SSM(norm(x))``, forwarding packed-batch metadata and the
    SSM implementation selector straight through to the mixer.
    """

    def __init__(self, config: BaseMambaConfig):
        super().__init__()
        self.norm = build_norm(config.norm_type, dim=config.dim, eps=config.norm_eps)
        self.ssm = SSM(config)

    def forward(
        self,
        x: torch.Tensor,
        tok_idx: torch.Tensor | None,
        cu_seqlens: torch.Tensor | None,
        ssm_impl: str = "ssm",
    ) -> torch.Tensor:
        # Pre-norm residual connection.
        normed = self.norm(x)
        mixed = self.ssm(
            normed, tok_idx=tok_idx, cu_seqlens=cu_seqlens, ssm_impl=ssm_impl
        )
        return x + mixed

    @torch.inference_mode()
    def init_weights(self, init_std=None, factor=1.0):
        """Reset the norm and delegate parameter init to the SSM mixer."""
        self.norm.reset_parameters()
        self.ssm.reset_parameters(init_std, factor)
496
+
497
+
498
class BaseMamba(nn.Module):
    """Backbone interleaving Mamba (SSM) blocks with optional attention layers.

    Even-indexed layers are always MambaBlocks; odd-indexed layers are
    AttentionLayers when ``config.use_attn`` is set, otherwise MambaBlocks.
    """

    def __init__(self, config: BaseMambaConfig):
        super().__init__()
        # Bug fix: the original assertion message interpolated self.dim and
        # self.num_heads, which are not set at this point — a failing assert
        # would raise AttributeError instead of the intended message.
        assert config.dim % config.num_heads == 0, (
            f"dim ({config.dim}) must be divisible by num_heads ({config.num_heads})"
        )
        self.head_dim = config.dim // config.num_heads

        self.model_dim = config.dim
        self.init_base_std = config.init_base_std

        self.init_config = config.init_config
        self.init_std_factor = InitStdFactor(config.init_std_factor)

        # From pytorch/pytorch#123411, we set persistent=True for torch.compile and PP compatibility
        self.register_buffer("freqs_cis", precompute_freqs_cis(
            head_dim=self.head_dim,
            max_seq_len=config.seq_len,
            theta=config.theta,
        ), persistent=True)

        self.layers = nn.ModuleList()
        for layer_idx in range(config.num_layers):
            # For more complex %-split arrangements, see https://arxiv.org/pdf/2406.07887
            if layer_idx % 2 == 0:
                self.layers.append(MambaBlock(config))
            else:
                self.layers.append(
                    AttentionLayer(config)
                    if config.use_attn
                    else MambaBlock(config)
                )

    def _unwrap(self, layer: nn.Module) -> nn.Module:
        """Return the underlying module if wrapped in DDP/FSDP (which expose .module)."""
        while hasattr(layer, "module"):
            layer = layer.module
        return layer

    def forward(
        self,
        h: torch.Tensor,
        tok_idx: torch.Tensor | None,
        cu_seqlens: torch.Tensor | None,
        ssm_impl: str = "ssm",
    ) -> torch.Tensor:
        """Run hidden states through all layers, dispatching per layer type.

        Args:
            h: Hidden states, (bsz, seq_len, dim).
            tok_idx: Optional per-token sequence indices (Mamba layers only).
            cu_seqlens: Optional cumulative sequence lengths (Mamba layers only).
            ssm_impl: SSM implementation selector ("ssm" / "ssm_update").

        Returns:
            Transformed hidden states of the same shape.

        Raises:
            ValueError: If a layer is neither a MambaBlock nor an AttentionLayer.
        """
        for layer in self.layers:
            unwrapped_layer = self._unwrap(layer)
            if isinstance(unwrapped_layer, MambaBlock):
                h = unwrapped_layer(h, tok_idx=tok_idx, cu_seqlens=cu_seqlens, ssm_impl=ssm_impl)
            elif isinstance(unwrapped_layer, AttentionLayer):
                h = unwrapped_layer(h, self.freqs_cis)
            else:
                raise ValueError(f"ERROR: Unexpected layer type: {type(unwrapped_layer).__name__}")
        return h

    @torch.inference_mode()
    def reset_parameters(self):
        """No backbone-owned parameters to reset; subclasses extend this."""
        pass

    @torch.inference_mode()
    def init_weights(self):
        """Initialize all Mamba layers with a depth-dependent std factor."""
        self.reset_parameters()
        for depth, layer in enumerate(self.layers):
            factor = {
                InitStdFactor.CURRENT_DEPTH: (2 * (depth + 1)) ** 0.5,
                InitStdFactor.GLOBAL_DEPTH: (2 * (len(self.layers) + 1)) ** 0.5,
                InitStdFactor.DIM_RATIO: self.model_dim / 4096,
                InitStdFactor.DISABLED: 1.0,
            }[self.init_std_factor]

            if not hasattr(layer, "attn"):  # Only initialize Mamba layers
                layer.init_weights(self.init_base_std, factor)
569
+
570
+
571
@dataclass
class Mamba2Config(BaseMambaConfig):
    """Top-level Mamba-2 model configuration (extends BaseMambaConfig)."""

    seed: int = 1337  # RNG seed for reproducibility

    vocab_size: int = -1  # Will error if unchanged, makes you double check!
    seq_len: int = 8192  # max sequence length (sizes the freqs_cis buffer)
    window_size: int = 1024  # presumably the attention sliding-window size — TODO confirm
    weight_tying: bool = False  # tie output projection to token embedding
    torch_dtype: torch.dtype = torch.bfloat16  # parameter/compute dtype

    loss_reduction: str = "mean"  # reduction used by the training loss

    use_attn: bool = True  # interleave attention layers at odd depths
    use_alibi: bool = True  # presumably enables ALiBi slopes in attention — TODO confirm
    dropout: float = 0.0
    softcap: float = 50.0  # logit soft-capping value (attention path)
    theta: float = 10000.0  # RoPE base frequency for precompute_freqs_cis

    device: torch.device = None
    # NOTE(review): `dtype` appears to duplicate `torch_dtype` — confirm which
    # one consumers treat as authoritative.
    dtype: torch.dtype = torch.bfloat16
591
+
592
+
593
class Mamba2(BaseMamba):
    """Mamba-2 language model: token embedding + BaseMamba backbone + LM head."""

    def __init__(self, config: Mamba2Config) -> None:
        super().__init__(config)
        self.weight_tying = config.weight_tying
        self.loss_reduction = config.loss_reduction

        assert config.vocab_size > 0, "vocab_size must be set and > 0"

        self.tok_emb = torch.nn.Embedding(config.vocab_size, config.dim)

        # Final pre-logits normalization.
        self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps)

        self.output = nn.Linear(
            config.dim,
            config.vocab_size,
            bias=False,
        )

        # Share embedding and unembedding weights when requested.
        if config.weight_tying:
            self.output.weight = self.tok_emb.weight

        print("Model Parameter Count: %.2fM\n" % (self._get_num_params() / 1e6,))

    def _get_num_params(self):
        """Count all parameters, excluding positional embeddings if present."""
        n_params = sum(p.numel() for p in self.parameters())

        # pos_emb is not defined in this class; guard in case a subclass adds it.
        if hasattr(self, "pos_emb") and self.pos_emb is not None:
            n_params -= self.pos_emb.weight.numel()

        return n_params

    def forward(
        self,
        x: torch.Tensor,
        target: torch.Tensor | None = None,
        tok_idx: torch.Tensor | None = None,
        cu_seqlens: torch.Tensor | None = None,
        ssm_impl: str = "ssm",
    ) -> torch.Tensor:
        """Return logits of shape (bsz, seq_len, vocab_size) for token ids `x`.

        NOTE(review): `target` is accepted but unused here — loss is computed
        by the caller; confirm the parameter is kept for interface parity.
        """
        h = self.tok_emb(x)
        h = super().forward(h, tok_idx=tok_idx, cu_seqlens=cu_seqlens, ssm_impl=ssm_impl)
        logits = self.output(self.norm(h))
        return logits

    @torch.inference_mode()
    def reset_parameters(self, init_std=None):
        """Initialize embedding/output weights with truncated normal at +/- 3 std."""
        # Either use fixed base std or sqrt model dim
        super().reset_parameters()
        init_std = init_std or (self.model_dim ** (-0.5))
        self.norm.reset_parameters()
        nn.init.trunc_normal_(
            self.tok_emb.weight,
            mean=0.0,
            std=init_std,
            a=-3 * init_std,
            b=3 * init_std,
        )
        # Skip the output head when tied: it shares storage with tok_emb.
        if not self.weight_tying:
            nn.init.trunc_normal_(
                self.output.weight,
                mean=0.0,
                std=init_std,
                a=-3 * init_std,
                b=3 * init_std,
            )

    @torch.inference_mode()
    def init_weights(self, buffer_device: torch.device = None):
        """
        Initialize model parameters and optionally compute buffers on a specific device.

        Args:
            buffer_device (torch.device, optional): If provided, any large or precomputed
                buffers (like RoPE frequency tensors) will be allocated or re-created on
                this device during initialization. This can avoid overhead from transferring
                buffers between CPU and GPU after creation. If None, buffers default to the
                device of the first parameter or CPU.

        Usage:
            - Pass a GPU device (e.g., ``torch.device('cuda')``) when you want to ensure
              buffers are created directly on GPU, preventing extra transfers.
            - Pass a CPU device (e.g., ``torch.device('cpu')``) if you want to keep
              large buffers in CPU memory (common in CPU-offload or pipeline-parallel setups).
            - Leave it as ``None`` to rely on the model's existing parameter device or
              the default PyTorch device context.

        NOTE(review): `buffer_device` is currently unused — this method only
        delegates to BaseMamba.init_weights(); confirm intended.
        """
        super().init_weights()

    @classmethod
    def from_model_args(cls, config: Mamba2Config) -> "Mamba2":
        """
        Initialize a Mamba model from a MambaConfig object.

        Args:
            config (MambaConfig): Mamba configuration arguments.

        Returns:
            Mamba: Mamba-2 model.
        """
        return cls(config)
700
+
701
+
702
if __name__ == '__main__':
    # Smoke test: build a Mamba2 from config.json and run one forward pass.
    import json

    config_path = "config.json"

    with open(config_path, "r") as f:
        config_data = json.load(f)

    # Pick the best available device: CUDA > MPS > CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print("Device:", device)

    # Resolve the dtype name from the JSON (e.g. "bfloat16") to a torch dtype.
    torch_dtype = getattr(torch, config_data["torch_dtype"])
    print("Torch dtype:", torch_dtype)

    # Required keys use [] (KeyError if missing); optional keys use .get() (None).
    dim = config_data["dim"]
    num_heads = config_data["num_heads"]
    num_layers = config_data["num_layers"]
    vocab_size = config_data["vocab_size"]
    bias = config_data["bias"]
    state_dim = config_data["state_dim"]
    num_groups = config_data["num_groups"]
    conv_size = config_data.get("conv_size")
    use_mem_eff_path = config_data.get("use_mem_eff_path")
    dt_bias = config_data["dt_bias"]
    D_has_head_dim = config_data["D_has_head_dim"]
    learnable_init_states = config_data["learnable_init_states"]
    ssm_chunk_size = config_data["ssm_chunk_size"]
    weight_tying = config_data["weight_tying"]
    mlp_scale = config_data.get("mlp_scale")
    multiple_of = config_data["multiple_of"]
    norm_eps = config_data["norm_eps"]
    init_use_depth = config_data["init_use_depth"]
    init_base_std = config_data.get("init_base_std")
    init_std_factor = config_data["init_std_factor"]
    use_attn = config_data["use_attn"]
    softcap = config_data["softcap"]
    torch_compile = config_data["torch_compile"]

    configs = Mamba2Config(
        dim=dim,
        num_layers=num_layers,
        num_heads=num_heads,
        vocab_size=vocab_size,
        bias=bias,
        torch_dtype=torch_dtype,
        state_dim=state_dim,
        num_groups=num_groups,
        conv_size=conv_size,
        use_mem_eff_path=use_mem_eff_path,
        dt_bias=dt_bias,
        D_has_head_dim=D_has_head_dim,
        learnable_init_states=learnable_init_states,
        ssm_chunk_size=ssm_chunk_size,
        weight_tying=weight_tying,
        mlp_scale=mlp_scale,
        multiple_of=multiple_of,
        norm_eps=norm_eps,
        init_use_depth=init_use_depth,
        init_base_std=init_base_std,
        init_std_factor=init_std_factor,
        use_attn=use_attn,
        softcap=softcap,
    )

    print("Configs:")
    for key, value in vars(configs).items():
        print(f"  {key}: {value}")

    model = Mamba2(configs).to(device=device, dtype=torch_dtype)

    # Random token ids in [0, vocab_size) for a single forward pass.
    x = torch.randint(
        0, configs.vocab_size,
        (config_data["bsz"], config_data["seq_len"]),
        dtype=torch.long
    ).to(device)

    outputs = model(x)

    print("Output shape:", outputs.shape)
    print("Sample output:", outputs[0, 0, :10])
    print("Mean of Mamba output: ", outputs.mean().item())
    print("Stddev of Mamba output: ", outputs.std().item())
modeling_minimamba.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from transformers import PreTrainedModel
7
+ from transformers.modeling_outputs import CausalLMOutput
8
+
9
+ from .configuration_minimamba import MiniMambaConfig
10
+ from .model import Mamba2, Mamba2Config
11
+
12
+
13
+
14
class MiniMamba(PreTrainedModel):
    """
    A Hugging Face–style wrapper around a Mamba2 model, providing:
      • forward(...) returning a CausalLMOutput
      • support for HF training loops
      • a naive generate(...) method with top-k/top-p sampling
    """
    config_class = MiniMambaConfig  # Tells HF which config class to use

    def __init__(self, config: MiniMambaConfig) -> None:
        """Bridge a MiniMambaConfig into a Mamba2Config and build the inner model."""
        super().__init__(config)

        # Translate HF config fields into the Mamba2 training config.
        mamba2_args = Mamba2Config(
            dim=config.dim,
            num_layers=config.num_layers,
            num_heads=config.num_heads,
            state_dim=config.state_dim,
            num_groups=config.num_groups,
            conv_size=config.conv_size,
            use_mem_eff_path=config.use_mem_eff_path,
            dt_bias=config.dt_bias,
            D_has_head_dim=config.D_has_head_dim,
            learnable_init_states=config.learnable_init_states,
            ssm_chunk_size=config.ssm_chunk_size,
            vocab_size=config.vocab_size,
            ffn_dim_multiplier=config.ffn_dim_multiplier,
            multiple_of=config.multiple_of,
            norm_eps=config.norm_eps,
            init_use_depth=config.init_use_depth,
            init_base_std=config.init_base_std,
            init_std_factor=config.init_std_factor,
            bias=config.bias,
            softcap=config.softcap,
            use_attn=config.use_attn,
            seed=config.seed,
            mlp_scale=config.mlp_scale,
            # getattr keeps backward compatibility with configs lacking the field.
            weight_tying=getattr(config, "weight_tying", False),
            torch_dtype=getattr(torch, config.torch_dtype) if isinstance(config.torch_dtype, str) else config.torch_dtype,
        )

        # Mamba2 owns the embedding and LM head (and ties them internally when
        # weight_tying is True), so no separate embedding/lm_head is needed here.
        self.mamba = Mamba2(config=mamba2_args)

        # Convenience records of the preferred runtime device/dtype.
        self.device_ = 'cuda' if torch.cuda.is_available() else 'cpu'
        if isinstance(config.torch_dtype, str):
            self.dtype_ = getattr(torch, config.torch_dtype)
        else:
            self.dtype_ = config.torch_dtype

        # Parameter initialization (HF calls _init_weights in some flows).
        self.apply(self._init_weights)

        print("MiniMamba Model Parameter Count: %.2fM\n" % (self._get_num_params() / 1e6,))

    def forward(
        self,
        input_ids: torch.LongTensor,
        labels: torch.LongTensor = None,
        **kwargs
    ) -> CausalLMOutput:
        """
        Forward pass for causal language modeling.
        Returns a CausalLMOutput that includes loss (if labels is provided) and logits.
        """
        logits = self.mamba(input_ids)  # shape: [batch, seq_len, vocab_size]

        loss = None
        if labels is not None:
            # Standard causal-LM shift: predict token t+1 from positions <= t.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1)
            )

        return CausalLMOutput(
            loss=loss,
            logits=logits,
        )

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        max_new_tokens: int = 50,
        temperature: float = 0.5,
        top_k: int = 50,
        top_p: float = 0.95,
        eos_token_id: int = None,
        pad_token_id: int = 0,  # NOTE: accepted for API parity; currently unused
        **kwargs
    ):
        """
        A naive token-by-token generation loop (top-k/top-p + temperature).
        Recomputes the full sequence each step (no KV/state cache).
        """
        generated_ids = input_ids.clone()

        for _ in range(max_new_tokens):
            # Forward pass to get logits for the last position.
            outputs = self.forward(generated_ids)
            # Bug fix: clone before filtering. outputs.logits[:, -1, :] is a
            # view, and top_k_top_p_filtering mutates its argument in place —
            # without the clone (when temperature == 1.0) we would silently
            # corrupt outputs.logits.
            logits = outputs.logits[:, -1, :].clone()  # (batch_size, vocab_size)

            # Scale by temperature
            if temperature != 1.0:
                logits = logits / temperature

            # Restrict the distribution to top-k / nucleus candidates.
            logits = self.top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

            # Sample next token
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)  # (batch, 1)

            generated_ids = torch.cat([generated_ids, next_token], dim=1)

            # Stop early only when every sequence in the batch emitted EOS.
            if eos_token_id is not None and (next_token == eos_token_id).all():
                break

        return generated_ids

    @staticmethod
    def top_k_top_p_filtering(
        logits: torch.Tensor,
        top_k: int = 50,
        top_p: float = 0.95,
        filter_value: float = float("-inf"),
    ):
        """
        Filters logits using top-k and/or nucleus (top-p) filtering.

        NOTE: mutates ``logits`` in place and also returns it — pass a copy if
        the caller needs the unfiltered values.
        """
        # top_k: drop everything below the k-th largest logit per row.
        if top_k > 0:
            top_k = min(top_k, logits.size(-1))
            indices_to_remove = logits < torch.topk(logits, top_k, dim=-1).values[:, -1, None]
            logits[indices_to_remove] = filter_value

        # top_p (nucleus): keep the smallest prefix of sorted tokens whose
        # cumulative probability exceeds top_p.
        if 0 < top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            sorted_indices_to_remove = cumulative_probs > top_p

            # Shift right to keep also the first token above threshold
            sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
            sorted_indices_to_remove[:, 0] = False

            # Scatter to get back to original indexing
            indices_to_remove = sorted_indices_to_remove.scatter(
                dim=1, index=sorted_indices, src=sorted_indices_to_remove
            )
            logits[indices_to_remove] = filter_value

        return logits

    def _init_weights(self, module):
        """
        HF calls _init_weights to initialize parameters.
        Mamba2 submodules use their own init; plain Linear/Embedding modules
        get a standard normal(0, 0.02) init.
        """
        if isinstance(module, Mamba2):
            module.init_weights()  # Mamba2's internal init
        elif isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def _get_num_params(self):
        """Count trainable parameters (tied weights are counted once by PyTorch)."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
norms.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Adapted from https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py"""
3
+
4
+ import math
5
+
6
+ from functools import partial
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ import triton
12
+ import triton.language as tl
13
+
14
+ from torch.distributed._tensor import Partial, Replicate, Shard
15
+ from torch.distributed._tensor.experimental import local_map
16
+ from torch._utils import _get_available_device_type, _get_device_module
17
+
18
+
19
+ def get_device_info():
20
+ device_type = _get_available_device_type()
21
+
22
+ if device_type is None:
23
+ device_type = "cuda" # Default to CUDA
24
+
25
+ device_module = _get_device_module(device_type)
26
+ return device_type, device_module
27
+
28
+ device_type, device_module = get_device_info()
29
+
30
+ def build_norm(norm_type: str, dim: int, eps: float = 1e-6):
31
+ """
32
+ Builds the specified normalization layer based on the norm_type.
33
+
34
+ Args:
35
+ norm_type (str): The type of normalization layer to build.
36
+ Supported types: layernorm, np_layernorm, rmsnorm, fused_rmsnorm
37
+ dim (int): The dimension of the normalization layer.
38
+ eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6.
39
+
40
+ Returns:
41
+ The built normalization layer.
42
+
43
+ Raises:
44
+ NotImplementedError: If an unknown norm_type is provided.
45
+ """
46
+ norm_type = norm_type.lower() # Normalize to lowercase
47
+
48
+ if norm_type == "layernorm":
49
+ return nn.LayerNorm(dim, eps=eps, bias=False)
50
+ elif norm_type == "np_layernorm":
51
+ return nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
52
+ elif norm_type == "rmsnorm":
53
+ return RMSNorm(dim, eps=eps)
54
+ elif norm_type == "fused_rmsnorm":
55
+ return FusedRMSNorm(dim, eps=eps)
56
+ else:
57
+ raise NotImplementedError(f"Unknown norm_type: '{norm_type}'")
58
+
59
+
60
class FusedRMSNorm(nn.Module):
    """Fused RMS Norm, wraps a fused Triton Kernel"""

    def __init__(
        self,
        dim: int,
        eps: float = 1e-6,
    ):
        """
        Args:
            dim: Size of the last (feature) dimension being normalized.
            eps: Stability term added inside the square root. Defaults to 1e-6.
        """
        super().__init__()
        self.eps = eps
        # Learnable per-feature scale, initialized to identity.
        self.weight = nn.Parameter(torch.ones(dim))
        # fused_rms_norm_fn is a module-level name defined with the Triton
        # kernel elsewhere in this file.
        self.fused_rms_norm_fn = fused_rms_norm_fn

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """leverages Triton Fused RMS Norm kernel"""
        return self.fused_rms_norm_fn(
            x,
            self.weight,
            eps=self.eps,
        )

    def reset_parameters(self):
        """Re-initialize the scale to ones."""
        torch.nn.init.ones_(self.weight)  # type: ignore
+
84
+
85
+ class RMSNorm(torch.nn.Module):
86
+ def __init__(self, dim: int, eps: float = 1e-6):
87
+ """
88
+ Initialize the RMSNorm normalization layer.
89
+
90
+ Args:
91
+ dim (int): The dimension of the input tensor.
92
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
93
+
94
+ Attributes:
95
+ eps (float): A small value added to the denominator for numerical stability.
96
+ weight (nn.Parameter): Learnable scaling parameter.
97
+
98
+ """
99
+ super().__init__()
100
+ self.eps = eps
101
+ self.weight = nn.Parameter(torch.ones(dim))
102
+
103
+ def _norm(self, x):
104
+ """
105
+ Apply the RMSNorm normalization to the input tensor.
106
+
107
+ Args:
108
+ x (torch.Tensor): The input tensor.
109
+
110
+ Returns:
111
+ torch.Tensor: The normalized tensor.
112
+
113
+ """
114
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
115
+
116
+ def forward(self, x):
117
+ """
118
+ Forward pass through the RMSNorm layer.
119
+
120
+ Args:
121
+ x (torch.Tensor): The input tensor.
122
+
123
+ Returns:
124
+ torch.Tensor: The output tensor after applying RMSNorm.
125
+
126
+ """
127
+ output = self._norm(x.float()).type_as(x)
128
+ return output * self.weight
129
+
130
+ def reset_parameters(self):
131
+ torch.nn.init.ones_(self.weight) # type: ignore
132
+
133
+
134
+ # FusedRMSNorm in Triton
135
+
136
+ # Credit
137
+ # Tri Dao's Triton LayerNorm: https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/layer_norm.py
138
+ # Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
139
+
140
+
141
# FusedRMSNorm forward kernel: one Triton program per row of the
# flattened (M, N) input. Autotune varies only num_warps, keyed on N.
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=1),
        triton.Config({}, num_warps=2),
        triton.Config({}, num_warps=4),
        triton.Config({}, num_warps=8),
        triton.Config({}, num_warps=16),
        triton.Config({}, num_warps=32),
    ],
    key=["N"],
)
@triton.jit
def _rms_norm_fwd_kernel(
    X,  # input pointer, logically (M, N)
    stride_x,  # row stride of X
    Y,  # output pointer, logically (M, N)
    stride_y,  # row stride of Y
    W,  # weight pointer, (N,)
    Rstd,  # output: per-row reciprocal std, (M,) float32 (reused by backward)
    eps,  # numerical-stability epsilon added to the variance
    M,  # num rows  (NOTE(review): unused inside the kernel body)
    N,  # num cols
    block_N: tl.constexpr,  # power-of-two block covering a full row (N <= block_N)
):
    row = tl.program_id(0)
    cols = tl.arange(0, block_N)

    # Load input data and weights; out-of-range lanes read 0.0
    mask = cols < N
    x = tl.load(X + row * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
    w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)

    # Compute mean and variance (RMS norm uses mean of squares only;
    # masked lanes contribute 0 so the /N average is exact)
    xbar = tl.where(cols < N, x, 0.0)
    var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)

    # Store the reciprocal standard deviation for the backward pass
    tl.store(Rstd + row, rstd)

    # Normalize and apply linear transformation (per-channel scale)
    x_hat = x * rstd
    y = x_hat * w

    # Write output
    tl.store(Y + row * stride_y + cols, y, mask=mask)
187
+
188
+
189
# FusedRMSNorm backward kernel: each program handles a contiguous strip
# of `rows_per_program` rows and accumulates a partial weight gradient
# into its own row of DW; the host sums DW over programs afterwards.
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=1),
        triton.Config({}, num_warps=2),
        triton.Config({}, num_warps=4),
        triton.Config({}, num_warps=8),
        triton.Config({}, num_warps=16),
        triton.Config({}, num_warps=32),
    ],
    key=["N"],
)
@triton.jit
def _rms_norm_bwd_kernel_sm(
    X,  # forward input pointer, (M, N)
    stride_x,
    W,  # weight pointer, (N,)
    DY,  # upstream gradient pointer, (M, N)
    stride_dy,
    DX,  # output: input gradient, (M, N)
    stride_dx,
    Rstd,  # per-row reciprocal std saved by the forward pass
    DW,  # output: per-program partial weight grads, (num_programs, N)
    eps,  # NOTE(review): unused here — eps is already folded into Rstd
    M,  # num rows
    N,  # num cols
    rows_per_program,  # rows assigned to each program
    block_N: tl.constexpr,
):
    row_block_id = tl.program_id(0)
    row_start = row_block_id * rows_per_program
    cols = tl.arange(0, block_N)
    mask = cols < N

    # Load weights once; reused for every row in this program's strip
    w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)

    # Accumulate gradients for weights across the strip
    dw = tl.zeros((block_N,), dtype=tl.float32)

    row_end = min(row_start + rows_per_program, M)
    for row in range(row_start, row_end):
        # Load input, output gradient, and reciprocal standard deviation
        x = tl.load(X + row * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
        dy = tl.load(DY + row * stride_dy + cols, mask=mask, other=0.0).to(tl.float32)
        rstd = tl.load(Rstd + row)

        # Compute normalized input and gradients:
        #   dx = (w*dy - x_hat * mean(x_hat * w*dy)) * rstd
        x_hat = x * rstd
        wdy = w * dy
        dw += dy * x_hat
        c1 = tl.sum(x_hat * wdy, axis=0) / N
        dx = (wdy - x_hat * c1) * rstd

        # Store input gradient for this row
        tl.store(DX + row * stride_dx + cols, dx, mask=mask)

    # Store this program's partial weight gradient (host sums over axis 0)
    tl.store(DW + row_block_id * N + cols, dw, mask=mask)
247
+
248
+
249
class TritonFusedRMSNorm(torch.autograd.Function):
    """Autograd wrapper around the Triton RMS-norm forward/backward kernels.

    NOTE(review): `partial`, `local_map`, `Shard`, `Replicate`, `Partial`,
    and `device_module` are not defined in the visible portion of this
    file — presumably torch.distributed tensor-parallel helpers imported
    near the top; confirm against the full file.
    """

    @partial(
        local_map,
        out_placements=[Shard(1)],
        in_placements=(None, [Shard(1)], [Replicate()], None),
    )
    @staticmethod
    def forward(ctx, x, weight, eps):
        # Remember the caller's shape so the output can be restored.
        x_shape_start = x.shape

        # Flatten input to 2-D (rows, features); kernels assume unit
        # stride along the feature dimension.
        x = x.view(-1, x.shape[-1])
        if x.stride(-1) != 1:
            x = x.contiguous()
        if weight.stride(-1) != 1:
            weight = weight.contiguous()

        M, N = x.shape
        y = torch.empty_like(x)
        # Per-row reciprocal std, saved for the backward pass.
        rstd = torch.empty((M,), dtype=torch.float32, device=x.device)

        # Cap the block so one row's load stays within 64 KiB.
        max_size = 65536 // x.element_size()
        block_N = min(max_size, triton.next_power_of_2(N))

        # The kernels process a whole row per program, so N must fit.
        if N > block_N:
            raise ValueError(f"N {N} must be <= {block_N=}")

        grid = lambda meta: (M,)  # one program per row
        _rms_norm_fwd_kernel[grid](
            x,
            x.stride(0),
            y,
            y.stride(0),
            weight,
            rstd,
            eps,
            M,
            N,
            block_N,
        )

        ctx.eps = eps
        # NOTE: saves the *flattened* x; backward flattens dy to match.
        ctx.save_for_backward(x, weight, rstd)
        ctx.x_shape_start = x_shape_start

        y = y.reshape(x_shape_start)
        return y

    @partial(
        local_map,
        out_placements=([Shard(1)], [Partial()], None),
        in_placements=(None, [Shard(1)]),
    )
    @staticmethod
    def backward(ctx, dy):
        x, weight, rstd = ctx.saved_tensors
        eps = ctx.eps
        x_shape_start = ctx.x_shape_start

        # Flatten input and output gradients to 2-D, matching forward.
        dy = dy.view(-1, dy.shape[-1])
        if dy.stride(-1) != 1:
            dy = dy.contiguous()

        M, N = dy.shape
        dx = torch.empty_like(x)

        # One program per SM; each accumulates a partial weight gradient
        # into its own row of _dw, reduced below.
        sm_count = device_module.get_device_properties(x.device).multi_processor_count
        _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)

        max_size = 65536 // x.element_size()
        block_N = min(max_size, triton.next_power_of_2(N))
        rows_per_sm = math.ceil(M / sm_count)

        if N > block_N:
            raise ValueError(f"N {N} must be <= {block_N=}")

        grid = lambda meta: (sm_count,)
        _rms_norm_bwd_kernel_sm[grid](
            x,
            x.stride(0),
            weight,
            dy,
            dy.stride(0),
            dx,
            dx.stride(0),
            rstd,
            _dw,
            eps,
            M,
            N,
            rows_per_sm,
            block_N,
        )
        # Reduce the per-SM partial weight gradients and restore shape.
        dw = _dw.sum(0).to(weight.dtype)
        dx = dx.view(x_shape_start)
        # Gradients for (x, weight, eps); eps is non-differentiable.
        return dx, dw, None
346
+
347
+
348
+ # expose fusedRMSNorm as a function
349
def fused_rms_norm_fn(
    x,
    weight,
    eps=1e-6,
):
    """Functional entry point for the fused Triton RMS-norm autograd op."""
    return TritonFusedRMSNorm.apply(x, weight, eps)
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
ssm_compilable.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple
2
+ import torch
3
+
4
+ from mamba_ssm.ops.triton.ssd_combined import _mamba_chunk_scan_combined_fwd, _mamba_chunk_scan_combined_bwd
5
+
6
+
7
# NOTE(review): this compiled wrapper is not referenced by the custom ops
# below — presumably kept for standalone/benchmark use; confirm before removing.
@torch.compile(options={"triton.cudagraphs": True}, fullgraph=True)
def _compiled_mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=None):
    # Fullgraph-compiled passthrough to the Triton chunk-scan forward.
    return _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, cu_seqlens=cu_seqlens, dt_softplus=dt_softplus, dt_limit=dt_limit)
10
+
11
# NOTE(review): this compiled wrapper is not referenced by the custom ops
# below — presumably kept for standalone/benchmark use; confirm before removing.
@torch.compile(options={"triton.cudagraphs": True}, fullgraph=True)
def _compiled_mamba_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, dfinal_states=None, seq_idx=None, dt_softplus=False, dt_limit=None):
    # Fullgraph-compiled passthrough to the Triton chunk-scan backward.
    return _mamba_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=dfinal_states, seq_idx=seq_idx, dt_softplus=dt_softplus, dt_limit=dt_limit)
14
+
15
+
16
@torch.library.custom_op(
    "mamba_ssm::ssm_chunk_scan_combined_fwd",
    mutates_args=(),
    device_types="cuda",
)
def ssm_chunk_scan_combined_fwd(
    x: torch.Tensor,
    dt: torch.Tensor,
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    chunk_size: int,
    D: Optional[torch.Tensor] = None,
    z: Optional[torch.Tensor] = None,
    dt_bias: Optional[torch.Tensor] = None,
    initial_states: Optional[torch.Tensor] = None,
    seq_idx: Optional[torch.Tensor] = None,
    cu_seqlens: Optional[torch.Tensor] = None,
    dt_softplus: bool = False,
    dt_limit: Optional[List[float]] = None
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Forward chunk scan wrapped as a torch.library custom op.

    Delegates to the Triton `_mamba_chunk_scan_combined_fwd`. Because the
    op schema declares three Tensor returns, absent outputs are encoded
    as 0-element tensors instead of None:
      - out:   scan output (same shape as x)
      - out_x: pre-gating output when `z` is given, else an empty tensor
      - varlen final states when `cu_seqlens` is given, else an empty tensor

    Intermediate results (dt_out, dA_cumsum, chunk states, final_states)
    are discarded here; the backward op recomputes what it needs.
    """
    out, out_x, dt_out, dA_cumsum, states, final_states, *rest = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, cu_seqlens=cu_seqlens, dt_softplus=dt_softplus, dt_limit=dt_limit)

    # rest[0] only exists (varlen states) when cu_seqlens was passed.
    return out, out_x if out_x is not None else out.new_empty(0), rest[0] if cu_seqlens is not None else out.new_empty(0)
40
+
41
@ssm_chunk_scan_combined_fwd.register_fake
def _ssm_chunk_scan_combined_fwd_fake(
    x: torch.Tensor,
    dt: torch.Tensor,
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    chunk_size: int,
    D: Optional[torch.Tensor] = None,
    z: Optional[torch.Tensor] = None,
    dt_bias: Optional[torch.Tensor] = None,
    initial_states: Optional[torch.Tensor] = None,
    seq_idx: Optional[torch.Tensor] = None,
    cu_seqlens: Optional[torch.Tensor] = None,
    dt_softplus: bool = False,
    dt_limit: Optional[List[float]] = None
):
    """Fake (meta) implementation for tracing/compile.

    Must mirror the real op's schema exactly: three Tensors. The real op
    substitutes 0-element tensors for absent outputs, so returning None
    here would break FakeTensor propagation — use empty placeholders.
    """
    _, _, n_heads, head_dim = x.shape
    out = torch.empty_like(x)
    # out_x is only produced when gating input z is given.
    out_x = torch.empty_like(x) if z is not None else x.new_empty(0)
    if cu_seqlens is not None:
        # Varlen final states: (num_sequences, nheads, headdim, dstate).
        # dstate is the LAST dim of B (B is (batch, seqlen, ngroups, dstate));
        # the previous B.size(0) was the batch size, a wrong shape.
        varlen_states = x.new_empty(
            (cu_seqlens.size(0) - 1, n_heads, head_dim, B.size(-1))
        )
    else:
        varlen_states = x.new_empty(0)
    return out, out_x, varlen_states
64
+
65
@torch.library.custom_op(
    "mamba_ssm::ssm_chunk_scan_combined_bwd",
    mutates_args=(),
    device_types="cuda",
)
def ssm_chunk_scan_combined_bwd(
    dout: torch.Tensor,
    x: torch.Tensor,
    dt: torch.Tensor,
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    out: torch.Tensor,
    chunk_size: int,
    D: Optional[torch.Tensor] = None,
    z: Optional[torch.Tensor] = None,
    dt_bias: Optional[torch.Tensor] = None,
    initial_states: Optional[torch.Tensor] = None,
    seq_idx: Optional[torch.Tensor] = None,
    dt_softplus: bool = False,
    dt_limit: Optional[List[float]] = None
)-> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Backward chunk scan wrapped as a torch.library custom op.

    Delegates to the Triton `_mamba_chunk_scan_combined_bwd`. The schema
    declares nine Tensor returns, so gradients that the kernel reports as
    None (absent optional inputs) are encoded as 0-element tensors.
    dfinal_states is fixed to None — gradients w.r.t. final states are
    not propagated through this op.
    """
    dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = _mamba_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, dfinal_states=None, seq_idx=seq_idx, dt_softplus=dt_softplus, dt_limit=dt_limit)
    return (
        dx,
        ddt,
        dA,
        dB,
        dC,
        dD if dD is not None else dx.new_empty(0),
        dz if dz is not None else dx.new_empty(0),
        ddt_bias if ddt_bias is not None else dx.new_empty(0),
        dinitial_states if dinitial_states is not None else dx.new_empty(0)
    )
99
+
100
@ssm_chunk_scan_combined_bwd.register_fake
def _ssm_chunk_scan_combined_bwd_fake(
    dout: torch.Tensor,
    x: torch.Tensor,
    dt: torch.Tensor,
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    out: torch.Tensor,
    chunk_size: int,
    D: Optional[torch.Tensor] = None,
    z: Optional[torch.Tensor] = None,
    dt_bias: Optional[torch.Tensor] = None,
    initial_states: Optional[torch.Tensor] = None,
    seq_idx: Optional[torch.Tensor] = None,
    dt_softplus: bool = False,
    dt_limit: Optional[List[float]] = None
):
    """Fake (meta) implementation for the backward custom op.

    Must match the real op's declared schema of nine Tensors: the real
    op returns 0-element placeholders (``dx.new_empty(0)``) for absent
    optional gradients, never None, so the fake does the same — the
    previous Nones here diverged from the schema and would break
    FakeTensor propagation under torch.compile.
    """
    dx = torch.empty_like(x)
    return (
        dx,
        torch.empty_like(dt),
        torch.empty_like(A),
        torch.empty_like(B),
        torch.empty_like(C),
        torch.empty_like(D) if D is not None else dx.new_empty(0),
        torch.empty_like(z) if z is not None else dx.new_empty(0),
        torch.empty_like(dt_bias) if dt_bias is not None else dx.new_empty(0),
        torch.empty_like(initial_states) if initial_states is not None else dx.new_empty(0),
    )
129
+
130
+
131
def ssm_chunk_scan_combined_setup_context(ctx, inputs, output):
    """Stash forward tensors and hyper-parameters for the backward bridge.

    ``inputs`` mirrors the forward op's 14 positional arguments and
    ``output`` its three results; everything the backward kernel needs is
    saved on ``ctx``.
    """
    (x, dt, A, B, C, chunk_size, D, z, dt_bias,
     initial_states, seq_idx, cu_seqlens, dt_softplus, dt_limit) = inputs
    out, out_x, state_varlen = output

    # When gating input z is present, backward needs the pre-gating
    # output (out_x) rather than the final gated output.
    saved_out = out_x if z is not None else out
    ctx.save_for_backward(saved_out, x, dt, A, B, C, D, z, dt_bias,
                          initial_states, seq_idx)
    ctx.chunk_size = chunk_size
    ctx.dt_softplus = dt_softplus
    ctx.dt_limit = dt_limit
139
+
140
def ssm_chunk_scan_combined_bridge(ctx, dout, dout_x, dout_state_varlen):
    """Autograd backward bridge for the forward custom op.

    Receives one upstream gradient per forward output; only ``dout`` is
    consumed — gradients w.r.t. out_x and the varlen states are ignored
    by the underlying backward. Returns one gradient (or None) for each
    of the 14 forward inputs, in order.
    """
    out, x, dt, A, B, C, D, z, dt_bias, initial_states, seq_idx = ctx.saved_tensors

    dx, ddt, dA, dB, dC, dD, dz, ddt_bias, dinitial_states = ssm_chunk_scan_combined_bwd(dout, x, dt, A, B, C, out, ctx.chunk_size, D=D, z=z, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=ctx.dt_softplus, dt_limit=ctx.dt_limit)

    # Positions mirror the forward signature:
    # (x, dt, A, B, C, chunk_size, D, z, dt_bias, initial_states,
    #  seq_idx, cu_seqlens, dt_softplus, dt_limit)
    return (
        dx,
        ddt,
        dA,
        dB,
        dC,
        None,  # chunk_size: int, non-differentiable
        dD if D is not None else None,
        dz if z is not None else None,
        ddt_bias if dt_bias is not None else None,
        dinitial_states if initial_states is not None else None,
        None,  # seq_idx
        None,  # cu_seqlens
        None,  # dt_softplus
        None,  # dt_limit
    )
161
+
162
# Register the custom autograd bridge so the forward custom op is
# differentiable (including under torch.compile): setup_context saves
# tensors during forward, the bridge maps upstream grads to input grads.
torch.library.register_autograd(
    "mamba_ssm::ssm_chunk_scan_combined_fwd",
    ssm_chunk_scan_combined_bridge,
    setup_context=ssm_chunk_scan_combined_setup_context,
)
168
+
169
def mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=None, z=None, dt_bias=None, initial_states=None, seq_idx=None, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf"))):
    """
    Chunked selective-state-space scan, routed through the
    compile-friendly custom op.

    Argument:
        x: (batch, seqlen, nheads, headdim)
        dt: (batch, seqlen, nheads)
        A: (nheads)
        B: (batch, seqlen, ngroups, dstate)
        C: (batch, seqlen, ngroups, dstate)
        chunk_size: int
        D: (nheads, headdim) or (nheads,)
        z: (batch, seqlen, nheads, headdim)
        dt_bias: (nheads,)
        initial_states: (batch, nheads, headdim, dstate)
        seq_idx: (batch, seqlen)
        cu_seqlens: (num_sequences + 1) or None
        dt_softplus: Whether to apply softplus to dt
    Return:
        out: (batch, seqlen, nheads, headdim); when cu_seqlens is given,
        a (out, varlen_states) pair is returned instead.
    """
    # The custom op always yields three tensors; the middle one (out_x)
    # is only meaningful internally and is discarded here.
    out, _, varlen_states = ssm_chunk_scan_combined_fwd(
        x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias,
        initial_states=initial_states, seq_idx=seq_idx, cu_seqlens=cu_seqlens,
        dt_softplus=dt_softplus, dt_limit=dt_limit,
    )
    return (out, varlen_states) if cu_seqlens is not None else out
193
+
194
if __name__ == "__main__":
    # Smoke test comparing the eager custom-op path, the torch.compile'd
    # path, and the reference mamba_ssm implementation.
    # Requires a CUDA device and the mamba_ssm package.
    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined as mamba_chunk_scan_combined_ref

    torch.manual_seed(0)
    torch.cuda.manual_seed(0)

    # Tiny random problem: batch=2, seqlen=3, nheads=4, headdim=5.
    x = torch.randn(2, 3, 4, 5).cuda()
    dt = torch.randn(2, 3, 4).cuda()
    A = torch.randn(4).cuda()
    B = torch.randn(2, 3, 4, 5).cuda()
    C = torch.randn(2, 3, 4, 5).cuda()
    chunk_size = 2
    D = torch.randn(4, 5).cuda()
    z = torch.randn(2, 3, 4, 5).cuda()
    dt_bias = torch.randn(4).cuda()

    out = mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias)

    # NOTE(review): only summary stats are printed for manual comparison;
    # consider torch.testing.assert_close between the three outputs.
    print(out.min(), out.max(), out.mean(), out.std())

    compiled_mamba_chunk_scan_combined = torch.compile(mamba_chunk_scan_combined)
    out = compiled_mamba_chunk_scan_combined(x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias)

    print(out.min(), out.max(), out.mean(), out.std())

    out_ref = mamba_chunk_scan_combined_ref(x, dt, A, B, C, chunk_size, D=D, z=z, dt_bias=dt_bias)

    print(out_ref.min(), out_ref.max(), out_ref.mean(), out_ref.std())
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "199999": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "200018": {
13
+ "content": "<|endofprompt|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|endoftext|>",
22
+ "clean_up_tokenization_spaces": false,
23
+ "eos_token": "<|endoftext|>",
24
+ "model_max_length": 128000,
25
+ "tokenizer_class": "GPT2Tokenizer",
26
+ "unk_token": "<|endoftext|>"
27
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff