Commit 83c4388
Parent(s): be761d6

fix typo

Files changed:
- __init__.py (+1 -1)
- __pycache__/__init__.cpython-312.pyc (binary, changed)
- __pycache__/causal_conv1d_compilable.cpython-312.pyc (binary, added)
- __pycache__/configuration_minimamba.cpython-312.pyc (binary, added)
- __pycache__/model.cpython-312.pyc (binary, added)
- __pycache__/modeling_minimamba.cpython-312.pyc (binary, added)
- __pycache__/norms.cpython-312.pyc (binary, added)
- __pycache__/ssm_compilable.cpython-312.pyc (binary, added)
- modeling_minimamba.py (+117 -1011)
__init__.py CHANGED
@@ -1,2 +1,2 @@
 from .configuration_minimamba import MiniMambaConfig
-from .modeling_minimamba import
+from .modeling_minimamba import MiniMamba
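With the truncated import completed, the package's `__init__.py` exports both the config and the model class. A minimal sanity check of the fix — note that `minimamba` here is a hypothetical package name standing in for this repository's actual directory name:

    # Hypothetical import path; substitute the real package/directory name.
    from minimamba import MiniMambaConfig, MiniMamba  # both symbols resolve after this commit

Before the fix, the dangling `from .modeling_minimamba import` was a SyntaxError, so importing the package failed outright.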
__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/__pycache__/__init__.cpython-312.pyc and b/__pycache__/__init__.cpython-312.pyc differ

__pycache__/causal_conv1d_compilable.cpython-312.pyc ADDED
Binary file (10.7 kB)

__pycache__/configuration_minimamba.cpython-312.pyc ADDED
Binary file (3.85 kB)

__pycache__/model.cpython-312.pyc ADDED
Binary file (39 kB)

__pycache__/modeling_minimamba.cpython-312.pyc ADDED
Binary file (7.94 kB)

__pycache__/norms.cpython-312.pyc ADDED
Binary file (14.6 kB)

__pycache__/ssm_compilable.cpython-312.pyc ADDED
Binary file (12.3 kB)
modeling_minimamba.py CHANGED
@@ -1,3 +1,4 @@
+import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,118 +7,84 @@ from transformers import PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutput

 from .configuration_minimamba import MiniMambaConfig
-from enum import Enum
-from dataclasses import dataclass, field
+from .model import Mamba2, Mamba2Config


-from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states
-from mamba_ssm.ops.triton.selective_state_update import selective_state_update
-from mamba_ssm.ops.triton.ssd_combined import mamba_split_conv1d_scan_combined
-
-from .causal_conv1d_compilable import causal_conv1d_fn, causal_conv1d_update
-from .ssm_compilable import mamba_chunk_scan_combined
-from .norms import build_norm
-
-
-class InitStdFactor(Enum):
-    DISABLED = "disabled"  # Init std is divided by 1.0
-    GLOBAL_DEPTH = "global_depth"  # Init std is divided by sqrt(2*num_layers)
-    CURRENT_DEPTH = "current_depth"  # Init std is divided by sqrt(2*depth)
-    DIM_RATIO = "dim_ratio"  # Init std is divided by model_dim/4096
-
-@dataclass
-class InitConfig:
-    dt_max: float = 0.1
-    dt_min: float = 0.001
-
-    dt_init_floor: float = 1e-4
-
-    A_init_min: float = 1
-    A_init_max: float = 16
-
-
-DEFAULT_INIT_CONFIG = InitConfig()

+class MiniMamba(PreTrainedModel):
+    """
+    A Hugging Face–style wrapper around a Mamba2 model, providing:
+      • forward(...) returning a CausalLMOutput
+      • support for HF training loops
+      • a naive generate(...) method with top-k/top-p sampling
+    """
+    config_class = MiniMambaConfig  # Tells HF which config class to use

-
-
+    def __init__(self, config: MiniMambaConfig) -> None:
+        """
+        Initialize the MiniMamba model, bridging Mamba2 with HF's PreTrainedModel.
+        """
+        super().__init__(config)

-
-
-
-
-
-
-
+        # If your config includes Mamba2-like parameters, you can build a Mamba2Config from it:
+        mamba2_args = Mamba2Config(
+            vocab_size=config.vocab_size,
+            num_layers=config.n_layers,
+            dim=config.n_embd,
+            use_mem_eff_path=True,
+            weight_tying=config.weight_tying if hasattr(config, "weight_tying") else False,
+            torch_dtype=getattr(torch, config.torch_dtype) if isinstance(config.torch_dtype, str) else config.torch_dtype,
+        )
+
+        # Internally hold a Mamba2 model
+        self.mamba = Mamba2(config=mamba2_args)
+
+        # Because HF wants the final linear to be part of this top-level model,
+        # you *can* rely on Mamba2’s built-in embedding + output if you prefer.
+        # Mamba2 already has self.tok_emb and self.output.
+        # So we typically do NOT need a separate embedding or lm_head here.
+        #
+        # We only do so if we want the “HF standard” tie-weights approach:
+        #   self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
+        #   self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        #   self.lm_head.weight = self.tok_emb.weight
+        #
+        # But Mamba2 does that internally if config.weight_tying == True.
+
+        # This is optional: store any device or dtype you might want
+        self.device_ = torch.device(config.device)
+        if isinstance(config.torch_dtype, str):
+            self.dtype_ = getattr(torch, config.torch_dtype)
         else:
-
-
-        device = torch.device(config.device)
-
-        self.phi = get_spectral_filters(
-            config.seq_len,
-            config.num_eigh,
-            config.use_hankel_L,
-            device=device,
-            dtype=torch_dtype,
-        )
+            self.dtype_ = config.torch_dtype

-        self.
-        self.use_hankel_L = config.use_hankel_L
-
-        self.tok_emb = nn.Embedding(
-            config.vocab_size, config.n_embd, dtype=torch_dtype, device=device
-        )
-        self.dropout = nn.Dropout(config.dropout)
-
-        self.layers = nn.ModuleList()
-        for layer_idx in range(self.n_layers):
-            if layer_idx % 2 == 0:
-                self.layers.append(STULayer(config, self.phi, self.n))
-            else:
-                self.layers.append(
-                    AttentionLayer(config)
-                    if config.use_attn
-                    else STULayer(config, self.phi, self.n)
-                )
-
-        self.norm = TritonNorm(config.n_embd) if triton_norm else RMSNorm(config.n_embd)
-
-        self.lm_head = nn.Linear(
-            config.n_embd, config.vocab_size, bias=config.bias, dtype=torch_dtype, device=device
-        )
-        self.tok_emb.weight = self.lm_head.weight
-
-        self.std = (config.n_embd) ** -0.5
+        # Parameter initialization (HF calls them with self._init_weights in some flows).
         self.apply(self._init_weights)
-
+
+        print("MiniMamba Model Parameter Count: %.2fM\n" % (self._get_num_params() / 1e6,))

     def forward(
         self,
-        input_ids: torch.
-        labels: torch.
+        input_ids: torch.LongTensor,
+        labels: torch.LongTensor = None,
         **kwargs
     ) -> CausalLMOutput:
-
-
-
-
-        #
-
-
-
-        # Normalize and project to vocabulary
-        x = self.norm(x)
-        logits = self.lm_head(x)
+        """
+        Forward pass for causal language modeling.
+        Returns a CausalLMOutput that includes loss (if labels is provided) and logits.
+        """
+        # Mamba2's forward expects (x: torch.Tensor, target: torch.Tensor|None, ...)
+        # but we only need the logits from the simple call:
+        logits = self.mamba(input_ids)  # shape: [batch, seq_len, vocab_size]

         loss = None
         if labels is not None:
-            #
+            # By default, huggingface GPT-like models shift the logits by one
             shift_logits = logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
             loss_fct = nn.CrossEntropyLoss()
             loss = loss_fct(
-                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_logits.view(-1, shift_logits.size(-1)),
                 shift_labels.view(-1)
             )

@@ -126,73 +93,7 @@ class MiniSTU(PreTrainedModel):
             logits=logits,
         )

-
-        n_params = sum(p.numel() for p in self.parameters())
-        if hasattr(self, "pos_emb") and self.pos_emb is not None:
-            n_params -= self.pos_emb.weight.numel()
-        if self.tok_emb.weight is not self.lm_head.weight:
-            n_params -= self.tok_emb.weight.numel()
-        return n_params
-
-    def _init_weights(self, module):
-        if isinstance(module, nn.Linear):
-            if hasattr(module, "SCALE_INIT"):
-                self.std *= (2 * self.n_layers) ** -0.5
-            torch.nn.init.normal_(module.weight, mean=0.0, std=self.std)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            torch.nn.init.normal_(module.weight, mean=0.0, std=self.std)
-        elif isinstance(module, STU):
-            if self.use_approx:
-                torch.nn.init.xavier_normal_(module.M_inputs)
-                torch.nn.init.xavier_normal_(module.M_filters)
-            else:
-                torch.nn.init.xavier_normal_(module.M_phi_plus)
-                if not self.use_hankel_L:
-                    torch.nn.init.xavier_normal_(module.M_phi_minus)
-        elif isinstance(module, Attention):
-            torch.nn.init.xavier_normal_(module.c_attn.weight)
-            torch.nn.init.xavier_normal_(module.c_proj.weight)
-            if module.c_attn.bias is not None:
-                torch.nn.init.zeros_(module.c_attn.bias)
-            if module.c_proj.bias is not None:
-                torch.nn.init.zeros_(module.c_proj.bias)
-    @staticmethod
-    def top_k_top_p_filtering(
-        logits: torch.Tensor,
-        top_k: int = 50,
-        top_p: float = 0.95,
-        filter_value: float = float("-inf"),
-    ):
-        """
-        Filters a distribution of logits using top-k and/or nucleus (top-p) filtering.
-        """
-        # top_k
-        if top_k > 0:
-            top_k = min(top_k, logits.size(-1))
-            # Remove all logits that are not in the top k
-            indices_to_remove = logits < torch.topk(logits, top_k, dim=-1).values[:, -1, None]
-            logits[indices_to_remove] = filter_value
-
-        # top_p (nucleus)
-        if 0 < top_p < 1.0:
-            sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
-            # Remove tokens with cumulative probability above the threshold
-            sorted_indices_to_remove = cumulative_probs > top_p
-            # Shift the indices to the right to keep also the first token above the threshold
-            sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
-            sorted_indices_to_remove[:, 0] = False
-
-            indices_to_remove = sorted_indices_to_remove.scatter(
-                dim=1, index=sorted_indices, src=sorted_indices_to_remove
-            )
-            logits[indices_to_remove] = filter_value
-
-        return logits
-
+    @torch.no_grad()
     def generate(
         self,
         input_ids: torch.LongTensor,
@@ -205,27 +106,9 @@ class MiniSTU(PreTrainedModel):
         **kwargs
     ):
         """
-
-
-        Args:
-            input_ids (torch.LongTensor): shape (batch_size, sequence_length).
-            max_new_tokens (int): max number of tokens to generate (beyond input_ids length).
-            temperature (float): sampling temperature (>=0).
-            top_k (int): Top-K sampling cutoff.
-            top_p (float): Nucleus sampling cutoff.
-            eos_token_id (int): If set, stop generation when this token is produced.
-            pad_token_id (int): If set, can be used to pad sequences. (Not fully used here.)
-            kwargs: Unused arguments (like num_beams) for compatibility.
-
-        Returns:
-            torch.LongTensor: shape (batch_size, sequence_length + generated_tokens).
+        A naive token-by-token generation loop (greedy + top-k/top-p + temperature).
         """
-
-        print("1=====================")
-        print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
-        print("1=====================")
-
-        # We'll accumulate new tokens into generated_ids
+        # We'll accumulate new tokens in generated_ids
         generated_ids = input_ids.clone()

         for _ in range(max_new_tokens):
@@ -233,857 +116,80 @@ class MiniSTU(PreTrainedModel):
             outputs = self.forward(generated_ids)
             logits = outputs.logits[:, -1, :]  # shape: (batch_size, vocab_size)

-            # Scale
+            # Scale by temperature
             if temperature != 1.0:
                 logits = logits / temperature

-            # Filter
+            # Filter
             logits = self.top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

-            #
-
-
-            # Sample from the distribution
-            next_token = torch.multinomial(probabilities, num_samples=1)  # (batch_size, 1)
+            # Sample next token
+            probs = F.softmax(logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)  # shape: (batch, 1)

-            # Append
+            # Append
             generated_ids = torch.cat([generated_ids, next_token], dim=1)

-            # If
-            if eos_token_id is not None:
-
-                # or if you want to do a more fine-grained approach
-                if (next_token == eos_token_id).all():
-                    break
-        print("2=====================")
-        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
-        print("2=====================")
-        return generated_ids
+            # If we have an EOS token, we can break early if all sequences have ended
+            if eos_token_id is not None and (next_token == eos_token_id).all():
+                break
-
-
-@dataclass
-class BaseMambaConfig:
-    """
-    Configuration for the Mamba family of models.
-    """
-    dim: int = 512
-    num_layers: int = 8
-    num_heads: int = 8
-
-    state_dim: int = 128
-    num_groups: int = 1
-    conv_size: int | None = 4
-
-    bias: bool = False  # Linear bias
-    conv_bias: bool = True  # Convolutional bias
-    dt_bias: bool = False
-    D_has_head_dim: bool = False
-    learnable_init_states: bool = False
-
-    ffn_dim_multiplier: float = 2.0
-    multiple_of: int = 256  # Enforce that MLP hidden layer size is multiple of a large power of 2
-
-    norm_eps: float = 1e-6
-    norm_type: str = "rmsnorm"
-
-    # CUDA-related items
-    ssm_chunk_size: int = 256
-    use_mem_eff_path: bool = False
-
-    # Initialization-related items
-    init_use_depth: bool = False
-    init_base_std: float | None = None
-    init_std_factor: str = "disabled"  # e.g. "global_depth"
-    init_config: InitConfig = field(default_factory=InitConfig)
-
-
-class SSM(nn.Module):
-    """
-    State Space Model (SSM) implementation with selective state updates and convolution.
-
-    Implements the core SSM computation with support for both training and inference modes.
-    During inference, uses cached states for efficient token-by-token generation.
-    """
-    def __init__(self, config: BaseMambaConfig) -> None:
-        """Initialize SSM parameters and layers.
-        Args:
-            config: Configuration containing model hyperparameters
-        """
-        super().__init__()
-        self.config = config
-        vars(self).update(vars(config))
-
-        assert self.dim > 0, "Model dimension (config.dim) must be positive"
-        assert self.num_heads > 0, "Number of heads (config.num_heads) must be positive"
-        assert self.state_dim > 0, "State dimension (config.state_dim) must be positive"
-
-        if self.ffn_dim_multiplier is None:
-            raise ValueError(
-                "ffn_dim_multiplier must be set to a valid float (e.g. 2.0) "
-                "to determine hidden_dim in SSM."
-            )
-        assert self.ffn_dim_multiplier > 0, "ffn_dim_multiplier must be > 0"
-
-        self.hidden_dim = int(self.ffn_dim_multiplier * self.dim)
-        self.hidden_dim = config.multiple_of * (  # Round up to multiple_of
-            (self.hidden_dim + self.multiple_of - 1) // self.multiple_of
-        )
-
-        assert self.hidden_dim % self.num_heads == 0, (
-            f"Hidden dim {self.hidden_dim} not divisible by num_heads={self.num_heads}."
-        )
-
-        self.head_dim = self.hidden_dim // self.num_heads
-
-        self.dt_limit_kwargs = {}
-        dt_limit = (self.init_config.dt_min, self.init_config.dt_max)
-        if dt_limit != (0.0, float("inf")):
-            self.dt_limit_kwargs = dict(dt_limit=dt_limit)
-
-        # Order: [z, x, B, C, dt]
-        d_input = (
-            2 * self.hidden_dim
-            + 2 * self.num_groups * self.state_dim
-            + self.num_heads
-        )
-
-        self.input = nn.Linear(self.dim, d_input, bias=self.bias)
-
-        # Only create Conv1d if self.conv_size is specified
-        if self.conv_size is not None:
-            conv_dim = self.hidden_dim + 2 * self.num_groups * self.state_dim
-
-            # Depthwise-ish conv (groups = out_channels)
-            # TODO: Check that this is used if causal_conv1d_fn and causal_conv1d_update cannot be imported
-            self.conv1d = nn.Conv1d(
-                in_channels=conv_dim,
-                out_channels=conv_dim,
-                kernel_size=self.conv_size,
-                groups=conv_dim,
-                bias=self.conv_bias,  # <- This is a boolean in your config, so pass that or True/False
-                padding=self.conv_size - 1  # for "causal" style
-            )
-
-        if config.dt_bias:
-            self.dt_bias = nn.Parameter(torch.empty(self.num_heads))
-        else:
-            self.dt_bias = nn.Parameter(torch.zeros(self.num_heads), requires_grad=False)
-
-        self.A_log = nn.Parameter(torch.empty(self.num_heads))
-
-        if config.D_has_head_dim:
-            self.D = nn.Parameter(torch.ones(self.num_heads, self.head_dim))
-        else:
-            self.D = nn.Parameter(torch.ones(self.num_heads))
-
-        if self.learnable_init_states:
-            self.init_states = nn.Parameter(torch.zeros(self.num_heads, self.head_dim, self.state_dim))
-
-        # Can also just use nn.RMSNorm
-        self.norm = build_norm(config.norm_type, dim=self.hidden_dim, eps=self.norm_eps)
-
-        self.output = nn.Linear(self.hidden_dim, self.dim, bias=self.bias)
-
-    def _causal_conv(
-        self,
-        zxbcdt: torch.Tensor,
-        tok_idx: torch.Tensor | None = None,
-        cu_seqlens: torch.Tensor | None = None,
-        ssm_impl: str = "ssm"
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        # TODO: Make slightly less verbose
-        """Processes input through causal convolution path, handling both full sequence and incremental cases.
-
-        This function implements two processing modes:
-        1. Full sequence ("ssm"): Used during training and initial prompt processing.
-        2. Incremental ("ssm_update"): Used during token-by-token generation.

-
-            zxbcdt: Input tensor containing concatenated [z, x, B, C, dt] components
-            tok_idx: Token indices for sequence processing. Required for "ssm" mode.
-                Defaults to None.
-            cu_seqlens: Cumulative sequence lengths for variable length processing.
-                Used only in "ssm" mode with caching. Defaults to None.
-            ssm_impl: Implementation mode, either "ssm" for full sequence processing
-                or "ssm_update" for incremental generation. Defaults to "ssm".
-
-        Returns:
-            tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-                Tuple containing separated components (z, x, B, C, dt), where:
-                - z: Gating branch
-                - x: Main branch
-                - B, C: SSM state matrices (analogous to K, Q in attention)
-                - dt: Time delta values
-
-        Notes:
-            - When using "ssm" mode during inference, a cache should be pre-initialized
-              externally. This design allows for flexible caching strategies without
-              modifying model code.
-            - The "ssm_update" mode requires a cache to exist and will use it for
-              incremental state updates during generation.
-            - B, C components correspond to Key, Query in the SSM/attention duality.
-        """
-        # Split input into components
-        z, xBC, dt = torch.split(
-            zxbcdt,
-            [
-                self.hidden_dim,
-                self.hidden_dim + 2 * self.num_groups * self.state_dim,
-                self.num_heads,
-            ],
-            dim=-1,
-        )
-
-        if ssm_impl == "ssm":
-            if hasattr(self, "cache"):
-                conv_varlen_states = causal_conv1d_varlen_states(
-                    xBC.squeeze(0),
-                    cu_seqlens,
-                    state_len=self.cache.conv_cache.shape[-1],
-                )
-                self.cache.conv_cache.copy_(conv_varlen_states)
-
-            xBC = causal_conv1d_fn(
-                x=xBC.transpose(1, 2),
-                weight=self.conv1d.weight.squeeze(1),
-                bias=self.conv1d.bias,
-                activation="silu",
-                seq_idx=tok_idx,
-            ).transpose(1, 2)
-        elif ssm_impl == "ssm_update":
-            xBC = causal_conv1d_update(
-                x=xBC.squeeze(0),
-                conv_state=self.cache.conv_cache,
-                weight=self.conv1d.weight.squeeze(1),
-                bias=self.conv1d.bias,
-                activation="silu",
-            ).unsqueeze(0)
-        else:
-            raise NotImplementedError(f"SSM implementation {ssm_impl} not supported")
-
-        # Split processed tensor into components
-        x, B, C = torch.split(
-            xBC,
-            [
-                self.hidden_dim,
-                self.num_groups * self.state_dim,
-                self.num_groups * self.state_dim,
-            ],
-            dim=-1,
-        )
-
-        return z, x, B, C, dt
-
-    def _non_causal_conv(self, zxbcdt: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        z, x, B, C, dt = torch.split(
-            zxbcdt,
-            [
-                self.hidden_dim,
-                self.hidden_dim,
-                self.num_groups * self.state_dim,
-                self.num_groups * self.state_dim,
-                self.num_heads,
-            ],
-            dim=-1,
-        )
-        return z, x, B, C, dt
-
-    def _fwd(self, x, dt, A, B, C, tok_idx, cu_seqlens, initial_states):
-        """
-        For training
-
-        Returns:
-            (bsz, seq_len, num_heads, head_dim)
-        """
-        y = mamba_chunk_scan_combined(
-            x,
-            dt,
-            A,
-            B,
-            C,
-            dt_bias=self.dt_bias,
-            dt_softplus=True,
-            chunk_size=self.ssm_chunk_size,
-            D=self.D,
-            z=None,
-            seq_idx=tok_idx,
-            cu_seqlens=cu_seqlens,
-            initial_states=initial_states,
-            **self.dt_limit_kwargs,
-        )
-
-        if hasattr(self, "cache"):
-            y, varlen_states = y
-            self.cache.state_cache.copy_(varlen_states)

+        return generated_ids
-
-
-
+    @staticmethod
+    def top_k_top_p_filtering(
+        logits: torch.Tensor,
+        top_k: int = 50,
+        top_p: float = 0.95,
+        filter_value: float = float("-inf"),
+    ):
         """
-
+        Filters logits using top-k and/or nucleus (top-p) filtering.
         """
+        # top_k
+        if top_k > 0:
+            top_k = min(top_k, logits.size(-1))
+            indices_to_remove = logits < torch.topk(logits, top_k, dim=-1).values[:, -1, None]
+            logits[indices_to_remove] = filter_value
-
-
-
-
-
-        D = D.unsqueeze(1).expand(self.num_heads, self.head_dim)
-        B, C = B.squeeze(0), C.squeeze(0)
-        y = selective_state_update(
-            self.cache.state_cache,
-            x,
-            dt,
-            A,
-            B,
-            C,
-            D,
-            z=None,
-            dt_bias=(
-                torch.zeros(self.num_heads, self.head_dim).to(x)
-                if self.dt_bias is None
-                else self.dt_bias.unsqueeze(1).expand(self.num_heads, self.head_dim)
-            ),
-            dt_softplus=True,
-        ).unsqueeze(0)
-
-        return y
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        tok_idx: torch.Tensor | None = None,
-        cu_seqlens: torch.Tensor | None = None,
-        ssm_impl: str = "ssm",
-    ) -> torch.Tensor:
-        bsz, seq_len, _ = x.shape
-
-        zxbcdt = self.input(x)
-
-        A = -torch.exp(self.A_log.float())
-        initial_states = (
-            self.init_states.expand(bsz, -1, -1, -1)
-            if self.learnable_init_states else None
-        )
-
-        # Causal conv path
-        if self.conv_size is not None:
-
-            # Memory-efficient Triton kernel path
-            if self.use_mem_eff_path:
-                out = mamba_split_conv1d_scan_combined(
-                    zxbcdt,
-                    self.conv1d.weight.squeeze(1),
-                    self.conv1d.bias,
-                    self.dt_bias,
-                    A,
-                    D=self.D,
-                    chunk_size=self.ssm_chunk_size,
-                    seq_idx=tok_idx,
-                    activation="silu",
-                    rmsnorm_weight=self.norm.weight,
-                    rmsnorm_eps=self.norm.eps,
-                    outproj_weight=self.output.weight,
-                    outproj_bias=self.output.bias,
-                    headdim=self.head_dim,
-                    ngroups=self.num_groups,
-                    norm_before_gate=False,  # Post-norm, y = self.norm(y * F.silu(z))
-                    initial_states=initial_states,
-                    **self.dt_limit_kwargs,
-                )
-                return out
-            else:
-                # CUDA kernel path
-                z, x, B, C, dt = self._causal_conv(zxbcdt)
-        else:
-            # Non-causal conv path
-            z, x, B, C, dt = self._non_causal_conv(zxbcdt)
-
-        x = x.view(bsz, seq_len, self.num_heads, self.head_dim)
-        B = B.view(bsz, seq_len, self.num_groups, self.state_dim)
-        C = C.view(bsz, seq_len, self.num_groups, self.state_dim)
-
-        # Chunked SSM scan
-        if ssm_impl == "ssm":
-            # (bsz, seq_len, num_heads, head_dim)
-            y = self._fwd(x, dt, A, B, C, tok_idx, cu_seqlens, initial_states)
-        elif ssm_impl == "ssm_update":
-            y = self._step(x, seq_len, dt, A, B, C)
-        else:
-            raise NotImplementedError(f"SSM implementation {ssm_impl} not supported")
-
-        y = y.view(bsz, seq_len, self.hidden_dim)
-
-        # Could be different activation function, including None.
-        # Mamba people post_norm here also (sometimes norm(z)*y or norm(z*y))
-        # y = self.norm(y) * F.silu(z)
-        y = self.norm(y * F.silu(z))
-        out = self.output(y)
-
-        return out
-
-    @torch.inference_mode()
-    def reset_parameters(self, init_std, factor) -> None:
-        config = self.config
-        init_config = config.init_config
-        if init_config is None:
-            init_config = DEFAULT_INIT_CONFIG
-
-        # Linear layers
-        in_init_std = init_std or (self.dim ** (-0.5))
-        out_init_std = init_std or (self.hidden_dim ** (-0.5))
-        out_init_std = out_init_std / factor
-
-        nn.init.trunc_normal_(
-            self.input.weight,
-            mean=0.0,
-            std=in_init_std,
-            a=-3 * in_init_std,
-            b=3 * in_init_std,
-        )
-
-        nn.init.trunc_normal_(
-            self.output.weight,
-            mean=0.0,
-            std=out_init_std,
-            a=-3 * out_init_std,
-            b=3 * out_init_std,
-        )
-
-        # SSM
-        if self.dt_bias is not None and self.dt_bias.requires_grad:
-            log_dt_min = math.log(init_config.dt_min)
-            log_dt_max = math.log(init_config.dt_max)
-
-            # Sample log_dt ~ Uniform[log_dt_min, log_dt_max]
-            log_dt = torch.rand(self.num_heads, device=self.dt_bias.device) * (log_dt_max - log_dt_min) + log_dt_min
-            dt = torch.exp(log_dt)
-            dt = torch.clamp(dt, min=init_config.dt_init_floor)
-
-            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
-            inv_dt = dt + torch.log(-torch.expm1(-dt))
-            self.dt_bias.copy_(inv_dt)
-
-        elif self.dt_bias is not None:
-            # If dt_bias is not trainable, we can just keep it zero or set to any constant
-            self.dt_bias.fill_(0.0)
-
-        # Convolution
-        if self.conv_size is not None:
-            conv_std = init_std or (self.conv_size ** (-0.5))
-            nn.init.trunc_normal_(
-                self.conv1d.weight,
-                mean=0.0,
-                std=conv_std,
-                a=-3 * conv_std,
-                b=3 * conv_std,
-            )
-            if self.conv1d.bias is not None:
-                nn.init.zeros_(self.conv1d.bias)
-
-        # Learnable init states
-        if self.learnable_init_states:
-            self.init_states.zero_()
-
-        # Initialize A_log ~ log( Uniform(A_init_min, A_init_max) )
-        self.A_log.uniform_(init_config.A_init_min, init_config.A_init_max)
-        self.A_log.log_()
-
-        if self.D is not None:
-            self.D.data.fill_(1.0)
-
-        # Reset norm parameters
-        self.norm.reset_parameters()
-
-
-class MambaBlock(nn.Module):
-    def __init__(self, config: BaseMambaConfig):
-        super().__init__()
-        self.norm = build_norm(config.norm_type, dim=config.dim, eps=config.norm_eps)
-        self.ssm = SSM(config)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        tok_idx: torch.Tensor | None,
-        cu_seqlens: torch.Tensor | None,
-        ssm_impl: str = "ssm",
-    ) -> torch.Tensor:
-        x = x + self.ssm(self.norm(x), tok_idx=tok_idx, cu_seqlens=cu_seqlens, ssm_impl=ssm_impl)
-        return x
-
-    @torch.inference_mode()
-    def init_weights(self, init_std=None, factor=1.0):
-        self.norm.reset_parameters()
-        self.ssm.reset_parameters(init_std, factor)
-
-
-class BaseMamba(nn.Module):
-    def __init__(self, config: BaseMambaConfig):
-        super().__init__()
-        self.model_dim = config.dim
-        self.init_base_std = config.init_base_std
-
-        self.init_config = config.init_config
-        self.init_std_factor = InitStdFactor(config.init_std_factor)
-
-        self.layers = nn.ModuleList()
-        for _ in range(config.num_layers):
-            self.layers.append(MambaBlock(config))
-
-    def forward(
-        self,
-        h: torch.Tensor,
-        tok_idx: torch.Tensor | None,
-        cu_seqlens: torch.Tensor | None,
-        ssm_impl: str = "ssm",
-    ) -> torch.Tensor:
-        for layer in self.layers:
-            h = layer(h, tok_idx=tok_idx, cu_seqlens=cu_seqlens, ssm_impl=ssm_impl)
-        return h
-
-    @torch.inference_mode()
-    def reset_parameters(self):
-        pass
-
-    @torch.inference_mode()
-    def init_weights(self):
-        self.reset_parameters()
-        for depth, layer in enumerate(self.layers):
-            factor = {
-                InitStdFactor.CURRENT_DEPTH: (2 * (depth + 1)) ** 0.5,
-                InitStdFactor.GLOBAL_DEPTH: (2 * (len(self.layers) + 1)) ** 0.5,
-                InitStdFactor.DIM_RATIO: self.model_dim / 4096,
-                InitStdFactor.DISABLED: 1.0,
-            }[self.init_std_factor]
-
-            layer.init_weights(self.init_base_std, factor)
-
-
-@dataclass
-class Mamba2Config(BaseMambaConfig):
-    seed: int = 1337
-
-    vocab_size: int = -1  # Will error if unchanged, makes you double check!
-    weight_tying: bool = False
-    torch_dtype: torch.dtype = torch.bfloat16
-
-    loss_reduction: str = "mean"
-
-    use_attn: bool = False
-    softcap: float = 50.0
-
-
-class Mamba2(BaseMamba):
-    def __init__(self, config: Mamba2Config) -> None:
-        super().__init__(config)
-        if isinstance(config.torch_dtype, torch.dtype):
-            torch_dtype = config.torch_dtype
-        else:
-            torch_dtype = getattr(torch, config.torch_dtype)
-        self.weight_tying = config.weight_tying
-        self.loss_reduction = config.loss_reduction
-
-        assert config.vocab_size > 0, "vocab_size must be set and > 0"
-
-        self.tok_emb = torch.nn.Embedding(config.vocab_size, config.dim)
-
-        self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps)
-
-        self.output = nn.Linear(
-            config.dim,
-            config.vocab_size,
-            bias=False,
-        )
-
-        if config.weight_tying:
-            self.output.weight = self.tok_emb.weight

+        # top_p (nucleus)
+        if 0 < top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-

+            # Remove tokens with cumulative probability above the threshold
+            sorted_indices_to_remove = cumulative_probs > top_p
-
-
-        if hasattr(self, "pos_emb") and self.pos_emb is not None:
-            n_params -= self.pos_emb.weight.numel()
-        if self.tok_emb.weight is not self.output.weight:
-            n_params -= self.tok_emb.weight.numel()
-        return n_params

+            # Shift right to keep also the first token above threshold
+            sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
+            sorted_indices_to_remove[:, 0] = False
-
-
-
-        target: torch.Tensor | None = None,
-        tok_idx: torch.Tensor | None = None,
-        cu_seqlens: torch.Tensor | None = None,
-        ssm_impl: str = "ssm",
-        labels: torch.Tensor = None,
-        **kwargs
-    ) -> CausalLMOutput:
-        h = self.tok_emb(input_ids)
-        h = super().forward(h, tok_idx=tok_idx, cu_seqlens=cu_seqlens, ssm_impl=ssm_impl)
-        logits = self.output(self.norm(h))
-        loss = None
-        if labels is not None:
-            # By default, huggingface GPT-like models shift the logits by one
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(
-                shift_logits.view(-1, shift_logits.size(-1)),
-                shift_labels.view(-1)
-            )
-        return CausalLMOutput(
-            loss=loss,
-            logits=logits,
-        )

+            # Scatter to get back to original indexing
+            indices_to_remove = sorted_indices_to_remove.scatter(
+                dim=1, index=sorted_indices, src=sorted_indices_to_remove
-
-
-
-        super().reset_parameters()
-        init_std = init_std or (self.model_dim ** (-0.5))
-        self.norm.reset_parameters()
-        nn.init.trunc_normal_(
-            self.tok_emb.weight,
-            mean=0.0,
-            std=init_std,
-            a=-3 * init_std,
-            b=3 * init_std,
-        )
-        if not self.weight_tying:
-            nn.init.trunc_normal_(
-                self.output.weight,
-                mean=0.0,
-                std=init_std,
-                a=-3 * init_std,
-                b=3 * init_std,
            )
+            logits[indices_to_remove] = filter_value

+        return logits
-
-    def init_weights(self, buffer_device: torch.device = None):
-        """
-        Initialize model parameters and optionally compute buffers on a specific device.
-
-        Args:
-            buffer_device (torch.device, optional): If provided, any large or precomputed
-            buffers (like RoPE frequency tensors) will be allocated or re-created on
-            this device during initialization. This can avoid overhead from transferring
-            buffers between CPU and GPU after creation. If None, buffers default to the
-            device of the first parameter or CPU.
-
-        Usage:
-            - Pass a GPU device (e.g., ``torch.device('cuda')``) when you want to ensure
-              buffers are created directly on GPU, preventing extra transfers.
-            - Pass a CPU device (e.g., ``torch.device('cpu')``) if you want to keep
-              large buffers in CPU memory (common in CPU-offload or pipeline-parallel setups).
-            - Leave it as ``None`` to rely on the model’s existing parameter device or
-              the default PyTorch device context.

+    def _init_weights(self, module):
-
-        - Useful in distributed or pipeline-parallel training where parameters may
-          initially live on CPU, but you still need certain buffers on GPU to avoid
-          overhead during forward passes.
-        - Prevents large re-allocations or re-copies when big buffers (like RoPE
-          frequency tables) are needed per rank.
        """
+        HF calls _init_weights to initialize parameters.
+        If you prefer Mamba’s own init approach, you can call model.mamba.init_weights().
-
-
-    @classmethod
-    def from_model_args(cls, config: Mamba2Config) -> "Mamba2":
        """
+        # As an example, we just call Mamba2's init routine for the entire submodel,
+        # or do some standard PyTorch inits for linear layers, embeddings, etc.
+        if isinstance(module, Mamba2):
+            module.init_weights()  # Mamba2’s internal init
+        elif isinstance(module, nn.Linear):
+            # e.g. standard xavier or normal init
+            nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        # If needed, do your specialized inits for other modules
-
-
-
-
-
-
-
-
-
-
-
-def get_mamba2_flops(
-    seq_len: int,
-    dim: int,
-    num_layers: int,
-    vocab_size: int,
-    ffn_multiplier: float = 2.0,
-    state_dim: int = 128,
-    conv_size: int = 4,
-    num_heads: int = 8,
-    num_groups: int = 1,
-    multiple_of: int = 256,
-    include_input_embedding: bool = True,
-    include_output_logits: bool = True,
-    forward_backward_multiplier: float = 1.0,
-) -> int:
-    """
-    Estimate the FLOPs for a Mamba-2 style model using a "Chinchilla-like" shape-based approach.
-
-    By default, this returns the forward-pass cost. If you want a rough
-    forward+backward estimate, set `forward_backward_multiplier=3.0` (common
-    rule-of-thumb for these models).
-
-    What gets counted:
-      • Hidden dimension is rounded up to 'multiple_of' = 256 (as in Mamba).
-      • Per-layer:
-        1) Input Linear: [dim → 2*hidden_dim + 2*(groups*state_dim) + num_heads]
-        2) Depthwise Conv1D: 2*(conv_dim * conv_size), where conv_dim=hidden_dim + 2*groups*state_dim
-        3) SSM selective scan: ~9*(dim*state_dim) (from Mamba dev discussion)
-        4) Output Linear: [hidden_dim → dim]
-      • Each layer’s cost is multiplied by (seq_len * num_layers).
-      • Optionally adds:
-        - The cost of the input embedding (treating it as a matmul: seq_len×vocab_size × vocab_size×dim).
-        - The cost of the final projection [dim → vocab_size].
-      • Finally scaled by `forward_backward_multiplier` if desired.
-
-    Args:
-        seq_len (int): Sequence length (number of tokens).
-        dim (int): Model (embedding) dimension.
-        num_layers (int): Number of Mamba layers.
-        vocab_size (int): Vocabulary size for final logits projection.
-        ffn_multiplier (float): FFN expansion ratio, e.g. 2.0 => hidden_dim=2×dim (rounded up).
-        state_dim (int): SSM state dimension (commonly 128).
-        conv_size (int): Kernel size for the depthwise conv1d (default=4).
-        num_heads (int): Number of heads (slightly affects input-lin out_dim).
-        num_groups (int): For "grouped" states in some Mamba variants (usually 1).
-        multiple_of (int): Round hidden_dim up to this multiple (commonly 256).
-        include_input_embedding (bool): If True, count the cost of an “embedding matmul”
-            for the input tokens => shape-based approach.
-        include_output_logits (bool): If True, count the cost of final [dim → vocab_size].
-        forward_backward_multiplier (float): E.g. 1.0 for forward only, 2.0 or 3.0 for forward+backward.
-
-    Returns:
-        int: Approximate total FLOPs (multiply-adds) for the selected pass(es),
-        as an integer.
-    """
-    # 0) Input embedding (optional)
-    flops_embedding = 0
-    if include_input_embedding:
-        flops_embedding = 2 * (seq_len * vocab_size * dim)
-
-    # 1) Round up hidden_dim
-    raw_hidden_dim = int(ffn_multiplier * dim)
-    hidden_dim = multiple_of * ((raw_hidden_dim + multiple_of - 1) // multiple_of)
-
-    # 2) Per-layer forward cost
-    out_dim_input = 2*hidden_dim + 2*(num_groups*state_dim) + num_heads
-    flops_input_linear = 2 * (dim * out_dim_input)
-    conv_dim = hidden_dim + 2*(num_groups*state_dim)
-    flops_conv = 2 * (conv_dim * conv_size)
-    flops_ssm = 9 * state_dim * dim
-    flops_output_linear = 2 * (hidden_dim * dim)
-    flops_layer = (flops_input_linear + flops_conv + flops_ssm + flops_output_linear)
-
-    # Multiply by #layers and sequence length
-    flops_layers = flops_layer * num_layers * seq_len
-
-    # 3) Final projection [dim → vocab_size] (optional)
-    flops_vocab = 0
-    if include_output_logits:
-        flops_vocab = 2 * (seq_len * dim * vocab_size)
-
-    # 4) Total forward FLOPs
-    flops_forward = flops_embedding + flops_layers + flops_vocab
-
-    # 5) Scale for forward+backward if desired
-    return int(flops_forward * forward_backward_multiplier)
-
-def get_mamba2_flops_per_token(
-    **kwargs
-) -> float:
-    """
-    Estimate FLOPs per token for a Mamba-2 style model.
-
-    This function extracts necessary parameters from kwargs and calculates the FLOPs per token.
-
-    Args:
-        **kwargs: Dictionary containing model configuration parameters.
-
-    Returns:
-        float: Approximate FLOPs per token.
-    """
-    defaults = {
-        'ffn_dim_multiplier': 2.0,
-        'state_dim': 128,
-        'conv_size': 4,
-        'num_heads': 8,
-        'num_groups': 1,
-        'multiple_of': 256,
-        'include_input_embedding': True,
-        'include_output_logits': True,
-        'forward_backward_multiplier': 1.0,
-    }
-    # Merge defaults
-    for k, v in defaults.items():
-        kwargs.setdefault(k, v)
-    # Mandatory keys
-    for required in ['seq_len', 'dim', 'num_layers', 'vocab_size']:
-        if required not in kwargs:
-            raise ValueError(f"Missing required parameter: {required}")
-
-    total_flops = get_mamba2_flops(
-        seq_len=kwargs['seq_len'],
-        dim=kwargs['dim'],
-        num_layers=kwargs['num_layers'],
-        vocab_size=kwargs['vocab_size'],
-        ffn_multiplier=kwargs['ffn_dim_multiplier'],
-        state_dim=kwargs['state_dim'],
-        conv_size=kwargs['conv_size'],
-        num_heads=kwargs['num_heads'],
-        num_groups=kwargs['num_groups'],
-        multiple_of=kwargs['multiple_of'],
-        include_input_embedding=kwargs['include_input_embedding'],
-        include_output_logits=kwargs['include_output_logits'],
-        forward_backward_multiplier=kwargs['forward_backward_multiplier'],
-    )
-    flops_per_token = total_flops / kwargs['seq_len']
-
-    return flops_per_token
-
-
-# Optional policy for activation checkpointing. With None, we stick to the default (defined distributed.py: default_no_recompute_ops)
-def get_no_recompute_ops():
-    return {
-        torch.ops.aten.mm.default,
-        torch.ops.aten._scaled_mm.default,
-        torch.ops.c10d_functional.reduce_scatter_tensor.default,
-        torch.ops.mamba_ssm.ssm_chunk_scan_combined_fwd.default,
-
-        # For low-precision training, it's useful to always save the result of max(abs(tensor))
-        torch.ops.aten.abs.default,
-        torch.ops.aten.max.default,
-    }
-
-
-def main():
-    from mamba_ssm import Mamba2 as MambaRef
-
-    x = torch.randn(2, 64, 192).cuda()
-
-    # Create and run the first model
-    model = MambaRef(
-        d_model=192,
-        expand=2,
-        d_conv=4,
-        d_state=64,
-        headdim=48,
-    ).cuda()
-    y = model(x)
-    print("Mamba reference output: ", y)
-    print("Mean of MambaRef output: ", y.mean().item())
-    print("Stddev of MambaRef output: ", y.std().item())
-
-    # Create and run the second model
-    config = Mamba2Config(vocab_size=200064, use_mem_eff_path=True)
-    model2 = Mamba2(
-        config=config,
-    ).cuda()
-
-    # Fix: Convert x to torch.LongTensor
-    x_indices = torch.randint(0, config.vocab_size, (2, 64), dtype=torch.long).cuda()
-
-    y2 = model2(x_indices)
-    print("Mamba output: ", y2)
-    print("Mean of Mamba output: ", y2.mean().item())
-    print("Stddev of Mamba output: ", y2.std().item())

+    def _get_num_params(self):
+        # Count trainable params, subtract duplicates if tying weights, etc.
+        return sum(p.numel() for p in self.parameters() if p.requires_grad)
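For context, a minimal end-to-end sketch of driving the new MiniMamba wrapper, based only on the signatures visible in this diff. The `minimamba` package name and all config field values below are placeholders (assumptions), and the generate(...) keyword names are inferred from its loop body, since the full parameter list is elided in the diff:

    import torch
    from minimamba import MiniMamba, MiniMambaConfig  # hypothetical package name

    # Placeholder hyperparameters; the real fields live in configuration_minimamba.py.
    config = MiniMambaConfig(
        vocab_size=50257,
        n_layers=8,
        n_embd=512,
        torch_dtype="bfloat16",
        device="cuda",
    )
    model = MiniMamba(config)

    input_ids = torch.randint(0, config.vocab_size, (1, 16), dtype=torch.long, device="cuda")
    out = model(input_ids, labels=input_ids)  # CausalLMOutput; out.loss and out.logits
    tokens = model.generate(                  # naive top-k/top-p sampling loop
        input_ids,
        max_new_tokens=32,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
    )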
|