import torch
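# Compatibility shim: older PyTorch builds do not expose torch.library.wrap_triton,
# so fall back to a no-op decorator when it is missing.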
if not hasattr(torch.library, 'wrap_triton'):
    def wrap_triton(fn):
        return fn
    torch.library.wrap_triton = wrap_triton

# Fix graph breaks from scalar outputs
import torch._dynamo
torch._dynamo.config.capture_scalar_outputs = True

import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from typing import Optional, Tuple, Union

from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import MaskedLMOutput, BaseModelOutputWithPast, SequenceClassifierOutput

import bert_padding
from attention import FlexBertUnpadRopeAttention

from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

try:
    from liger_kernel.transformers import LigerLayerNorm
    LayerNormClass = LigerLayerNorm
except ImportError:
    LayerNormClass = nn.LayerNorm



# HuggingFace-compatible Configuration


class CustomTransformerConfig(PretrainedConfig):
    """
    Configuration class for CustomTransformer model.

    This class stores the configuration of a CustomTransformer model and is compatible
    with HuggingFace's transformers library. It replaces the old ModelConfig dataclass.
    """
    model_type = "custom_transformer"

    # auto_map tells HF which classes to use when loading with AutoModel/AutoConfig
    auto_map = {
        "AutoConfig": "model.CustomTransformerConfig",
        "AutoModel": "model.CustomTransformerModel",
        "AutoModelForMaskedLM": "model.CustomTransformerForMaskedLM",
        "AutoModelForSequenceClassification": "model.CustomTransformerForSequenceClassification",
    }

    def __init__(
        self,
        vocab_size: int = 50368,
        num_dims: int = 768,
        num_heads: int = 12,
        num_kv_heads: int = 12,
        num_layers: int = 12,
        ffn_hidden_dims: int = 1536,
        layernorm_eps: float = 1e-6,
        attention_probs_dropout_prob: float = 0.1,
        attn_qkv_bias: bool = False,
        attn_out_bias: bool = False,
        attn_out_dropout_prob: float = 0.0,
        global_attn_every_n_layers: int = 3,
        sliding_window: int = 128,
        rotary_emb_base: int = 10000,
        context_len: int = 128,
        use_cache: bool = False,
        use_flash: bool = True,
        use_moe: bool = True,
        moe_num_experts: int = 15,
        moe_routed_experts: int = 1,
        moe_eps: float = 1e-6,
        moe_aux_loss_coef: float = 0.01,
        moe_shared_experts: int = 1,
        use_lossfreebalance: bool = True,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        mask_token_id: int = 3,
        rope_theta: float = 1e5,
        ffn_dim_multiplier: Optional[int] = None,
        rotary_emb_dim: Optional[int] = None,
        local_attn_rotary_emb_base: int = -1,
        local_attn_rotary_emb_dim: Optional[int] = None,
        rotary_emb_scale_base: Optional[float] = None,
        rotary_emb_interleaved: bool = False,
        use_fa2: Optional[bool] = None,
        deterministic_fa2: bool = False,
        use_sdpa_attn_mask: bool = False,
        num_labels: int = 2,
        classifier_dropout: Optional[float] = None,
        **kwargs
    ):
        """Initialize CustomTransformerConfig."""
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs
        )

        self.vocab_size = vocab_size
        self.num_dims = num_dims
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.num_layers = num_layers
        self.ffn_hidden_dims = ffn_hidden_dims
        self.layernorm_eps = layernorm_eps
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.attn_qkv_bias = attn_qkv_bias
        self.attn_out_bias = attn_out_bias
        self.attn_out_dropout_prob = attn_out_dropout_prob
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.sliding_window = sliding_window
        self.rotary_emb_base = rotary_emb_base
        self.context_len = context_len
        self.use_cache = use_cache
        self.use_flash = use_flash
        self.use_moe = use_moe
        self.moe_num_experts = moe_num_experts
        self.moe_routed_experts = moe_routed_experts
        self.moe_eps = moe_eps
        self.moe_aux_loss_coef = moe_aux_loss_coef
        self.moe_shared_experts = moe_shared_experts
        self.use_lossfreebalance = use_lossfreebalance
        self.mask_token_id = mask_token_id
        self.rope_theta = rope_theta
        self.ffn_dim_multiplier = ffn_dim_multiplier
        self.rotary_emb_dim = rotary_emb_dim
        self.local_attn_rotary_emb_base = local_attn_rotary_emb_base
        self.local_attn_rotary_emb_dim = local_attn_rotary_emb_dim
        self.rotary_emb_scale_base = rotary_emb_scale_base
        self.rotary_emb_interleaved = rotary_emb_interleaved
        self.use_fa2 = use_fa2
        self.deterministic_fa2 = deterministic_fa2
        self.use_sdpa_attn_mask = use_sdpa_attn_mask
        self.num_labels = num_labels
        self.classifier_dropout = classifier_dropout

        # Derived attributes for compatibility with attention module
        self.hidden_size = num_dims
        self.num_attention_heads = num_heads
        self.embedding_size = num_dims

        # Mirror old ModelConfig.__post_init__
        if self.use_fa2 is None:
            self.use_fa2 = self.use_flash
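
# Example (sketch, values are illustrative rather than recommended defaults):
#
#   config = CustomTransformerConfig(
#       num_dims=256, num_heads=4, num_kv_heads=4, num_layers=4,
#       ffn_hidden_dims=512, use_moe=False,
#   )
#   assert config.hidden_size == config.num_dims  # derived field used by the attention module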


# Legacy ModelConfig dataclass, kept for backward compatibility with existing training scripts
@dataclass
class ModelConfig:
    vocab_size: int

    num_dims: int
    num_heads: int
    num_kv_heads: int
    num_layers: int
    ffn_hidden_dims: int

    context_len: int
    use_cache: bool
    use_flash: bool
    use_moe: bool

    moe_num_experts: int
    moe_routed_experts: int
    moe_eps: float = 1e-6
    moe_aux_loss_coef: float = 0.00
    moe_shared_experts: int = 0
    use_lossfreebalance: bool = False

    layernorm_eps: float = 1e-6
    rope_theta: float = 1e5

    attention_probs_dropout_prob: float = 0.0
    attn_qkv_bias: bool = False
    attn_out_bias: bool = False
    attn_out_dropout_prob: float = 0.0
    global_attn_every_n_layers: int = 0
    sliding_window: int = -1
    rotary_emb_dim: Optional[int] = None
    rotary_emb_base: Optional[float] = None
    local_attn_rotary_emb_base: int = -1
    local_attn_rotary_emb_dim: Optional[int] = None
    rotary_emb_scale_base: Optional[float] = None
    rotary_emb_interleaved: bool = False
    use_fa2: Optional[bool] = None
    deterministic_fa2: bool = False
    use_sdpa_attn_mask: bool = False
    hidden_size: Optional[int] = None
    num_attention_heads: Optional[int] = None
    embedding_size: Optional[int] = None

    ffn_dim_multiplier: Optional[int] = None

    def __post_init__(self):
        if self.hidden_size is None:
            self.hidden_size = self.num_dims
        if self.num_attention_heads is None:
            self.num_attention_heads = self.num_heads
        if self.rotary_emb_base is None:
            self.rotary_emb_base = self.rope_theta
        if self.use_fa2 is None:
            self.use_fa2 = self.use_flash


# Model Layers

class FlexBertUnpadAttention(nn.Module):
    """Thin wrapper that preserves the state_dict key path: block.attention.attn.*

    In ModernBERT-style global unpadding the data is already (total_nnz, dim) so
    this wrapper just forwards directly to FlexBertUnpadRopeAttention without
    any pad/unpad work.  cu_seqlens, max_seqlen, indices, and attn_mask are
    passed through from the Transformer level.
    """
    def __init__(self, config, layer_id: Optional[int] = None):
        super().__init__()
        self.attn = FlexBertUnpadRopeAttention(config=config, layer_id=layer_id)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: int,
        indices: torch.Tensor,
        attn_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Forward on already-unpadded data.

        Args:
            hidden_states: (total_nnz, dim)
            cu_seqlens: (batch + 1,)
            max_seqlen: int
            indices: (total_nnz,)
            attn_mask: (batch, seq_len)

        Returns:
            (total_nnz, dim)
        """
        return self.attn(
            hidden_states=hidden_states,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            indices=indices,
            attn_mask=attn_mask,
        )


class FeedForward(nn.Module):
    """Default Feed Forward Layer.  Works on both 2D (total_nnz, dim) and 3D inputs."""
    def __init__(self, config):
        super().__init__()

        self.hidden_dim = config.ffn_hidden_dims

        self.w1 = nn.Linear(config.num_dims, self.hidden_dim, bias=False)
        self.w2 = nn.Linear(self.hidden_dim, config.num_dims, bias=False)
        self.w3 = nn.Linear(config.num_dims, self.hidden_dim, bias=False)
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor):
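        # The second return value mirrors FFNwMoE's (output, aux_loss) interface.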
        return self.w2(self.act(self.w1(x)) * self.w3(x)), None


class FFNwMoE(nn.Module):
    """
    Feed Forward with MoE with optional shared experts.
    Works on 2D (total_nnz, dim) unpadded inputs.

    Uses batched_mm (torch.bmm) for expert dispatch. Expert weights are stored
    as stacked nn.Parameters: (num_experts, out_dim, in_dim). Old checkpoints
    with per-expert nn.Linear weights are automatically converted at load time
    via _load_from_state_dict.

    Returns after forward:
        output: Combined outputs from experts
        aux_loss: Auxiliary loss tensor or routing metadata
    """
    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.ffn_hidden_dims
        self.num_dims = config.num_dims

        self.moe_routed_experts = config.moe_routed_experts
        self.moe_aux_loss_coef = config.moe_aux_loss_coef
        self.moe_eps = config.moe_eps
        self.moe_shared_experts = config.moe_shared_experts
        self.num_experts = config.moe_num_experts

        self.use_lossfreebalance = config.use_lossfreebalance

        self.router = nn.Linear(config.num_dims, self.num_experts, bias=False)

        # Stacked expert weights — the actual trainable parameters
        # w1: projects dim -> hidden (gate)
        # w2: projects hidden -> dim (down)
        # w3: projects dim -> hidden (up)
        self.w1_stacked = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, config.num_dims))
        self.w2_stacked = nn.Parameter(torch.empty(self.num_experts, config.num_dims, self.hidden_dim))
        self.w3_stacked = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, config.num_dims))

        # Initialize
        for i in range(self.num_experts):
            nn.init.kaiming_uniform_(self.w1_stacked.data[i])
            nn.init.kaiming_uniform_(self.w2_stacked.data[i])
            nn.init.kaiming_uniform_(self.w3_stacked.data[i])

        # shared experts (for DeepSeekMoE)
        self.shared_experts = nn.ModuleList()
        for _ in range(self.moe_shared_experts):
            self.shared_experts.append(
                nn.ModuleList([
                    nn.Linear(config.num_dims, self.hidden_dim, bias=False),
                    nn.Linear(self.hidden_dim, config.num_dims, bias=False),
                    nn.Linear(config.num_dims, self.hidden_dim, bias=False)
                ]))

        # Auxiliary-loss-free load balancing strategy for MoE (DeepSeek)
        if self.use_lossfreebalance:
            self.expert_biases = nn.Parameter(torch.zeros(self.num_experts))

    def forward(self, x: torch.Tensor):
        # x can be (total_nnz, dim) or (batch, seq_len, dim)
        input_shape = x.shape
        if x.ndim == 3:
            c_batch_size, c_context_len, c_dim = input_shape
            x_flat = x.view(-1, c_dim)
        else:
            x_flat = x
            c_dim = x.shape[-1]

        router_out = self.router(x_flat)
        router_probs = F.softmax(router_out, dim=-1)

        _, topk_indices = router_out.topk(self.moe_routed_experts, dim=-1)
        self.last_topk_indices = topk_indices.detach()

        aux_loss, topk_probs = self._compute_aux_loss(router_out, router_probs, topk_indices)

        output = self._compute_expert_outputs(x_flat, topk_indices, topk_probs, router_probs)

        if x.ndim == 3:
            output = output.view(c_batch_size, c_context_len, c_dim)

        return output, aux_loss

    def _compute_aux_loss(self, router_out, router_probs, topk_indices):
        """Computes the auxiliary loss based on whether loss-free balancing is used or not."""
        if not self.use_lossfreebalance:
            topk_probs, _ = router_probs.topk(self.moe_routed_experts, dim=-1)
            expert_mask = F.one_hot(topk_indices[:, 0], self.num_experts).float()
            density = expert_mask.mean(dim=0)
            router_prob_mean = router_probs.mean(dim=0)
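            # Standard auxiliary load-balancing loss: fraction of tokens routed to
            # each expert (density) times the mean router probability, summed over
            # experts and scaled by num_experts and the aux-loss coefficient.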
            aux_loss = self.moe_aux_loss_coef * torch.sum(density * router_prob_mean) * self.num_experts

        else:
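            # Loss-free balancing path: add learned per-expert biases to the router
            # logits, score experts with a sigmoid instead of softmax, and renormalize
            # the gathered top-k scores to form the combination weights. The raw
            # (router_probs, topk_indices) pair is returned so balancing can be
            # handled outside the gradient-based loss.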
            router_out = router_out + self.expert_biases
            router_probs = torch.sigmoid(router_out)
            topk_probs = router_probs.gather(-1, topk_indices)
            topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)

            aux_loss = (router_probs, topk_indices)
        return aux_loss, topk_probs

    def _compute_expert_outputs(self, x_flat, topk_indices, topk_probs, router_probs):
        """Compute expert outputs using sort-based dispatch with stacked weights.

        Sort tokens by expert, slice contiguous chunks, run each expert via
        matmul on the stacked weight tensors. No weight duplication, minimal
        memory overhead.
        """
        num_tokens, dim = x_flat.shape

        # Flatten top-k: (num_tokens * top_k,)
        flat_expert_ids = topk_indices.view(-1)
        flat_probs = topk_probs.view(-1)
        flat_token_ids = torch.arange(num_tokens, device=x_flat.device).unsqueeze(1).expand(-1, self.moe_routed_experts).reshape(-1)

        # Sort by expert id for contiguous batching
        sorted_expert_ids, sort_indices = flat_expert_ids.sort(stable=True)
        sorted_token_ids = flat_token_ids[sort_indices]
        sorted_probs = flat_probs[sort_indices]

        # Gather sorted input tokens
        sorted_x = x_flat[sorted_token_ids]  # (num_tokens * top_k, dim)

        # Find expert boundaries
        expert_counts = torch.bincount(sorted_expert_ids, minlength=self.num_experts)
        expert_offsets = torch.zeros(self.num_experts + 1, dtype=torch.long, device=x_flat.device)
        torch.cumsum(expert_counts, dim=0, out=expert_offsets[1:])

        # Run each expert on its contiguous slice using stacked weights
        sorted_output = torch.zeros_like(sorted_x)
        for expert_id in range(self.num_experts):
            start = expert_offsets[expert_id].item()
            end = expert_offsets[expert_id + 1].item()
            if start == end:
                continue
            expert_input = sorted_x[start:end]  # (n_tokens, dim)
            # Use stacked weights directly: w1[expert_id] is (hidden, dim)
            h1 = F.linear(expert_input, self.w1_stacked[expert_id])  # (n, hidden)
            h3 = F.linear(expert_input, self.w3_stacked[expert_id])  # (n, hidden)
            h = F.gelu(h1) * h3
            sorted_output[start:end] = F.linear(h, self.w2_stacked[expert_id])  # (n, dim)

        # Weight by router probabilities
        sorted_output = sorted_output * sorted_probs.unsqueeze(-1)

        # Scatter back to original token positions
        output = torch.zeros_like(x_flat)
        output.scatter_add_(0, sorted_token_ids.unsqueeze(-1).expand_as(sorted_output), sorted_output)

        # Shared experts (for DeepSeekMoE) — unchanged
        for shared_expert_id in range(self.moe_shared_experts):
            w1, w2, w3 = self.shared_experts[shared_expert_id]
            expert_output = w2(F.gelu(w1(x_flat)) * w3(x_flat))
            output = output + expert_output

        return output


class Block(nn.Module):
    """Transformer block operating on unpadded (total_nnz, dim) tensors.

    Receives unpadding metadata (cu_seqlens, max_seqlen, indices, attn_mask)
    from the Transformer level and passes them to attention.  Norms and FFN
    operate directly on the 2D unpadded tensor, avoiding wasted compute on
    padding tokens.
    """
    def __init__(self, config, layer_id: Optional[int] = None):
        super().__init__()
        self.is_first_block = (layer_id == 0)

        self.attention = FlexBertUnpadAttention(config, layer_id=layer_id)
        if config.use_moe:
            self.ffn = FFNwMoE(config)
        else:
            self.ffn = FeedForward(config)

        self.norm_attention = LayerNormClass(config.num_dims, eps=config.layernorm_eps)
        self.norm_ffn = LayerNormClass(config.num_dims, eps=config.layernorm_eps)

    def forward(self, x, cu_seqlens, max_seqlen, indices, attn_mask):
        """
        Args:
            x: (total_nnz, dim) - unpadded hidden states
            cu_seqlens: (batch + 1,)
            max_seqlen: int
            indices: (total_nnz,)
            attn_mask: (batch, seq_len)

        Returns:
            x: (total_nnz, dim)
            aux_loss: auxiliary loss from MoE or None
        """
        if self.is_first_block:
            attn_in = x
        else:
            attn_in = self.norm_attention(x)

        x = x + self.attention(
            attn_in,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            indices=indices,
            attn_mask=attn_mask,
        )

        ffn_out, aux_loss = self.ffn(
            self.norm_ffn(x)
        )
        x = x + ffn_out
        return x, aux_loss



# Core Transformer (nn.Module backbone used inside HF wrappers)

class Transformer(nn.Module):
    """ModernBERT-style Transformer: unpad once before embeddings, repad once at
    the end.  All blocks, norms, and FFNs operate on (total_nnz, dim) tensors,
    avoiding wasted compute on padding tokens.
    """
    def __init__(self, config):
        super().__init__()

        self.vocab_size = config.vocab_size
        self.num_dims = config.num_dims
        self.num_heads = config.num_heads
        self.context_len = config.context_len
        self.use_moe = config.use_moe
        self.use_lossfreebalance = config.use_lossfreebalance and self.use_moe

        self.num_layers = config.num_layers

        self.tokens_embedding = nn.Embedding(config.vocab_size, config.num_dims)
        self.norm_embeddings = LayerNormClass(config.num_dims, eps=config.layernorm_eps)

        self.blocks = nn.ModuleList()
        for layer_id in range(self.num_layers):
            self.blocks.append(Block(config, layer_id=layer_id))

        self.norm = LayerNormClass(config.num_dims, eps=config.layernorm_eps)
        self.ll_head = nn.Linear(config.num_dims, config.vocab_size, bias=False)

        self.tokens_embedding.weight = self.ll_head.weight

    def _unpad(self, input_ids, attention_mask):
        """Compute unpadding metadata and unpad input_ids before embedding.

        Unpads input_ids (cheap 1D integer indexing) so that embedding and
        all subsequent layers only process real tokens.

        Args:
            input_ids: (batch, seq_len)
            attention_mask: (batch, seq_len) or None

        Returns:
            input_ids_unpadded: (total_nnz,)
            indices: (total_nnz,)
            cu_seqlens: (batch + 1,)
            max_seqlen: int
            attn_mask: (batch, seq_len)
            batch_size: int
            seq_len: int
        """
        batch_size, seq_len = input_ids.shape

        if attention_mask is None:
            attn_mask = torch.ones((batch_size, seq_len), device=input_ids.device, dtype=torch.int32)
        else:
            attn_mask = attention_mask.to(dtype=torch.int32)

        # Unpad input_ids using the same bert_padding logic but on (batch, seq_len, 1)
        # so we can reuse unpad_input which expects 3D
        input_ids_3d = input_ids.unsqueeze(-1).float()  # (batch, seq_len, 1)
        input_ids_unpadded, indices, cu_seqlens, max_seqlen = bert_padding.unpad_input(input_ids_3d, attn_mask)
        input_ids_unpadded = input_ids_unpadded.squeeze(-1).long()  # (total_nnz,)

        return input_ids_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len

    def forward(
        self,
        x: torch.Tensor,
        targets: Optional[torch.Tensor] = None,
        start_pos: int = 0,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        # Unpad input_ids before embedding — only embed real tokens
        x_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len = self._unpad(x, attention_mask)

        # Embed only real tokens (total_nnz, dim)
        x = self.tokens_embedding(x_unpadded)
        x = self.norm_embeddings(x)

        total_aux_loss = 0

        for block in self.blocks:
            x, aux_loss = block(
                x,
                cu_seqlens=cu_seqlens,
                max_seqlen=max_seqlen,
                indices=indices,
                attn_mask=attn_mask,
            )
            if self.use_moe and not self.use_lossfreebalance:
                total_aux_loss += aux_loss

        x = self.norm(x)

        # Repad once — back to (batch, seq_len, dim) for the LM head / loss
        x = bert_padding.pad_input(x, indices, batch_size, seq_len)

        logits = self.ll_head(x)

        if targets is None:
            loss = None
            ce_loss = None
        else:
            c_batch_size, c_context_len, c_dim = logits.shape
            logits = logits.view(c_batch_size * c_context_len, c_dim)
            targets = targets.view(c_batch_size * c_context_len)
            ce_loss = F.cross_entropy(logits, targets)

            if self.use_moe and not self.use_lossfreebalance:
                loss = ce_loss + total_aux_loss
            else:
                loss = ce_loss
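                # With loss-free balancing, the routing metadata from the last block
                # (router_probs, topk_indices) is returned in the ce_loss slot so the
                # caller can update the expert biases; with MoE disabled it is None.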
                ce_loss = aux_loss

        return logits, loss, ce_loss

    @torch.no_grad()
    def generate(self, x: torch.Tensor, max_tokens: int, temperature: float = 1.0, top_k: int = 50,
                 use_cache: bool = False):
        """Generate text from x up to max_tokens."""
        for c_tkn_pos in range(max_tokens):
            if use_cache:
                if c_tkn_pos == 0:
                    logits, _, ce_loss = self.forward(x, start_pos=c_tkn_pos)
                else:
                    logits, _, ce_loss = self.forward(x[:, -1:], start_pos=c_tkn_pos)
            else:
                logits, _, ce_loss = self.forward(x)

            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                tkl, idx = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < tkl[:, [-1]]] = -float('Inf')

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            x = torch.cat((x, next_token), dim=1)
        return x



# HuggingFace PreTrainedModel Wrappers

class CustomTransformerPreTrainedModel(PreTrainedModel):
    """Base class for CustomTransformer models."""
    config_class = CustomTransformerConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = False
    _no_split_modules = ["Block"]

    def _init_weights(self, module):
        """Initialize weights - handled by model itself."""
        pass


class CustomTransformerModel(CustomTransformerPreTrainedModel):
    """The bare CustomTransformer Model outputting raw hidden-states."""

    def __init__(self, config: CustomTransformerConfig):
        super().__init__(config)
        self.config = config

        self.transformer = Transformer(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.transformer.tokens_embedding

    def set_input_embeddings(self, value):
        self.transformer.tokens_embedding = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Forward pass returning raw hidden states."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Unpad input_ids before embedding
        x_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len = self.transformer._unpad(input_ids, attention_mask)

        # Embed only real tokens
        x = self.transformer.tokens_embedding(x_unpadded)
        x = self.transformer.norm_embeddings(x)

        for block in self.transformer.blocks:
            x, _ = block(x, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, indices=indices, attn_mask=attn_mask)

        x = self.transformer.norm(x)

        # Repad once
        hidden_states = bert_padding.pad_input(x, indices, batch_size, seq_len)

        if not return_dict:
            return (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )


class CustomTransformerForMaskedLM(CustomTransformerPreTrainedModel):
    """CustomTransformer Model with a masked language modeling head on top."""
    _tied_weights_keys = ["transformer.ll_head.weight", "transformer.tokens_embedding.weight"]

    def __init__(self, config: CustomTransformerConfig):
        super().__init__(config)
        self.config = config

        self.transformer = Transformer(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.transformer.tokens_embedding

    def set_input_embeddings(self, value):
        self.transformer.tokens_embedding = value

    def get_output_embeddings(self):
        return self.transformer.ll_head

    def set_output_embeddings(self, new_embeddings):
        self.transformer.ll_head = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        """Forward pass for masked language modeling."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        logits, model_loss, ce_loss = self.transformer(
            input_ids, targets=labels, start_pos=0, attention_mask=attention_mask
        )
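        # When labels are provided the backbone flattens logits to
        # (batch * seq_len, vocab_size); otherwise they remain (batch, seq_len, vocab_size).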

        masked_lm_loss = None
        if labels is not None:
            masked_lm_loss = model_loss

        if not return_dict:
            output = (logits,)
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )


class CustomTransformerForSequenceClassification(CustomTransformerPreTrainedModel):
    """CustomTransformer Model with a sequence classification head on top."""

    def __init__(self, config: CustomTransformerConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.transformer = Transformer(config)

        # Classification head
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.attention_probs_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.num_dims, config.num_labels)

        self._init_classifier_weights()
        self.post_init()

    def _init_classifier_weights(self):
        std = 0.02
        if isinstance(self.classifier, nn.Linear):
            self.classifier.weight.data.normal_(mean=0.0, std=std)
            if self.classifier.bias is not None:
                self.classifier.bias.data.zero_()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        """Forward pass for sequence classification."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

        # Unpad input_ids before embedding
        x_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len = self.transformer._unpad(input_ids, attention_mask)

        # Embed only real tokens
        x = self.transformer.tokens_embedding(x_unpadded)
        x = self.transformer.norm_embeddings(x)

        # Collect hidden states if requested (repad each for the output tuple)
        all_hidden_states = () if output_hidden_states else None

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (bert_padding.pad_input(x, indices, batch_size, seq_len),)

        for block in self.transformer.blocks:
            x, _ = block(x, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, indices=indices, attn_mask=attn_mask)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (bert_padding.pad_input(x, indices, batch_size, seq_len),)

        x = self.transformer.norm(x)

        # Repad once
        hidden_states = bert_padding.pad_input(x, indices, batch_size, seq_len)

        # Use [CLS] token representation (first token) for classification
        pooled_output = hidden_states[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + (all_hidden_states,) + (None,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=None,
        )



# Auto-registration

try:
    from transformers import AutoConfig, AutoModel, AutoModelForMaskedLM, AutoModelForSequenceClassification

    AutoConfig.register("custom_transformer", CustomTransformerConfig)
    AutoModel.register(CustomTransformerConfig, CustomTransformerModel)
    AutoModelForMaskedLM.register(CustomTransformerConfig, CustomTransformerForMaskedLM)
    AutoModelForSequenceClassification.register(CustomTransformerConfig, CustomTransformerForSequenceClassification)
except Exception:
    pass
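
# Usage sketch (assumes a checkpoint directory created with save_pretrained that
# ships this file as model.py):
#
#   from transformers import AutoModelForMaskedLM
#   model = AutoModelForMaskedLM.from_pretrained("path/to/checkpoint", trust_remote_code=True)
#
# Inside this repository the classes can also be used directly:
#
#   model = CustomTransformerForMaskedLM(CustomTransformerConfig())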


def main():
    pass


if __name__ == "__main__":
    main()