mjerome89 committed
Commit e5d3914 · verified · 1 Parent(s): 66071cc

Upload model.py

Files changed (1):
  1. model.py +348 -645
model.py CHANGED
@@ -9,13 +9,17 @@ if not hasattr(torch.library, 'wrap_triton'):
9
  import torch._dynamo
10
  torch._dynamo.config.capture_scalar_outputs = True
11
 
 
12
  import torch.nn as nn
13
  import torch.nn.functional as F
 
 
 
 
 
14
  from dataclasses import dataclass
15
- from typing import Optional, Tuple, Union
16
-
17
- from transformers import PreTrainedModel, PretrainedConfig
18
- from transformers.modeling_outputs import MaskedLMOutput, BaseModelOutputWithPast, SequenceClassifierOutput
19
 
20
  import bert_padding
21
  from attention import FlexBertUnpadRopeAttention
@@ -24,157 +28,28 @@ from torch.distributed import init_process_group, destroy_process_group
24
  from torch.nn.parallel import DistributedDataParallel as DDP
25
  import torch.distributed as dist
26
 
27
- try:
28
- from liger_kernel.transformers import LigerLayerNorm
29
- LayerNormClass = LigerLayerNorm
30
- except ImportError:
31
- LayerNormClass = nn.LayerNorm
32
-
33
-
34
-
35
- # ==============================================================================
36
- # HuggingFace-compatible Configuration
37
- # ==============================================================================
38
-
39
- class CustomTransformerConfig(PretrainedConfig):
40
- """
41
- Configuration class for CustomTransformer model.
42
-
43
- This class stores the configuration of a CustomTransformer model and is compatible
44
- with HuggingFace's transformers library. It replaces the old ModelConfig dataclass.
45
- """
46
- model_type = "custom_transformer"
47
-
48
- # auto_map tells HF which classes to use when loading with AutoModel/AutoConfig
49
- auto_map = {
50
- "AutoConfig": "model.CustomTransformerConfig",
51
- "AutoModel": "model.CustomTransformerModel",
52
- "AutoModelForMaskedLM": "model.CustomTransformerForMaskedLM",
53
- "AutoModelForSequenceClassification": "model.CustomTransformerForSequenceClassification",
54
- }
55
-
56
- def __init__(
57
- self,
58
- vocab_size: int = 50368,
59
- num_dims: int = 768,
60
- num_heads: int = 12,
61
- num_kv_heads: int = 12,
62
- num_layers: int = 12,
63
- ffn_hidden_dims: int = 1536,
64
- layernorm_eps: float = 1e-6,
65
- attention_probs_dropout_prob: float = 0.1,
66
- attn_qkv_bias: bool = False,
67
- attn_out_bias: bool = False,
68
- attn_out_dropout_prob: float = 0.0,
69
- global_attn_every_n_layers: int = 3,
70
- sliding_window: int = 128,
71
- rotary_emb_base: int = 10000,
72
- context_len: int = 128,
73
- use_cache: bool = False,
74
- use_flash: bool = True,
75
- use_moe: bool = True,
76
- moe_num_experts: int = 15,
77
- moe_routed_experts: int = 1,
78
- moe_eps: float = 1e-6,
79
- moe_aux_loss_coef: float = 0.01,
80
- moe_shared_experts: int = 1,
81
- use_lossfreebalance: bool = True,
82
- pad_token_id: int = 0,
83
- bos_token_id: int = 1,
84
- eos_token_id: int = 2,
85
- mask_token_id: int = 3,
86
- rope_theta: float = 1e5,
87
- ffn_dim_multiplier: Optional[int] = None,
88
- rotary_emb_dim: Optional[int] = None,
89
- local_attn_rotary_emb_base: int = -1,
90
- local_attn_rotary_emb_dim: Optional[int] = None,
91
- rotary_emb_scale_base: Optional[float] = None,
92
- rotary_emb_interleaved: bool = False,
93
- use_fa2: Optional[bool] = None,
94
- deterministic_fa2: bool = False,
95
- use_sdpa_attn_mask: bool = False,
96
- num_labels: int = 2,
97
- classifier_dropout: Optional[float] = None,
98
- **kwargs
99
- ):
100
- """Initialize CustomTransformerConfig."""
101
- super().__init__(
102
- pad_token_id=pad_token_id,
103
- bos_token_id=bos_token_id,
104
- eos_token_id=eos_token_id,
105
- **kwargs
106
- )
107
-
108
- self.vocab_size = vocab_size
109
- self.num_dims = num_dims
110
- self.num_heads = num_heads
111
- self.num_kv_heads = num_kv_heads
112
- self.num_layers = num_layers
113
- self.ffn_hidden_dims = ffn_hidden_dims
114
- self.layernorm_eps = layernorm_eps
115
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
116
- self.attn_qkv_bias = attn_qkv_bias
117
- self.attn_out_bias = attn_out_bias
118
- self.attn_out_dropout_prob = attn_out_dropout_prob
119
- self.global_attn_every_n_layers = global_attn_every_n_layers
120
- self.sliding_window = sliding_window
121
- self.rotary_emb_base = rotary_emb_base
122
- self.context_len = context_len
123
- self.use_cache = use_cache
124
- self.use_flash = use_flash
125
- self.use_moe = use_moe
126
- self.moe_num_experts = moe_num_experts
127
- self.moe_routed_experts = moe_routed_experts
128
- self.moe_eps = moe_eps
129
- self.moe_aux_loss_coef = moe_aux_loss_coef
130
- self.moe_shared_experts = moe_shared_experts
131
- self.use_lossfreebalance = use_lossfreebalance
132
- self.mask_token_id = mask_token_id
133
- self.rope_theta = rope_theta
134
- self.ffn_dim_multiplier = ffn_dim_multiplier
135
- self.rotary_emb_dim = rotary_emb_dim
136
- self.local_attn_rotary_emb_base = local_attn_rotary_emb_base
137
- self.local_attn_rotary_emb_dim = local_attn_rotary_emb_dim
138
- self.rotary_emb_scale_base = rotary_emb_scale_base
139
- self.rotary_emb_interleaved = rotary_emb_interleaved
140
- self.use_fa2 = use_fa2
141
- self.deterministic_fa2 = deterministic_fa2
142
- self.use_sdpa_attn_mask = use_sdpa_attn_mask
143
- self.num_labels = num_labels
144
- self.classifier_dropout = classifier_dropout
145
-
146
- # Derived attributes for compatibility with attention module
147
- self.hidden_size = num_dims
148
- self.num_attention_heads = num_heads
149
- self.embedding_size = num_dims
150
-
151
- # Mirror old ModelConfig.__post_init__
152
- if self.use_fa2 is None:
153
- self.use_fa2 = self.use_flash
154
-
155
 
156
- # Keep ModelConfig as a thin alias for backward compatibility with existing training scripts
157
  @dataclass
158
  class ModelConfig:
159
  vocab_size: int
160
 
161
- num_dims: int
162
- num_heads: int
163
- num_kv_heads: int
164
- num_layers: int
165
- ffn_hidden_dims: int
166
 
167
- context_len: int
168
- use_cache: bool
169
- use_flash: bool
170
- use_moe: bool
171
 
172
- moe_num_experts: int
173
- moe_routed_experts: int
174
- moe_eps: float = 1e-6
175
- moe_aux_loss_coef: float = 0.01
176
- moe_shared_experts: int = 0
177
- use_lossfreebalance: bool = False
178
 
179
  layernorm_eps: float = 1e-6
180
  rope_theta: float = 1e5
@@ -198,7 +73,7 @@ class ModelConfig:
198
  num_attention_heads: Optional[int] = None
199
  embedding_size: Optional[int] = None
200
 
201
- ffn_dim_multiplier: Optional[int] = None
202
 
203
  def __post_init__(self):
204
  if self.hidden_size is None:
@@ -211,53 +86,206 @@ class ModelConfig:
211
  self.use_fa2 = self.use_flash
212
 
213
 
214
- # ==============================================================================
215
- # Model Layers
216
- # ==============================================================================
217
 
218
- class FlexBertUnpadAttention(nn.Module):
219
- """Thin wrapper that preserves the state_dict key path: block.attention.attn.*
220
 
221
- In ModernBERT-style global unpadding the data is already (total_nnz, dim) so
222
- this wrapper just forwards directly to FlexBertUnpadRopeAttention without
223
- any pad/unpad work. cu_seqlens, max_seqlen, indices, and attn_mask are
224
- passed through from the Transformer level.
225
- """
226
  def __init__(self, config, layer_id: Optional[int] = None):
227
  super().__init__()
228
  self.attn = FlexBertUnpadRopeAttention(config=config, layer_id=layer_id)
229
 
230
- def forward(
231
- self,
232
- hidden_states: torch.Tensor,
233
- cu_seqlens: torch.Tensor,
234
- max_seqlen: int,
235
- indices: torch.Tensor,
236
- attn_mask: torch.Tensor,
237
- ) -> torch.Tensor:
238
- """Forward on already-unpadded data.
239
-
240
- Args:
241
- hidden_states: (total_nnz, dim)
242
- cu_seqlens: (batch + 1,)
243
- max_seqlen: int
244
- indices: (total_nnz,)
245
- attn_mask: (batch, seq_len)
246
-
247
- Returns:
248
- (total_nnz, dim)
249
- """
250
- return self.attn(
251
  hidden_states=hidden_states,
252
  cu_seqlens=cu_seqlens,
253
  max_seqlen=max_seqlen,
254
  indices=indices,
255
  attn_mask=attn_mask,
256
  )
 
257
 
258
 
259
  class FeedForward(nn.Module):
260
- """Default Feed Forward Layer. Works on both 2D (total_nnz, dim) and 3D inputs."""
 
 
261
  def __init__(self, config):
262
  super().__init__()
263
 
@@ -267,54 +295,40 @@ class FeedForward(nn.Module):
267
  self.w2 = nn.Linear(self.hidden_dim, config.num_dims, bias=False)
268
  self.w3 = nn.Linear(config.num_dims, self.hidden_dim, bias=False)
269
  self.act = nn.GELU()
270
-
271
  def forward(self, x: torch.Tensor):
272
  return self.w2(self.act(self.w1(x)) * self.w3(x)), None
273
 
274
 
275
- class FFNwMoE(nn.Module):
276
  """
277
  Feed Forward with MoE with optional shared experts.
278
- Works on 2D (total_nnz, dim) unpadded inputs.
279
-
280
- Uses batched_mm (torch.bmm) for expert dispatch. Expert weights are stored
281
- as stacked nn.Parameters: (num_experts, out_dim, in_dim). Old checkpoints
282
- with per-expert nn.Linear weights are automatically converted at load time
283
- via _load_from_state_dict.
284
-
285
  Returns after forward:
286
  output: Combined outputs from experts
287
  aux_loss: Auxiliary loss tensor or routing metadata
288
  """
289
- def __init__(self, config):
290
  super().__init__()
291
  self.hidden_dim = config.ffn_hidden_dims
292
- self.num_dims = config.num_dims
293
 
294
- self.moe_routed_experts = config.moe_routed_experts
295
  self.moe_aux_loss_coef = config.moe_aux_loss_coef
296
  self.moe_eps = config.moe_eps
297
  self.moe_shared_experts = config.moe_shared_experts
298
  self.num_experts = config.moe_num_experts
299
 
300
- self.use_lossfreebalance = config.use_lossfreebalance
301
 
302
- self.router = nn.Linear(config.num_dims, self.num_experts, bias=False)
303
-
304
- # Stacked expert weights — the actual trainable parameters
305
- # w1: projects dim -> hidden (gate)
306
- # w2: projects hidden -> dim (down)
307
- # w3: projects dim -> hidden (up)
308
- self.w1_stacked = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, config.num_dims))
309
- self.w2_stacked = nn.Parameter(torch.empty(self.num_experts, config.num_dims, self.hidden_dim))
310
- self.w3_stacked = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, config.num_dims))
311
-
312
- # Initialize
313
- for i in range(self.num_experts):
314
- nn.init.kaiming_uniform_(self.w1_stacked.data[i])
315
- nn.init.kaiming_uniform_(self.w2_stacked.data[i])
316
- nn.init.kaiming_uniform_(self.w3_stacked.data[i])
317
 
318
  # shared experts (for DeepSeekMoE)
319
  self.shared_experts = nn.ModuleList()
320
  for _ in range(self.moe_shared_experts):
@@ -324,23 +338,17 @@ class FFNwMoE(nn.Module):
324
  nn.Linear(self.hidden_dim, config.num_dims, bias=False),
325
  nn.Linear(config.num_dims, self.hidden_dim, bias=False)
326
  ]))
327
-
328
- # Auxiliary-loss-free load balancing strategy for MoE (DeepSeek)
329
  if self.use_lossfreebalance:
330
  self.expert_biases = nn.Parameter(torch.zeros(self.num_experts))
331
-
332
  def forward(self, x: torch.Tensor):
333
- # x can be (total_nnz, dim) or (batch, seq_len, dim)
334
- input_shape = x.shape
335
- if x.ndim == 3:
336
- c_batch_size, c_context_len, c_dim = input_shape
337
- x_flat = x.view(-1, c_dim)
338
- else:
339
- x_flat = x
340
- c_dim = x.shape[-1]
341
 
342
  router_out = self.router(x_flat)
343
- router_probs = F.softmax(router_out, dim=-1)
344
 
345
  _, topk_indices = router_out.topk(self.moe_routed_experts, dim=-1)
346
  self.last_topk_indices = topk_indices.detach()
@@ -349,13 +357,12 @@ class FFNwMoE(nn.Module):
349
 
350
  output = self._compute_expert_outputs(x_flat, topk_indices, topk_probs, router_probs)
351
 
352
- if x.ndim == 3:
353
- output = output.view(c_batch_size, c_context_len, c_dim)
354
-
355
- return output, aux_loss
356
 
357
  def _compute_aux_loss(self, router_out, router_probs, topk_indices):
358
- """Computes the auxiliary loss based on whether loss-free balancing is used or not."""
 
 
359
  if not self.use_lossfreebalance:
360
  topk_probs, _ = router_probs.topk(self.moe_routed_experts, dim=-1)
361
  expert_mask = F.one_hot(topk_indices[:, 0], self.num_experts).float()
@@ -363,80 +370,47 @@ class FFNwMoE(nn.Module):
363
  router_prob_mean = router_probs.mean(dim=0)
364
  aux_loss = self.moe_aux_loss_coef * torch.sum(density * router_prob_mean) * self.num_experts
365
 
366
- else:
367
  router_out = router_out + self.expert_biases
368
- router_probs = torch.sigmoid(router_out)
369
  topk_probs = router_probs.gather(-1, topk_indices)
370
  topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)
371
 
 
372
  aux_loss = (router_probs, topk_indices)
373
  return aux_loss, topk_probs
374
 
375
  def _compute_expert_outputs(self, x_flat, topk_indices, topk_probs, router_probs):
376
- """Compute expert outputs using sort-based dispatch with stacked weights.
377
-
378
- Sort tokens by expert, slice contiguous chunks, run each expert via
379
- matmul on the stacked weight tensors. No weight duplication, minimal
380
- memory overhead.
381
  """
382
- num_tokens, dim = x_flat.shape
383
-
384
- # Flatten top-k: (num_tokens * top_k,)
385
- flat_expert_ids = topk_indices.view(-1)
386
- flat_probs = topk_probs.view(-1)
387
- flat_token_ids = torch.arange(num_tokens, device=x_flat.device).unsqueeze(1).expand(-1, self.moe_routed_experts).reshape(-1)
388
-
389
- # Sort by expert id for contiguous batching
390
- sorted_expert_ids, sort_indices = flat_expert_ids.sort(stable=True)
391
- sorted_token_ids = flat_token_ids[sort_indices]
392
- sorted_probs = flat_probs[sort_indices]
393
-
394
- # Gather sorted input tokens
395
- sorted_x = x_flat[sorted_token_ids] # (num_tokens * top_k, dim)
396
-
397
- # Find expert boundaries
398
- expert_counts = torch.bincount(sorted_expert_ids, minlength=self.num_experts)
399
- expert_offsets = torch.zeros(self.num_experts + 1, dtype=torch.long, device=x_flat.device)
400
- torch.cumsum(expert_counts, dim=0, out=expert_offsets[1:])
401
-
402
- # Run each expert on its contiguous slice using stacked weights
403
- sorted_output = torch.zeros_like(sorted_x)
404
- for expert_id in range(self.num_experts):
405
- start = expert_offsets[expert_id].item()
406
- end = expert_offsets[expert_id + 1].item()
407
- if start == end:
408
- continue
409
- expert_input = sorted_x[start:end] # (n_tokens, dim)
410
- # Use stacked weights directly: w1[expert_id] is (hidden, dim)
411
- h1 = F.linear(expert_input, self.w1_stacked[expert_id]) # (n, hidden)
412
- h3 = F.linear(expert_input, self.w3_stacked[expert_id]) # (n, hidden)
413
- h = F.gelu(h1) * h3
414
- sorted_output[start:end] = F.linear(h, self.w2_stacked[expert_id]) # (n, dim)
415
-
416
- # Weight by router probabilities
417
- sorted_output = sorted_output * sorted_probs.unsqueeze(-1)
418
-
419
- # Scatter back to original token positions
420
  output = torch.zeros_like(x_flat)
421
- output.scatter_add_(0, sorted_token_ids.unsqueeze(-1).expand_as(sorted_output), sorted_output)
422
 
423
- # Shared experts (for DeepSeekMoE) — unchanged
424
  for shared_expert_id in range(self.moe_shared_experts):
425
  w1, w2, w3 = self.shared_experts[shared_expert_id]
426
  expert_output = w2(F.gelu(w1(x_flat)) * w3(x_flat))
427
  output = output + expert_output
428
-
429
  return output
430
 
431
 
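A minimal, self-contained toy run of the sort-based dispatch described in the docstring above. Token counts and expert assignments are made up; only the sort/offset bookkeeping is shown, not the expert matmuls:

import torch

# 4 tokens routed top-1 to experts [2, 0, 2, 1] (hypothetical assignment)
topk_indices = torch.tensor([[2], [0], [2], [1]])
flat_expert_ids = topk_indices.view(-1)
sorted_expert_ids, sort_order = flat_expert_ids.sort(stable=True)
counts = torch.bincount(sorted_expert_ids, minlength=3)   # tokens per expert
offsets = torch.zeros(4, dtype=torch.long)
offsets[1:] = torch.cumsum(counts, dim=0)
print(sorted_expert_ids.tolist())   # [0, 1, 2, 2]
print(sort_order.tolist())          # [1, 3, 0, 2] -> original token positions
print(offsets.tolist())             # [0, 1, 2, 4]: expert e owns slice offsets[e]:offsets[e+1]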
432
  class Block(nn.Module):
433
- """Transformer block operating on unpadded (total_nnz, dim) tensors.
434
-
435
- Receives unpadding metadata (cu_seqlens, max_seqlen, indices, attn_mask)
436
- from the Transformer level and passes them to attention. Norms and FFN
437
- operate directly on the 2D unpadded tensor, avoiding wasted compute on
438
- padding tokens.
439
- """
440
  def __init__(self, config, layer_id: Optional[int] = None):
441
  super().__init__()
442
  self.is_first_block = (layer_id == 0)
@@ -447,53 +421,32 @@ class Block(nn.Module):
447
  else:
448
  self.ffn = FeedForward(config)
449
 
450
- self.norm_attention = LayerNormClass(config.num_dims, eps=config.layernorm_eps)
451
- self.norm_ffn = LayerNormClass(config.num_dims, eps=config.layernorm_eps)
452
 
453
- def forward(self, x, cu_seqlens, max_seqlen, indices, attn_mask):
454
- """
455
- Args:
456
- x: (total_nnz, dim) - unpadded hidden states
457
- cu_seqlens: (batch + 1,)
458
- max_seqlen: int
459
- indices: (total_nnz,)
460
- attn_mask: (batch, seq_len)
461
-
462
- Returns:
463
- x: (total_nnz, dim)
464
- aux_loss: auxiliary loss from MoE or None
465
- """
466
  if self.is_first_block:
467
  attn_in = x
468
  else:
469
  attn_in = self.norm_attention(x)
470
-
471
  x = x + self.attention(
472
- attn_in,
473
- cu_seqlens=cu_seqlens,
474
- max_seqlen=max_seqlen,
475
- indices=indices,
476
- attn_mask=attn_mask,
477
- )
478
-
479
  ffn_out, aux_loss = self.ffn(
480
  self.norm_ffn(x)
481
- )
482
  x = x + ffn_out
483
  return x, aux_loss
 
484
 
485
-
486
- # ==============================================================================
487
- # Core Transformer (nn.Module backbone used inside HF wrappers)
488
- # ==============================================================================
489
-
490
- class Transformer(nn.Module):
491
- """ModernBERT-style Transformer: unpad once before embeddings, repad once at
492
- the end. All blocks, norms, and FFNs operate on (total_nnz, dim) tensors,
493
- avoiding wasted compute on padding tokens.
494
- """
495
- def __init__(self, config):
496
  super().__init__()
 
 
497
 
498
  self.vocab_size = config.vocab_size
499
  self.num_dims = config.num_dims
@@ -503,112 +456,79 @@ class Transformer(nn.Module):
503
  self.use_lossfreebalance = config.use_lossfreebalance and self.use_moe
504
 
505
  self.num_layers = config.num_layers
506
-
 
 
 
507
  hidden_dim = 4 * config.num_dims
 
 
 
 
508
 
509
- self.tokens_embedding = nn.Embedding(config.vocab_size, config.num_dims)
510
- self.norm_embeddings = LayerNormClass(config.num_dims, eps=config.layernorm_eps)
 
511
 
512
  self.blocks = nn.ModuleList()
513
  for layer_id in range(self.num_layers):
514
  self.blocks.append(Block(config, layer_id=layer_id))
515
 
516
- self.norm = LayerNormClass(config.num_dims, eps=config.layernorm_eps)
517
- self.ll_head = nn.Linear(config.num_dims, config.vocab_size, bias=False)
 
518
 
519
  self.tokens_embedding.weight = self.ll_head.weight
 
 
520
 
521
- def _unpad(self, input_ids, attention_mask):
522
- """Compute unpadding metadata and unpad input_ids before embedding.
523
 
524
- Unpads input_ids (cheap 1D integer indexing) so that embedding and
525
- all subsequent layers only process real tokens.
526
 
527
- Args:
528
- input_ids: (batch, seq_len)
529
- attention_mask: (batch, seq_len) or None
530
 
531
- Returns:
532
- input_ids_unpadded: (total_nnz,)
533
- indices: (total_nnz,)
534
- cu_seqlens: (batch + 1,)
535
- max_seqlen: int
536
- attn_mask: (batch, seq_len)
537
- batch_size: int
538
- seq_len: int
539
- """
540
- batch_size, seq_len = input_ids.shape
541
 
542
- if attention_mask is None:
543
- attn_mask = torch.ones((batch_size, seq_len), device=input_ids.device, dtype=torch.int32)
544
- else:
545
- attn_mask = attention_mask.to(dtype=torch.int32)
546
-
547
- # Unpad input_ids using the same bert_padding logic but on (batch, seq_len, 1)
548
- # so we can reuse unpad_input which expects 3D
549
- input_ids_3d = input_ids.unsqueeze(-1).float() # (batch, seq_len, 1)
550
- input_ids_unpadded, indices, cu_seqlens, max_seqlen = bert_padding.unpad_input(input_ids_3d, attn_mask)
551
- input_ids_unpadded = input_ids_unpadded.squeeze(-1).long() # (total_nnz,)
552
-
553
- return input_ids_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len
554
-
555
- def forward(
556
- self,
557
- x: torch.Tensor,
558
- targets: Optional[torch.Tensor] = None,
559
- start_pos: int = 0,
560
- attention_mask: Optional[torch.Tensor] = None,
561
- ):
562
- batch_size, seq_len = x.shape
563
-
564
- # Unpad input_ids before embedding — only embed real tokens
565
- x_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len = self._unpad(x, attention_mask)
566
-
567
- # Embed only real tokens (total_nnz, dim)
568
- x = self.tokens_embedding(x_unpadded)
569
  x = self.norm_embeddings(x)
570
-
 
 
 
 
571
  total_aux_loss = 0
572
 
573
  for block in self.blocks:
574
- x, aux_loss = block(
575
- x,
576
- cu_seqlens=cu_seqlens,
577
- max_seqlen=max_seqlen,
578
- indices=indices,
579
- attn_mask=attn_mask,
580
- )
581
  if self.use_moe and not self.use_lossfreebalance:
582
  total_aux_loss += aux_loss
583
-
584
  x = self.norm(x)
585
-
586
- # Repad once — back to (batch, seq_len, dim) for the LM head / loss
587
- x = bert_padding.pad_input(x, indices, batch_size, seq_len)
588
-
589
  logits = self.ll_head(x)
590
-
 
591
  if targets is None:
592
  loss = None
593
  ce_loss = None
594
  else:
595
  c_batch_size, c_context_len, c_dim = logits.shape
596
- logits = logits.view(c_batch_size * c_context_len, c_dim)
597
- targets = targets.view(c_batch_size * c_context_len)
598
  ce_loss = F.cross_entropy(logits, targets)
599
-
600
- if self.use_moe and not self.use_lossfreebalance:
601
- loss = ce_loss + total_aux_loss
602
- else:
603
  loss = ce_loss
604
  ce_loss = aux_loss
605
 
606
  return logits, loss, ce_loss
607
 
608
  @torch.no_grad()
609
- def generate(self, x: torch.Tensor, max_tokens: int, temperature: float = 1.0, top_k: int = 50,
610
  use_cache: bool = False):
611
- """Generate text from x up to max_tokens."""
 
 
612
  for c_tkn_pos in range(max_tokens):
613
  if use_cache:
614
  if c_tkn_pos == 0:
@@ -629,265 +549,48 @@ class Transformer(nn.Module):
629
  return x
630
 
631
 
632
- # ==============================================================================
633
- # HuggingFace PreTrainedModel Wrappers
634
- # ==============================================================================
635
-
636
- class CustomTransformerPreTrainedModel(PreTrainedModel):
637
- """Base class for CustomTransformer models."""
638
- config_class = CustomTransformerConfig
639
- base_model_prefix = "transformer"
640
- supports_gradient_checkpointing = False
641
- _no_split_modules = ["Block"]
642
-
643
- def _init_weights(self, module):
644
- """Initialize weights - handled by model itself."""
645
- pass
646
-
647
-
648
- class CustomTransformerModel(CustomTransformerPreTrainedModel):
649
- """The bare CustomTransformer Model outputting raw hidden-states."""
650
-
651
- def __init__(self, config: CustomTransformerConfig):
652
- super().__init__(config)
653
- self.config = config
654
-
655
- self.transformer = Transformer(config)
656
-
657
- self.post_init()
658
-
659
- def get_input_embeddings(self):
660
- return self.transformer.tokens_embedding
661
-
662
- def set_input_embeddings(self, value):
663
- self.transformer.tokens_embedding = value
664
-
665
- def forward(
666
- self,
667
- input_ids: Optional[torch.LongTensor] = None,
668
- attention_mask: Optional[torch.FloatTensor] = None,
669
- position_ids: Optional[torch.LongTensor] = None,
670
- past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
671
- inputs_embeds: Optional[torch.FloatTensor] = None,
672
- use_cache: Optional[bool] = None,
673
- output_attentions: Optional[bool] = None,
674
- output_hidden_states: Optional[bool] = None,
675
- return_dict: Optional[bool] = None,
676
- ) -> Union[Tuple, BaseModelOutputWithPast]:
677
- """Forward pass returning raw hidden states."""
678
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
679
-
680
- # Unpad input_ids before embedding
681
- x_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len = self.transformer._unpad(input_ids, attention_mask)
682
-
683
- # Embed only real tokens
684
- x = self.transformer.tokens_embedding(x_unpadded)
685
- x = self.transformer.norm_embeddings(x)
686
-
687
- for block in self.transformer.blocks:
688
- x, _ = block(x, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, indices=indices, attn_mask=attn_mask)
689
-
690
- x = self.transformer.norm(x)
691
-
692
- # Repad once
693
- hidden_states = bert_padding.pad_input(x, indices, batch_size, seq_len)
694
-
695
- if not return_dict:
696
- return (hidden_states,)
697
-
698
- return BaseModelOutputWithPast(
699
- last_hidden_state=hidden_states,
700
- past_key_values=None,
701
- hidden_states=None,
702
- attentions=None,
703
- )
704
-
705
-
706
- class CustomTransformerForMaskedLM(CustomTransformerPreTrainedModel):
707
- """CustomTransformer Model with a masked language modeling head on top."""
708
- _tied_weights_keys = ["transformer.ll_head.weight", "transformer.tokens_embedding.weight"]
709
-
710
- def __init__(self, config: CustomTransformerConfig):
711
- super().__init__(config)
712
- self.config = config
713
-
714
- self.transformer = Transformer(config)
715
-
716
- self.post_init()
717
-
718
- def get_input_embeddings(self):
719
- return self.transformer.tokens_embedding
720
-
721
- def set_input_embeddings(self, value):
722
- self.transformer.tokens_embedding = value
723
-
724
- def get_output_embeddings(self):
725
- return self.transformer.ll_head
726
-
727
- def set_output_embeddings(self, new_embeddings):
728
- self.transformer.ll_head = new_embeddings
729
-
730
- def forward(
731
- self,
732
- input_ids: Optional[torch.LongTensor] = None,
733
- attention_mask: Optional[torch.FloatTensor] = None,
734
- position_ids: Optional[torch.LongTensor] = None,
735
- head_mask: Optional[torch.FloatTensor] = None,
736
- inputs_embeds: Optional[torch.FloatTensor] = None,
737
- labels: Optional[torch.LongTensor] = None,
738
- output_attentions: Optional[bool] = None,
739
- output_hidden_states: Optional[bool] = None,
740
- return_dict: Optional[bool] = None,
741
- ) -> Union[Tuple, MaskedLMOutput]:
742
- """Forward pass for masked language modeling."""
743
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
744
-
745
- logits, model_loss, ce_loss = self.transformer(
746
- input_ids, targets=labels, start_pos=0, attention_mask=attention_mask
747
- )
748
-
749
- masked_lm_loss = None
750
- if labels is not None:
751
- masked_lm_loss = model_loss
752
-
753
- if not return_dict:
754
- output = (logits,)
755
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
756
-
757
- return MaskedLMOutput(
758
- loss=masked_lm_loss,
759
- logits=logits,
760
- hidden_states=None,
761
- attentions=None,
762
- )
763
-
764
-
765
- class CustomTransformerForSequenceClassification(CustomTransformerPreTrainedModel):
766
- """CustomTransformer Model with a sequence classification head on top."""
767
-
768
- def __init__(self, config: CustomTransformerConfig):
769
- super().__init__(config)
770
- self.num_labels = config.num_labels
771
- self.config = config
772
-
773
- self.transformer = Transformer(config)
774
-
775
- # Classification head
776
- classifier_dropout = (
777
- config.classifier_dropout
778
- if config.classifier_dropout is not None
779
- else config.attention_probs_dropout_prob
780
- )
781
- self.dropout = nn.Dropout(classifier_dropout)
782
- self.classifier = nn.Linear(config.num_dims, config.num_labels)
783
-
784
- self._init_classifier_weights()
785
- self.post_init()
786
-
787
- def _init_classifier_weights(self):
788
- std = 0.02
789
- if isinstance(self.classifier, nn.Linear):
790
- self.classifier.weight.data.normal_(mean=0.0, std=std)
791
- if self.classifier.bias is not None:
792
- self.classifier.bias.data.zero_()
793
-
794
- def forward(
795
- self,
796
- input_ids: Optional[torch.LongTensor] = None,
797
- attention_mask: Optional[torch.FloatTensor] = None,
798
- position_ids: Optional[torch.LongTensor] = None,
799
- head_mask: Optional[torch.FloatTensor] = None,
800
- inputs_embeds: Optional[torch.FloatTensor] = None,
801
- labels: Optional[torch.LongTensor] = None,
802
- output_attentions: Optional[bool] = None,
803
- output_hidden_states: Optional[bool] = None,
804
- return_dict: Optional[bool] = None,
805
- ) -> Union[Tuple, SequenceClassifierOutput]:
806
- """Forward pass for sequence classification."""
807
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
808
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
809
- output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
810
-
811
- # Unpad input_ids before embedding
812
- x_unpadded, indices, cu_seqlens, max_seqlen, attn_mask, batch_size, seq_len = self.transformer._unpad(input_ids, attention_mask)
813
-
814
- # Embed only real tokens
815
- x = self.transformer.tokens_embedding(x_unpadded)
816
- x = self.transformer.norm_embeddings(x)
817
-
818
- # Collect hidden states if requested (repad each for the output tuple)
819
- all_hidden_states = () if output_hidden_states else None
820
-
821
- if output_hidden_states:
822
- all_hidden_states = all_hidden_states + (bert_padding.pad_input(x, indices, batch_size, seq_len),)
823
-
824
- for block in self.transformer.blocks:
825
- x, _ = block(x, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, indices=indices, attn_mask=attn_mask)
826
-
827
- if output_hidden_states:
828
- all_hidden_states = all_hidden_states + (bert_padding.pad_input(x, indices, batch_size, seq_len),)
829
-
830
- x = self.transformer.norm(x)
831
-
832
- # Repad once
833
- hidden_states = bert_padding.pad_input(x, indices, batch_size, seq_len)
834
-
835
- # Use [CLS] token representation (first token) for classification
836
- pooled_output = hidden_states[:, 0, :]
837
- pooled_output = self.dropout(pooled_output)
838
- logits = self.classifier(pooled_output)
839
-
840
- loss = None
841
- if labels is not None:
842
- if self.config.problem_type is None:
843
- if self.num_labels == 1:
844
- self.config.problem_type = "regression"
845
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
846
- self.config.problem_type = "single_label_classification"
847
- else:
848
- self.config.problem_type = "multi_label_classification"
849
-
850
- if self.config.problem_type == "regression":
851
- loss_fct = nn.MSELoss()
852
- if self.num_labels == 1:
853
- loss = loss_fct(logits.squeeze(), labels.squeeze())
854
- else:
855
- loss = loss_fct(logits, labels)
856
- elif self.config.problem_type == "single_label_classification":
857
- loss_fct = nn.CrossEntropyLoss()
858
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
859
- elif self.config.problem_type == "multi_label_classification":
860
- loss_fct = nn.BCEWithLogitsLoss()
861
- loss = loss_fct(logits, labels)
862
-
863
- if not return_dict:
864
- output = (logits,) + (all_hidden_states,) + (None,)
865
- return ((loss,) + output) if loss is not None else output
866
-
867
- return SequenceClassifierOutput(
868
- loss=loss,
869
- logits=logits,
870
- hidden_states=all_hidden_states,
871
- attentions=None,
872
- )
873
-
874
-
875
- # ==============================================================================
876
- # Auto-registration
877
- # ==============================================================================
878
-
879
- try:
880
- from transformers import AutoConfig, AutoModel, AutoModelForMaskedLM, AutoModelForSequenceClassification
881
-
882
- AutoConfig.register("custom_transformer", CustomTransformerConfig)
883
- AutoModel.register(CustomTransformerConfig, CustomTransformerModel)
884
- AutoModelForMaskedLM.register(CustomTransformerConfig, CustomTransformerForMaskedLM)
885
- AutoModelForSequenceClassification.register(CustomTransformerConfig, CustomTransformerForSequenceClassification)
886
- except Exception:
887
- pass
888
-
889
-
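For context, the auto_map entries and the registration block above are what allow this checkpoint to be loaded through the Auto classes. A rough usage sketch; the checkpoint path is a placeholder, not a real repo:

from transformers import AutoConfig, AutoModelForMaskedLM

config = AutoConfig.from_pretrained("path/to/checkpoint", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("path/to/checkpoint", trust_remote_code=True)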
890
  def main():
891
  pass
892
 
893
 
The listing below is the right-hand (new) side of the same diff: model.py as added in this commit.
9
  import torch._dynamo
10
  torch._dynamo.config.capture_scalar_outputs = True
11
 
12
+ import numpy as np
13
  import torch.nn as nn
14
  import torch.nn.functional as F
15
+ import random
16
+ import time
17
+ import math
18
+ import inspect
19
+ import os
20
  from dataclasses import dataclass
21
+ from huggingface_hub import PyTorchModelHubMixin
22
+ from typing import Optional
 
 
23
 
24
  import bert_padding
25
  from attention import FlexBertUnpadRopeAttention
 
28
  from torch.nn.parallel import DistributedDataParallel as DDP
29
  import torch.distributed as dist
30
 
31
 
 
32
  @dataclass
33
  class ModelConfig:
34
  vocab_size: int
35
 
36
+ num_dims: int # number of dimensions
37
+ num_heads: int # number of query heads
38
+ num_kv_heads: int # number of key/value heads
39
+ num_layers: int # total transformer layers
40
+ ffn_hidden_dims: int # hidden dimension for FFN/FFNwMoE
41
 
42
+ context_len: int # maximum context length
43
+ use_cache: bool # enable KV-caching
44
+ use_flash: bool # use Flash Attention
45
+ use_moe: bool # enable mixture-of-experts
46
 
47
+ moe_num_experts: int # total number of experts
48
+ moe_routed_experts: int # number of experts per token (top_k)
49
+ moe_eps: float = 1e-6 # epsilon for router stability
50
+ moe_aux_loss_coef: float = 0.01 # coefficient for auxiliary loss
51
+ moe_shared_experts: int = 0 # number of shared experts (DeepSeekMoE)
52
+ use_lossfreebalance: bool = False # use Auxiliary-loss-free load balancing strategy for mixture-of-experts from DeepSeek https://arxiv.org/pdf/2408.15664
53
 
54
  layernorm_eps: float = 1e-6
55
  rope_theta: float = 1e5
 
73
  num_attention_heads: Optional[int] = None
74
  embedding_size: Optional[int] = None
75
 
76
+ ffn_dim_multiplier: Optional[int] = None # optional multiplier to compute ffn_hidden_dims
77
 
78
  def __post_init__(self):
79
  if self.hidden_size is None:
 
86
  self.use_fa2 = self.use_flash
87
 
88
 
 
 
 
89
 
90
+ # Helper function for grouped-query attention: repeat each kv head n_times to match the query heads
91
+ def repeat_kv(vct: torch.Tensor, n_times: int):
92
+ c_batch_size, c_context_len, num_kv_heads, c_dim = vct.shape
93
+ if n_times == 1:
94
+ return vct
95
+ else:
96
+ return (
97
+ vct[:, :, :, None, :]
98
+ .expand(c_batch_size, c_context_len, num_kv_heads, n_times, c_dim)
99
+ .reshape(c_batch_size, c_context_len, num_kv_heads * n_times, c_dim)
100
+ )
101
 
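A quick shape check for repeat_kv above (toy sizes, assuming the function defined above is in scope): each kv head is duplicated n_times so keys/values line up with the query heads in grouped-query attention.

import torch

vct = torch.randn(2, 3, 2, 4)                    # (batch, seq_len, num_kv_heads, head_dim)
out = repeat_kv(vct, n_times=3)
print(out.shape)                                 # torch.Size([2, 3, 6, 4])
print(torch.equal(out[:, :, 0], out[:, :, 1]))   # True: both are copies of kv head 0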
102
+
103
+ class Rotary(nn.Module):
104
+ def __init__(self, config):
105
+ super(Rotary, self).__init__()
106
+
107
+ inv_freq = 1.0 / (config.rope_theta ** (torch.arange(0, config.num_dims // config.num_heads, 2).float() / (config.num_dims // config.num_heads)))
108
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
109
+ self.seq_len_saved = None
110
+ self.cos_saved = None
111
+ self.sin_saved = None
112
+
113
+ def forward(self, x, seq_dim=1):
114
+ seq_len = x.size(seq_dim)
115
+ # Only recompute the cosine and sine matrices if the sequence length has changed.
116
+ if seq_len != self.seq_len_saved:
117
+ self.seq_len_saved = seq_len
118
+ pos = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
119
+ # Compute the outer product between positions and inverse frequencies.
120
+ freqs = torch.einsum("i,j->ij", pos, self.inv_freq) # (seq_len, inv_freq.shape[0])
121
+ # Duplicate the freqs along the last dimension to create pairs.
122
+ emb = torch.cat((freqs, freqs), dim=-1)
123
+ self.cos_saved = emb.cos()
124
+ self.sin_saved = emb.sin()
125
+
126
+ return self.cos_saved, self.sin_saved
127
+
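A small standalone sketch of how the cos/sin tensors produced by Rotary are meant to be applied, using the same half-splitting convention as rotate_half/apply_rotary_pos further down (shapes are toy values):

import torch

def rotate_half(x):
    half = x.shape[-1] // 2
    return torch.cat([-x[..., half:], x[..., :half]], dim=-1)

head_dim, seq_len = 8, 4
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
pos = torch.arange(seq_len, dtype=torch.float32)
freqs = torch.einsum("i,j->ij", pos, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)             # (seq_len, head_dim)
q = torch.randn(1, seq_len, head_dim)
q_rot = q * emb.cos() + rotate_half(q) * emb.sin()
print(q_rot.shape)                                   # torch.Size([1, 4, 8])
print(torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1)))  # True: each (i, i+half) pair is rotated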
128
+
129
+ class Layernorm(torch.nn.Module):
130
+ def __init__(self, config):
131
+ super().__init__()
132
+ self.g = nn.Parameter(torch.ones(config.num_dims))
133
+ self.eps = config.layernorm_eps
134
+
135
+ def _norm(self, x):
136
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
137
+
138
+ def forward(self, x):
139
+ return self.g * self._norm(x.float()).type_as(x)
140
+
141
+
142
+ class GroupedQueryAttention(nn.Module):
143
+ def __init__(self, config):
144
+ super().__init__()
145
+ self.config = config
146
+ self.use_cache = config.use_cache
147
+ self.use_flash = config.use_flash
148
+
149
+ self.num_heads = config.num_heads
150
+ self.num_kv_heads = config.num_heads if config.num_kv_heads is None else config.num_kv_heads
151
+
152
+ self.num_rep = self.num_heads // self.num_kv_heads
153
+ self.head_dim = config.num_dims // self.num_heads
154
+
155
+ self.wq = nn.Linear(config.num_dims, config.num_dims, bias=False)
156
+ nn.init.normal_(self.wq.weight, mean=0, std=1/math.sqrt(config.num_dims))
157
+ self.wk = nn.Linear(config.num_dims, self.num_kv_heads * self.head_dim, bias=False)
158
+ nn.init.normal_(self.wk.weight, mean=0, std=1/math.sqrt(config.num_dims))
159
+ self.wv = nn.Linear(config.num_dims, self.num_kv_heads * self.head_dim, bias=False)
160
+ nn.init.normal_(self.wv.weight, mean=0, std=1/math.sqrt(config.num_dims))
161
+
162
+ self.wo = nn.Linear(config.num_dims, config.num_dims, bias=False)
163
+
164
+ self.cache_k = None
165
+ self.cache_v = None
166
+
167
+
168
+ def rotate_half(self, x):
169
+ half = x.shape[-1] // 2
170
+ first_half, second_half = x[..., :half], x[..., half:]
171
+ return torch.cat([-second_half, first_half], dim=-1)
172
+
173
+
174
+ def apply_rotary_pos(self, q, k, cos, sin):
175
+ q_rot = q * cos + self.rotate_half(q) * sin
176
+ k_rot = k * cos + self.rotate_half(k) * sin
177
+ return q_rot, k_rot
178
+
179
+ def update_kv_cache(self, batch_size, start_pos, context_len, keys, values, device):
180
+ # Initialize cache if not exist
181
+ if self.cache_k is None:
182
+ self.cache_k = torch.zeros(
183
+ (batch_size, self.config.context_len, self.num_kv_heads, self.head_dim),
184
+ device=device
185
+ )
186
+ self.cache_v = torch.zeros(
187
+ (batch_size, self.config.context_len, self.num_kv_heads, self.head_dim),
188
+ device=device
189
+ )
190
+
191
+ # Update cache
192
+ self.cache_k[:batch_size, start_pos:start_pos + context_len] = keys
193
+ self.cache_v[:batch_size, start_pos:start_pos + context_len] = values
194
+
195
+ return (self.cache_k[:batch_size, :start_pos + context_len],
196
+ self.cache_v[:batch_size, :start_pos + context_len])
197
+
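A toy illustration (hypothetical sizes) of the cache update pattern above: each step writes the new keys/values at [start_pos : start_pos + context_len] and returns the prefix [: start_pos + context_len] for attention.

import torch

max_len, kv_heads, head_dim = 8, 2, 4
cache_k = torch.zeros(1, max_len, kv_heads, head_dim)
start_pos = 3
new_k = torch.ones(1, 1, kv_heads, head_dim)      # keys for one newly generated token
cache_k[:, start_pos:start_pos + 1] = new_k
print(cache_k[:, :start_pos + 1].shape)           # torch.Size([1, 4, 2, 4])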
198
+
199
+ def forward(self, x, cos, sin, start_pos = 0):
200
+ c_batch_size, c_context_len, c_dim = x.shape # c_context_len == 1 when decoding with the KV cache
201
+
202
+ if self.use_cache and c_context_len == 1:
203
+ # Cache branch
204
+ q = self.wq(x[:, -1, :])
205
+ k = self.wk(x[:, -1, :])
206
+ v = self.wv(x[:, -1, :])
207
+
208
+ q = q.view(c_batch_size, c_context_len, self.num_heads, self.head_dim).transpose(1, 2) # B, T, qh, hs
209
+ k = k.view(c_batch_size, c_context_len, self.num_kv_heads, self.head_dim).transpose(1, 2) # B, T, kh, hs
210
+ v = v.view(c_batch_size, c_context_len, self.num_kv_heads, self.head_dim).transpose(1, 2) # B, T, vh, hs
211
+
212
+ # freqs_complex = freqs_complex[-1:]
213
+ # queries = apply_rotary_pos(q, freqs_complex, device=x.device)
214
+ # keys = apply_rotary_pos(k, freqs_complex, device=x.device)
215
+
216
+ queries, keys = self.apply_rotary_pos(q, k, cos, sin)
217
+ keys, v = self.update_kv_cache(batch_size=c_batch_size, start_pos=start_pos, context_len=c_context_len, keys=keys, values=v, device=x.device)
218
+
219
+ else:
220
+ # Non-cache branch (process the entire sequence normally)
221
+ q = self.wq(x)
222
+ k = self.wk(x)
223
+ v = self.wv(x)
224
+
225
+ q = q.view(c_batch_size, c_context_len, self.num_heads, self.head_dim).transpose(1, 2) # B, qh, T, hs
226
+ k = k.view(c_batch_size, c_context_len, self.num_kv_heads, self.head_dim).transpose(1, 2) # B, kh, T, hs
227
+ v = v.view(c_batch_size, c_context_len, self.num_kv_heads, self.head_dim).transpose(1, 2) # B, vh, T, hs
228
+
229
+ queries, keys = self.apply_rotary_pos(q, k, cos, sin)
230
+
231
+ # queries = apply_rotary_pos(q, freqs_complex, device=x.device)
232
+ # keys = apply_rotary_pos(k, freqs_complex, device=x.device)
233
+
234
+ if self.use_cache: _k, _v = self.update_kv_cache(batch_size=c_batch_size, start_pos=start_pos, context_len=c_context_len, keys=keys, values=v, device=x.device)
235
+
236
+ if self.use_flash:
237
+ output = F.scaled_dot_product_attention(queries, keys, v, is_causal=True, enable_gqa=True)
238
+
239
+ else: # Calculate Grouped Query Attention manually
240
+ keys = repeat_kv(keys, self.num_rep)
241
+ values = repeat_kv(v, self.num_rep)
242
+
243
+ attention = torch.matmul(queries, keys.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
244
+
245
+ if self.use_cache and x.shape[1] == 1:
246
+ total_length = keys.size(2)
247
+ # For autoregressive generation, the query (which is at the latest position) should only attend to keys at indices <= current token.
248
+ # Create a mask: allowed positions are indices < total_length (i.e. all in the cache)
249
+ mask = torch.arange(total_length, device=attention.device).unsqueeze(0) <= (start_pos + x.shape[1] - 1)
250
+ mask = mask.unsqueeze(0).unsqueeze(0) # shape: (1, 1, 1, total_length)
251
+ attention = attention.masked_fill(~mask, float("-inf"))
252
+ attention = F.softmax(attention, dim=-1)
253
+ output = torch.matmul(attention, values)
254
+
255
+ else: # Do not use kv_cache
256
+ attention = torch.tril(attention[:, :, :c_context_len, :c_context_len])
257
+ attention = attention.masked_fill(attention == 0, float("-inf"))
258
+
259
+ attention = F.softmax(attention, dim=-1).type_as(queries)
260
+ output = torch.matmul(attention, values)
261
+
262
+ output = output.transpose(2, 1).contiguous().view(c_batch_size, c_context_len, c_dim)
263
+ return self.wo(output)
264
+
265
+
266
+ class FlexBertUnpadAttention(nn.Module):
267
  def __init__(self, config, layer_id: Optional[int] = None):
268
  super().__init__()
269
  self.attn = FlexBertUnpadRopeAttention(config=config, layer_id=layer_id)
270
 
271
+ def forward(self, x: torch.Tensor):
272
+ batch_size, seq_len, _ = x.shape
273
+ attn_mask = torch.ones((batch_size, seq_len), device=x.device, dtype=torch.int32)
274
+ hidden_states, indices, cu_seqlens, max_seqlen = bert_padding.unpad_input(x, attn_mask)
275
+ attn_out = self.attn(
276
  hidden_states=hidden_states,
277
  cu_seqlens=cu_seqlens,
278
  max_seqlen=max_seqlen,
279
  indices=indices,
280
  attn_mask=attn_mask,
281
  )
282
+ return bert_padding.pad_input(attn_out, indices, batch_size, seq_len)
283
 
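A minimal sketch of the unpad -> attend -> pad roundtrip used in the forward above, relying on the repo-local bert_padding helpers (flash-attn style unpad_input/pad_input; tensor sizes are made up):

import torch
import bert_padding   # local module, imported at the top of model.py

batch, seq_len, dim = 2, 4, 8
hidden = torch.randn(batch, seq_len, dim)
attn_mask = torch.tensor([[1, 1, 1, 0],
                          [1, 1, 0, 0]], dtype=torch.int32)   # 3 + 2 real tokens
unpadded, indices, cu_seqlens, max_seqlen = bert_padding.unpad_input(hidden, attn_mask)
print(unpadded.shape)    # (5, 8): only the real tokens survive
print(cu_seqlens)        # tensor([0, 3, 5], ...): per-sequence boundaries
repadded = bert_padding.pad_input(unpadded, indices, batch, seq_len)
print(repadded.shape)    # back to (2, 4, 8), padded positions restored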
284
 
285
  class FeedForward(nn.Module):
286
+ """
287
+ Default Feed Forward Layer.
288
+ """
289
  def __init__(self, config):
290
  super().__init__()
291
 
 
295
  self.w2 = nn.Linear(self.hidden_dim, config.num_dims, bias=False)
296
  self.w3 = nn.Linear(config.num_dims, self.hidden_dim, bias=False)
297
  self.act = nn.GELU()
 
298
  def forward(self, x: torch.Tensor):
299
  return self.w2(self.act(self.w1(x)) * self.w3(x)), None
300
 
301
 
302
+ class FFNwMoE(nn.Module):
303
  """
304
  Feed Forward with MoE with optional shared experts.
305
  Returns after forward:
306
  output: Combined outputs from experts
307
  aux_loss: Auxiliary loss tensor or routing metadata
308
  """
309
+ def __init__(self, config: ModelConfig):
310
  super().__init__()
311
  self.hidden_dim = config.ffn_hidden_dims
 
312
 
313
+ self.moe_routed_experts = config.moe_routed_experts # top_k
314
  self.moe_aux_loss_coef = config.moe_aux_loss_coef
315
  self.moe_eps = config.moe_eps
316
  self.moe_shared_experts = config.moe_shared_experts
317
  self.num_experts = config.moe_num_experts
318
 
319
+ self.use_lossfreebalance = config.use_lossfreebalance
320
321
 
322
+ self.router = nn.Linear(config.num_dims, self.num_experts, bias=False)
323
+ self.experts = nn.ModuleList()
324
+ for _ in range(self.num_experts):
325
+ self.experts.append(
326
+ nn.ModuleList([
327
+ nn.Linear(config.num_dims, self.hidden_dim, bias=False),
328
+ nn.Linear(self.hidden_dim, config.num_dims, bias=False),
329
+ nn.Linear(config.num_dims, self.hidden_dim, bias=False)
330
+ ]))
331
+
332
  # shared experts (for DeepSeekMoE)
333
  self.shared_experts = nn.ModuleList()
334
  for _ in range(self.moe_shared_experts):
 
338
  nn.Linear(self.hidden_dim, config.num_dims, bias=False),
339
  nn.Linear(config.num_dims, self.hidden_dim, bias=False)
340
  ]))
341
+
342
+ # Auxiliary-loss-free load balancing strategy for mixture-of-experts from DeepSeek https://arxiv.org/pdf/2408.15664
343
  if self.use_lossfreebalance:
344
  self.expert_biases = nn.Parameter(torch.zeros(self.num_experts))
345
+
346
  def forward(self, x: torch.Tensor):
347
+ c_batch_size, c_context_len, c_dim = x.shape
348
+ x_flat = x.view(-1, c_dim) # (c_batch_size * c_context_len, c_dim)
349
 
350
  router_out = self.router(x_flat)
351
+ router_probs = F.softmax(router_out, dim=-1)
352
 
353
  _, topk_indices = router_out.topk(self.moe_routed_experts, dim=-1)
354
  self.last_topk_indices = topk_indices.detach()
 
357
 
358
  output = self._compute_expert_outputs(x_flat, topk_indices, topk_probs, router_probs)
359
 
360
+ return output.view(c_batch_size, c_context_len, c_dim), aux_loss
 
 
 
361
 
362
  def _compute_aux_loss(self, router_out, router_probs, topk_indices):
363
+ """
364
+ Computes the auxiliary loss based on whether loss-free balancing is used or not.
365
+ """
366
  if not self.use_lossfreebalance:
367
  topk_probs, _ = router_probs.topk(self.moe_routed_experts, dim=-1)
368
  expert_mask = F.one_hot(topk_indices[:, 0], self.num_experts).float()
 
370
  router_prob_mean = router_probs.mean(dim=0)
371
  aux_loss = self.moe_aux_loss_coef * torch.sum(density * router_prob_mean) * self.num_experts
372
 
373
+ else: # if use_lossfreebalance
374
  router_out = router_out + self.expert_biases
375
+ router_probs = torch.sigmoid(router_out) # from https://arxiv.org/pdf/2408.15664 paper
376
  topk_probs = router_probs.gather(-1, topk_indices)
377
  topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)
378
 
379
+ # In the case of Auxiliary-loss-free load balancing we pass router_probs, topk_indices as aux_loss for further calculations
380
  aux_loss = (router_probs, topk_indices)
381
  return aux_loss, topk_probs
382
 
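For the standard (non-loss-free) branch above, a self-contained sketch of the auxiliary balancing loss: the fraction of tokens assigned to each expert times the mean router probability, summed and scaled. The density line itself is elided by this hunk, so its exact form is an assumption here:

import torch
import torch.nn.functional as F

def aux_balance_loss(router_probs, topk_indices, num_experts, coef=0.01):
    expert_mask = F.one_hot(topk_indices[:, 0], num_experts).float()  # top-1 assignment per token
    density = expert_mask.mean(dim=0)              # assumed: fraction of tokens sent to each expert
    router_prob_mean = router_probs.mean(dim=0)    # mean router probability per expert
    return coef * torch.sum(density * router_prob_mean) * num_experts

router_logits = torch.randn(8, 4)                  # 8 tokens, 4 experts (toy sizes)
router_probs = F.softmax(router_logits, dim=-1)
_, topk_indices = router_logits.topk(1, dim=-1)
print(aux_balance_loss(router_probs, topk_indices, num_experts=4))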
383
  def _compute_expert_outputs(self, x_flat, topk_indices, topk_probs, router_probs):
384
  """
385
+ Compute the output of the experts and shared experts if needed
386
+ """
387
  output = torch.zeros_like(x_flat)
 
388
 
389
+ for i in range(self.moe_routed_experts):
390
+ expert_index = topk_indices[:, i]
391
+ expert_probs = topk_probs[:, i]
392
+
393
+ for expert_id in range(self.num_experts):
394
+ idx = (expert_id == expert_index).nonzero().squeeze()
395
+
396
+ if idx.numel() == 0:
397
+ continue
398
+ x_for_expert = x_flat[idx]
399
+ w1, w2, w3 = self.experts[expert_id]
400
+
401
+ expert_output = w2(F.gelu(w1(x_for_expert)) * w3(x_for_expert))
402
+ output[idx] += expert_output * expert_probs[idx].unsqueeze(-1)
403
+
404
+ # shared experts (for DeepSeekMoE)
405
  for shared_expert_id in range(self.moe_shared_experts):
406
  w1, w2, w3 = self.shared_experts[shared_expert_id]
407
  expert_output = w2(F.gelu(w1(x_flat)) * w3(x_flat))
408
  output = output + expert_output
409
+
410
  return output
411
 
412
 
413
  class Block(nn.Module):
414
  def __init__(self, config, layer_id: Optional[int] = None):
415
  super().__init__()
416
  self.is_first_block = (layer_id == 0)
 
421
  else:
422
  self.ffn = FeedForward(config)
423
 
 
 
424
 
425
+ self.norm_attention = nn.LayerNorm(config.num_dims, eps=config.layernorm_eps)
426
+ self.norm_ffn = nn.LayerNorm(config.num_dims, eps=config.layernorm_eps)
427
+
428
+ def forward(self, x, start_pos):
429
+ _ = start_pos
430
  if self.is_first_block:
431
  attn_in = x
432
  else:
433
  attn_in = self.norm_attention(x)
 
434
  x = x + self.attention(
435
+ attn_in
436
+ )
437
+
 
 
 
 
438
  ffn_out, aux_loss = self.ffn(
439
  self.norm_ffn(x)
440
+ )
441
  x = x + ffn_out
442
  return x, aux_loss
443
+
444
 
445
+ class Transformer(nn.Module, PyTorchModelHubMixin): # extend PyTorchModelHubMixin so weights can be saved as safetensors
446
+ def __init__(self, config: ModelConfig):
447
  super().__init__()
448
+ if isinstance(config, dict):
449
+ config = ModelConfig(**config)
450
 
451
  self.vocab_size = config.vocab_size
452
  self.num_dims = config.num_dims
 
456
  self.use_lossfreebalance = config.use_lossfreebalance and self.use_moe
457
 
458
  self.num_layers = config.num_layers
459
+
460
+ # Calculation of hidden_dim for FFN/FFNwMoE
461
+ # multiple_of = 4
462
+ # ffn_dim_multiplier = config.ffn_dim_multiplier
463
  hidden_dim = 4 * config.num_dims
464
+ # hidden_dim = int(2 * config.num_dims / 3)
465
+
466
+ # if ffn_dim_multiplier is not None:
467
+ # hidden_dim = int(ffn_dim_multiplier * hidden_dim)
468
 
469
+ # config.ffn_hidden_dims = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
470
+ self.tokens_embedding = nn.Embedding(self.vocab_size, self.num_dims)
471
+ self.norm_embeddings = nn.LayerNorm(config.num_dims, eps=config.layernorm_eps)
472
 
473
  self.blocks = nn.ModuleList()
474
  for layer_id in range(self.num_layers):
475
  self.blocks.append(Block(config, layer_id=layer_id))
476
 
477
+ self.norm = nn.LayerNorm(config.num_dims, eps=config.layernorm_eps)
478
+ self.ll_head = nn.Linear(self.num_dims, self.vocab_size, bias=False)
479
+
480
 
481
  self.tokens_embedding.weight = self.ll_head.weight
482
+ # torch.nn.init.normal_(self.ll_head.weight, mean=0.0, std=0.02)
483
+ # torch.nn.init.normal_(self.tokens_embedding.weight, mean=0.0, std=0.02)
484
 
485
+ # self.freqs_complex = None # precompute_theta_pos_frequencies(self.num_dims // self.num_heads, self.context_len * 2, device=config.device)
 
486
 
 
 
487
 
 
 
 
488
489
 
490
+ def forward(self, x: torch.Tensor, targets: Optional[torch.Tensor] = None, start_pos: int = 0):
491
+ x = self.tokens_embedding(x)
492
  x = self.norm_embeddings(x)
493
+
494
+ # if self.freqs_complex == None:
495
+ # self.freqs_complex = precompute_theta_pos_frequencies(self.num_dims // self.num_heads, self.context_len * 2, device=x.device)
496
+ # freqs_complex = self.freqs_complex[start_pos:start_pos + seq_len]
497
+
498
  total_aux_loss = 0
499
 
500
  for block in self.blocks:
501
+ x, aux_loss = block(x, start_pos=start_pos)
502
  if self.use_moe and not self.use_lossfreebalance:
503
  total_aux_loss += aux_loss
504
+
505
  x = self.norm(x)
 
 
 
 
506
  logits = self.ll_head(x)
507
+
508
+
509
  if targets is None:
510
  loss = None
511
  ce_loss = None
512
  else:
513
  c_batch_size, c_context_len, c_dim = logits.shape
514
+ logits = logits.view(c_batch_size*c_context_len, c_dim)
515
+ targets = targets.view(c_batch_size*c_context_len)
516
  ce_loss = F.cross_entropy(logits, targets)
517
+
518
+ if self.use_moe and not self.use_lossfreebalance: loss = ce_loss + total_aux_loss # here ce_loss is the loss without the aux_loss term
519
+ else: # with Auxiliary-loss-free load balancing, (router_probs, topk_indices) are passed through as ce_loss
520
+ # This branch also covers the case where MoE is not used
521
  loss = ce_loss
522
  ce_loss = aux_loss
523
 
524
  return logits, loss, ce_loss
525
 
526
  @torch.no_grad()
527
+ def generate(self, x: torch.Tensor, max_tokens: int, temperature: float = 1.0, top_k: int = 50,
528
  use_cache: bool = False):
529
+ """
530
+ Generate text from x up to max_tokens
531
+ """
532
  for c_tkn_pos in range(max_tokens):
533
  if use_cache:
534
  if c_tkn_pos == 0:
 
549
  return x
550
 
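The body of generate() is largely elided in this diff, so the following is only a hedged sketch of what one temperature/top-k sampling step typically looks like with these parameters:

import torch
import torch.nn.functional as F

logits = torch.randn(1, 50304)                 # logits for the last position (toy values)
temperature, top_k = 1.0, 50
topk_vals, topk_idx = (logits / temperature).topk(top_k, dim=-1)
probs = F.softmax(topk_vals, dim=-1)
next_token = topk_idx.gather(-1, torch.multinomial(probs, num_samples=1))
print(next_token.shape)                        # torch.Size([1, 1])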
551
 
552
  def main():
553
+ # config = ModelConfig(
554
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu',
555
+ # vocab_size = 50304,
556
+
557
+ # num_dims = 1024,
558
+ # num_heads = 16,
559
+ # num_kv_heads = 4,
560
+ # num_layers = 16,
561
+ # ffn_hidden_dims = 1024 * 4,
562
+
563
+ # layernorm_eps = 1e-6,
564
+ # rope_theta = 1e5,
565
+
566
+ # context_len = 1024,
567
+
568
+ # use_cache = False,
569
+ # use_flash = False,
570
+ # use_moe = False,
571
+
572
+ # moe_num_experts = 6,
573
+ # moe_routed_experts = 1,
574
+ # moe_eps = 1e-6,
575
+ # moe_aux_loss_coef = 0.01,
576
+ # moe_shared_experts = 0,
577
+ # use_lossfreebalance = False,
578
+
579
+ # )
580
+
581
+
582
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu'
583
+ # SEED = 1337
584
+
585
+ # torch.manual_seed(SEED)
586
+ # if device == 'cuda':
587
+ # torch.cuda.manual_seed(SEED)
588
+
589
+ # model = Transformer(config)
590
+ # model = model.to(device)
591
+ # model = torch.compile(model)
592
+
593
+ # print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')
594
  pass
595
 
596