File size: 29,955 Bytes

943bd92

import math
import inspect
from dataclasses import dataclass
from contextlib import nullcontext

import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Tuple
import inspect

from transformers.modeling_outputs import CausalLMOutput
from manager import MANAGER

# Seed torch's global RNG as a side effect of importing this module, so that
# random draws made afterwards (e.g. weight initialisation) are repeatable.
torch.manual_seed(101)

def precompute_freqs_cis(config):
    """Build the rotary-embedding lookup tables as real cos / sin tensors.

    Reads ``config.theta``, ``config.d_rotate`` and ``config.block_size``.

    Returns:
        ``(cos, sin)``: two tensors with ``config.block_size`` rows. Each
        per-pair angle is repeated twice along the last axis, giving
        ``2 * (d_rotate // 2)`` columns laid out as
        ``[a0, a0, a1, a1, ...]`` to line up with interleaved pairs.
    """
    rot_dim = config.d_rotate

    # One inverse frequency per rotated pair: theta ** (-2i / d_rotate).
    pair_exponent = torch.arange(0, rot_dim, 2)[: (rot_dim // 2)].float() / rot_dim
    inv_freq = 1.0 / (config.theta ** pair_exponent)

    # angle[p, i] = position p * inverse frequency i  -> [seq_len, d_rotate/2]
    positions = torch.arange(config.block_size, device=inv_freq.device)
    angles = torch.outer(positions, inv_freq).float()

    # Duplicate every column so both members of a pair share the same angle.
    cos_table = torch.repeat_interleave(torch.cos(angles), 2, dim=-1)
    sin_table = torch.repeat_interleave(torch.sin(angles), 2, dim=-1)
    return cos_table, sin_table

def rotate_half(x):
    """Rotate adjacent pairs of the last axis by 90 degrees.

    The last axis is read as interleaved pairs ``(x0, x1), (x2, x3), ...``
    and each pair ``(a, b)`` is mapped to ``(-b, a)``, so
    ``[x0, x1, x2, x3]`` becomes ``[-x1, x0, -x3, x2]``.
    """
    first_of_pair = x[..., 0::2]
    second_of_pair = x[..., 1::2]
    # Stack as (..., n_pairs, 2) then merge the last two axes to re-interleave.
    swapped = torch.stack((-second_of_pair, first_of_pair), dim=-1)
    return swapped.flatten(-2)

def apply_rotary_emb(xq, xk, freqs_cos, freqs_sin):
    """Apply rotary position embedding to a query / key pair.

    ``xq`` fixes the geometry: its axis 1 is used as the sequence length and
    its last axis as the rotary width. The first ``seq_len`` rows of the
    tables are viewed as ``[1, seq_len, 1, width]`` so that they broadcast
    over the remaining axes of ``xq`` and ``xk``.

    Returns:
        ``(xq_out, xk_out)``, each cast back to the dtype of its input.
    """
    seq_len, rot_width = xq.shape[1], xq.shape[-1]
    table_shape = (1, seq_len, 1, rot_width)
    cos = freqs_cos[:seq_len].view(table_shape)
    sin = freqs_sin[:seq_len].view(table_shape)

    def _rotate(t):
        # RoPE: t * cos + rotate_half(t) * sin
        return ((t * cos) + (rotate_half(t) * sin)).type_as(t)

    return _rotate(xq), _rotate(xk)

class MultiHeadLatentAttention(nn.Module):
    """Causal multi-head attention driven by low-rank latents plus a rotary part.

    The input is down-projected once into a query latent (``d_c1``), a
    key/value latent (``d_c``) and a single rotary key (``d_rotate``) that is
    shared by every head. The latents are RMS-normalised and up-projected into
    per-head content queries / keys / values and per-head rotary queries.
    Rotary embedding is applied to the rotary parts only; they are then
    concatenated onto the content parts of Q and K before causal attention.

    The ``is_attention`` / ``output_proj_marker`` attributes set on the linear
    layers are plain markers that ``GPT._init_weights`` checks with
    ``hasattr`` to choose an initialisation scale.
    """

    def __init__(self, config):
        super().__init__()
        use_bias = config.bias

        self.d_model = config.n_embd
        self.num_head = config.n_head
        self.d_head = self.d_model // self.num_head

        self.d_c = config.d_c
        self.d_c1 = config.d_c1
        self.d_rotate = config.d_rotate

        # Fused down-projection of x:
        #   [ query latent (d_c1) | key/value latent (d_c) | shared rotary key (d_rotate) ]
        self.W_down = nn.Linear(
            self.d_model,
            self.d_c1 + self.d_c + self.d_rotate,
            bias=use_bias,
        )
        # Fused query up-projection:
        #   [ content query (d_model) | rotary query (num_head * d_rotate) ]
        self.W_up_q = nn.Linear(
            self.d_c1,
            self.d_model + (self.num_head * self.d_rotate),
            bias=use_bias,
        )
        # Fused key/value up-projection: [ key content (d_model) | value content (d_model) ]
        # K and V share one matmul but use disjoint output rows of the weight.
        self.W_up_kv = nn.Linear(self.d_c, 2 * self.d_model, bias=use_bias)
        for fused_proj in (self.W_down, self.W_up_q, self.W_up_kv):
            fused_proj.is_attention = True

        # Normalisation of the two latents before they are up-projected.
        self.q_norm = nn.RMSNorm(self.d_c1)
        self.kv_norm = nn.RMSNorm(self.d_c)

        # Final projection back onto the residual stream.
        self.output_proj = nn.Linear(self.d_model, self.d_model, bias=use_bias)
        self.output_proj.output_proj_marker = True
        self.output_proj.is_attention = True

        # `dropout` is only applied to the attention weights of the manual
        # fallback path; the fused kernel receives `attn_dropout_p` instead.
        self.dropout = nn.Dropout(config.dropout)
        self.attn_dropout_p = config.dropout

        # Use the fused kernel whenever this torch build provides it.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')

        # Rotary tables are recomputed at construction, so keep them out of the state dict.
        rope_cos, rope_sin = precompute_freqs_cis(config)
        self.register_buffer("freqs_cos", rope_cos, persistent=False)
        self.register_buffer("freqs_sin", rope_sin, persistent=False)

    def forward(self, x):
        """Run causal attention over ``x`` of shape ``[batch, seq, d_model]``.

        Returns a tensor of the same shape.
        """
        batch_size, seq_len, _ = x.shape
        head_prefix = (batch_size, seq_len, self.num_head)

        # 1. Down-project once and carve out the three latents.
        latent_q, latent_kv, rope_k = torch.split(
            self.W_down(x), [self.d_c1, self.d_c, self.d_rotate], dim=-1
        )
        latent_q = self.q_norm(latent_q)
        latent_kv = self.kv_norm(latent_kv)

        # 2. Query side: content part and rotary part, both per head.
        q_content, rope_q = torch.split(
            self.W_up_q(latent_q),
            [self.d_model, self.num_head * self.d_rotate],
            dim=-1,
        )
        q_content = q_content.view(*head_prefix, self.d_head)
        rope_q = rope_q.view(*head_prefix, self.d_rotate)

        # 3. Key / value content, per head.
        k_content, v_content = torch.split(
            self.W_up_kv(latent_kv), [self.d_model, self.d_model], dim=-1
        )
        k_content = k_content.view(*head_prefix, self.d_head)
        v_content = v_content.view(*head_prefix, self.d_head)

        # The single rotary key is broadcast (not copied) to every head.
        rope_k = rope_k.view(batch_size, seq_len, 1, self.d_rotate)
        rope_k = rope_k.expand(-1, -1, self.num_head, -1)

        # 4. Rotate the positional parts, then glue them onto the content parts.
        rope_q, rope_k = apply_rotary_emb(rope_q, rope_k, self.freqs_cos, self.freqs_sin)

        # -> [batch, head, seq, d_head + d_rotate] for Q/K, [batch, head, seq, d_head] for V
        Q = torch.cat([q_content, rope_q], dim=-1).transpose(1, 2)
        K = torch.cat([k_content, rope_k], dim=-1).transpose(1, 2)
        V = v_content.transpose(1, 2)

        if self.flash:
            drop_p = self.attn_dropout_p if self.training else 0.0
            att_output = F.scaled_dot_product_attention(
                Q, K, V, dropout_p=drop_p, is_causal=True
            )
        else:
            # Manual path: scaled scores, lower-triangular mask, softmax, dropout.
            scale = 1.0 / math.sqrt(self.d_head + self.d_rotate)
            scores = (Q @ K.transpose(-2, -1)) * scale
            causal = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
            causal = causal.view(1, 1, seq_len, seq_len)
            scores = scores.masked_fill(causal == 0, float('-inf'))
            weights = self.dropout(F.softmax(scores, dim=-1))
            att_output = weights @ V

        # Merge the heads back into d_model and project out.
        merged = att_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.output_proj(merged)

class Router(nn.Module):
    """(Noisy) top-k token router with a fixed per-expert capacity.

    For every token the router scores all experts, keeps the ``top_k`` best,
    and assigns the token a slot in each chosen expert's buffer. Tokens that
    arrive after an expert's buffer is full are dropped for that expert.

    Config fields read: ``top_k``, ``n_exp``, ``n_embd``, ``use_noisy_top_k``,
    ``train_capacity``, ``eval_capacity``, ``min_capacity``,
    ``router_use_full_prec``, ``use_aux_loss``, ``use_router_z_loss``.

    Side effects: when enabled, the auxiliary and z losses of each forward
    call are handed to the module-level ``MANAGER``.
    """

    def __init__(self, config):
        super().__init__()

        # router settings
        self.top_k = config.top_k
        self.n_exp = config.n_exp
        assert 1 <= self.top_k <= config.n_exp
        self.use_noisy_top_k = config.use_noisy_top_k
        self.train_capacity = config.train_capacity
        self.eval_capacity = config.eval_capacity
        self.min_capacity = config.min_capacity
        self.router_use_full_prec = config.router_use_full_prec

        # auxiliary / load balancing loss settings
        self.use_aux_loss = config.use_aux_loss
        self.use_router_z_loss = config.use_router_z_loss

        # linear projection for (noisy) softmax gating
        # no bias is used, see page 4 eq (4) in (https://arxiv.org/abs/1701.06538)
        self.w_g = nn.Linear(config.n_embd, config.n_exp, bias=False)
        # marker checked by GPT._init_weights to give routers a small init std
        self.w_g.router_marker = True
        self.w_noise = nn.Linear(config.n_embd, config.n_exp, bias=False) if self.use_noisy_top_k else None

    def forward(self, x):
        """Route every token of ``x`` (``[B, T, n_embd]``) to its top-k experts.

        Returns:
            used_capacity: ``[n_exp]`` number of slots filled per expert.
            cb_weight: ``[B * T, n_exp, exp_capacity]`` combine weights; the
                router probability of a token at its slot, zero elsewhere.
            sec_mask: ``[B * T, n_exp, exp_capacity]`` boolean dispatch mask
                (``cb_weight != 0``).
        """
        # Optionally run the router with autocast disabled to avoid instability
        # during training, see discussion on pg. 9 of https://arxiv.org/abs/2101.03961.
        # Disabling autocast stops the gating ops from being down-cast; they then
        # run in the dtype of their inputs and weights.
        # NOTE(review): the device type is derived from CUDA availability, not from
        # x.device — confirm this is what is wanted when the model runs on CPU on a
        # machine that also has CUDA.
        device_type = 'cuda' if torch.cuda.is_available() else 'cpu'  # for torch.autocast
        ctx = nullcontext() if not self.router_use_full_prec else torch.amp.autocast(device_type=device_type, enabled=False)

        with ctx:
            B, T, _ = x.size()
            num_tokens = B * T

            # eq (4) in (https://arxiv.org/abs/1701.06538)
            logits = self.w_g(x)  # [B, T, n_exp]
            if self.use_noisy_top_k:
                # optionally add learned-scale gaussian noise to the logits
                noise = F.softplus(self.w_noise(x))
                noise *= torch.randn_like(noise)
                logits += noise

            # router z loss, computed on logits (before softmax)
            # this loss prevents router logits from becoming too large
            if self.use_router_z_loss:
                z_loss = self.compute_router_z_loss(logits)
                MANAGER.add_router_z_loss(z_loss)

            # find top k experts for each token
            top_k_logits, top_k_indices = logits.topk(self.top_k, dim=-1)  # [B, T, k]

            # Expert probabilities are normalised over ALL experts, not only
            # the selected top-k. (A softmax restricted to the top-k logits, as
            # in Shazeer et al. eq (3)-(5), used to be computed here as well,
            # but its result was immediately overwritten by the line below, so
            # that dead computation has been removed.)
            router_probs = F.softmax(logits, dim=-1)  # [B, T, n_exp]

            # compute auxiliary load balancing loss
            # this loss encourages equal probability assigned to each expert
            # and equal load balancing of tokens assigned to each expert
            if self.use_aux_loss:
                aux_loss = self.compute_aux_loss(router_probs, top_k_indices)
                MANAGER.add_aux_loss(aux_loss)

            # compute expert capacity
            exp_capacity = self.get_capacity(num_tokens)

            # make a multi-hot mask of chosen experts
            # entries are 0 if expert not chosen and 1 if expert chosen
            exp_mask = F.one_hot(top_k_indices, num_classes=self.n_exp)  # [B, T, k, n_exp]
            exp_mask = exp_mask.view(num_tokens, self.top_k, self.n_exp)  # [B * T, k, n_exp]
            exp_mask = exp_mask.permute(1, 0, 2)  # [k, B * T, n_exp]

            # compute cumulative sum of each token over experts, this stores
            # the index of each token within the batch of each expert
            # NOTE: cumsum should count all top-1 first, top-2 second, etc.
            # so that we prioritize top experts when dropping tokens (this is
            # done by putting k dimension first for the reshape operation)
            exp_rank = exp_mask.reshape(self.top_k * num_tokens, self.n_exp)  # [k * B * T, n_exp]
            exp_rank = torch.cumsum(exp_rank, dim=0) - 1  # 0-based running count per expert
            exp_rank = exp_rank.reshape(self.top_k, num_tokens, self.n_exp)  # [k, B * T, n_exp]

            # mask out (set to zero) entries that go beyond expert capacity
            # compute amount of used capacity by taking a sum over mask
            exp_mask *= torch.lt(exp_rank, exp_capacity)  # [k, B * T, n_exp]
            used_capacity = torch.sum(exp_mask, dim=(0, 1))  # [n_exp]

            # keep the rank only where the token was actually kept, then sum
            # over experts so each row holds the slot index of the token inside
            # its selected expert (0 for dropped tokens, whose weight is 0)
            exp_rank = torch.sum(exp_mask * exp_rank, dim=-1)  # [k, B * T]

            # mask probabilities to only include selected experts
            router_probs = router_probs.view(num_tokens, self.n_exp)[None, :]  # [1, B * T, n_exp]
            exp_weights = exp_mask * router_probs  # [k, B * T, n_exp]

            # convert rank into one-hot vectors over the available capacity
            # stores the position of each token within the capacity of the selected expert
            exp_rank_sc = F.one_hot(exp_rank, num_classes=exp_capacity)  # [k, B * T, exp_capacity]

            # for each token: weight of each selected expert, placed at the
            # token's slot in that expert -> [B * T, n_exp, exp_capacity]
            cb_weight = torch.sum(exp_weights.unsqueeze(3) * exp_rank_sc.unsqueeze(2), dim=0)
            sec_mask = cb_weight.bool()  # binary mask of selected experts for each token
            return used_capacity, cb_weight, sec_mask

    def compute_aux_loss(self, expert_probs: torch.Tensor, indices: torch.Tensor):
        """Compute the Switch Transformer load-balancing loss.

        See equations (4)-(6) on page 7 of https://arxiv.org/abs/2101.03961.

        Args:
            expert_probs: ``[B, T, n_exp]`` router probabilities.
            indices: ``[B, T, k]`` indices of the selected experts.

        Returns:
            Scalar tensor ``n_exp * sum(prob_per_expert * tokens_per_expert)``.
        """
        # equation (5): fraction of tokens allocated to each expert; the
        # one-hot selections are summed over k before averaging over tokens.
        # No gradient flows through this (discrete) term.
        with torch.no_grad():
            one_hot_indices = F.one_hot(indices, num_classes=self.n_exp)  # [B, T, k, n_exp]
            one_hot_indices = torch.sum(one_hot_indices.float(), dim=2)  # [B, T, n_exp]
            tokens_per_expert = torch.mean(one_hot_indices.float(), dim=(0, 1))

        # equation (6): mean router probability allocated to each expert
        prob_per_expert = torch.mean(expert_probs.float(), dim=(0, 1))

        # equation (4): dot product of the two vectors, scaled by the number of experts
        return self.n_exp * torch.sum(prob_per_expert * tokens_per_expert)

    def compute_router_z_loss(self, logits: torch.Tensor):
        """Compute the ST-MoE router z loss.

        See equation (5) on page 7 of https://arxiv.org/abs/2202.08906.

        Args:
            logits: ``[B, T, n_exp]`` router logits (before softmax).

        Returns:
            Scalar tensor: mean over tokens of ``logsumexp(logits) ** 2``.
        """
        # logsumexp is the numerically stable form of log(sum(exp(logits)))
        z_loss = torch.logsumexp(logits, dim=-1) ** 2.0  # [B, T]

        # average over all tokens
        return torch.mean(z_loss)

    def get_capacity(self, tokens_per_batch):
        """Return the number of slots each expert gets for this batch.

        capacity = floor(top_k * capacity_factor * tokens / n_exp), rounded up
        to an even number and clamped from below by ``min_capacity``; see
        eq (3) in Switch Transformer (https://arxiv.org/abs/2101.03961).
        The capacity factor depends on ``self.training``.
        """
        capacity_factor = self.train_capacity if self.training else self.eval_capacity
        capacity = math.floor(self.top_k * capacity_factor * tokens_per_batch / self.n_exp)
        capacity += capacity % 2  # make sure capacity is an even number
        capacity = max(capacity, self.min_capacity)
        assert capacity > 0
        return int(capacity)

# FEEDFORWARD
class MLP(nn.Module):
    """SwiGLU feed-forward block: ``fc2(value * SiLU(gate))`` with dropout.

    Args:
        config: provides ``n_embd``, ``ffn_dim``, ``bias`` and ``dropout``.
        ffn_dim: hidden width; falls back to ``config.ffn_dim`` when ``None``.
    """

    def __init__(self, config, ffn_dim=None):
        super().__init__()

        # Identity comparison: `== None` would defer to the argument's __eq__.
        if ffn_dim is None:
            ffn_dim = config.ffn_dim

        # fc1 produces value and gate in a single matmul, hence 2 * ffn_dim.
        self.fc1 = nn.Linear(config.n_embd, 2 * ffn_dim, bias=config.bias)
        self.fc1.is_swiglu = True
        self.swish = nn.SiLU()
        self.fc2 = nn.Linear(ffn_dim, config.n_embd, bias=config.bias)
        # marker checked by GPT._init_weights to apply residual-scaled init
        self.fc2.output_proj_marker = True

        self.dropout1 = nn.Dropout(config.dropout)
        self.dropout2 = nn.Dropout(config.dropout)

    def forward(self, x):
        """Map ``[..., n_embd]`` to ``[..., n_embd]``."""
        x = self.fc1(x)

        # Inline SwiGLU: first half of the doubled width is the value,
        # second half is the gate.
        x, gate = x.chunk(2, dim=-1)
        x = x * self.swish(gate)

        x = self.dropout1(x)
        x = self.fc2(x)
        return self.dropout2(x)
    

class MLPExperts(nn.Module):
    """A bank of ``n_exp`` SwiGLU MLPs evaluated with batched matmuls.

    The weights are raw parameters stacked along a leading expert axis and
    are allocated uninitialised (``torch.empty``); they must be initialised
    by the caller before use (``GPT._init_weights`` does this).
    ``config.bias`` is stored on the module, but no bias parameters are created.
    """

    def __init__(self, config):
        super().__init__()
        self.n_exp = config.n_exp
        self.n_embd = config.n_embd
        self.bias = config.bias

        hidden = config.expert_dim
        # Up-projection yields value and gate together, hence 2 * hidden.
        self.c_fc = nn.Parameter(torch.empty(self.n_exp, self.n_embd, 2 * hidden))
        self.c_proj = nn.Parameter(torch.empty(self.n_exp, hidden, self.n_embd))

        self.swish = nn.SiLU()
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Map ``[n_exp, capacity, n_embd]`` to ``[n_exp, capacity, n_embd]``."""
        projected = torch.bmm(x, self.c_fc)

        # SwiGLU: value * SiLU(gate)
        value, gate = projected.chunk(2, dim=-1)
        activated = value * self.swish(gate)

        return self.dropout(torch.bmm(activated, self.c_proj))

class MOELayer(nn.Module):
    """Mixture-of-experts feed-forward layer with an always-on shared expert.

    Output = routed expert mixture + shared dense MLP, both computed from the
    same input.
    """

    def __init__(self, config):
        super().__init__()
        self.router = Router(config)  # (noisy) top-k router
        self.experts = MLPExperts(config)  # bank of routed expert MLPs
        self.shared_expert = MLP(config, ffn_dim=config.shared_dim)  # sees every token

    def forward(self, x: torch.Tensor):
        """Map ``[B, T, n_embd]`` to ``[B, T, n_embd]``."""
        batch, seq, width = x.size()
        n_tok = batch * seq

        # The dense path is evaluated first, before the router runs.
        dense_out = self.shared_expert(x)

        # combine_w / dispatch: [B * T, n_exp, exp_capacity]
        _used_capacity, combine_w, dispatch = self.router(x)

        flat_tokens = x.view(n_tok, width)

        # Gather tokens into per-expert buffers:
        # [n_exp, exp_capacity, B * T] @ [B * T, n_embd] -> [n_exp, exp_capacity, n_embd]
        dispatch_t = dispatch.permute(1, 2, 0).type_as(flat_tokens)
        expert_in = dispatch_t @ flat_tokens

        expert_out = self.experts(expert_in)  # [n_exp, exp_capacity, n_embd]

        # Scatter back, weighting each expert output by its router weight,
        # eq (2) on page 4 of ST-MoE (https://arxiv.org/abs/2202.08906):
        # [B * T, n_exp * exp_capacity] @ [n_exp * exp_capacity, n_embd]
        mixed = combine_w.view(n_tok, -1) @ expert_out.view(-1, width)

        return mixed.view(batch, seq, width) + dense_out

class Block(nn.Module):
    """Pre-norm transformer block: attention, then a dense or MoE feed-forward.

    Args:
        config: model configuration passed through to the sub-layers.
        use_moe: when true the feed-forward is an ``MOELayer``, otherwise an ``MLP``.
    """

    def __init__(self, config, use_moe=False):
        super().__init__()
        self.ln_1 = nn.RMSNorm(config.n_embd)
        self.attn = MultiHeadLatentAttention(config)
        self.ln_2 = nn.RMSNorm(config.n_embd)
        self.mlp = MOELayer(config) if use_moe else MLP(config)

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))

@dataclass
class GPTConfig:
    """Hyperparameters for the model, the router and the training run.

    The special-token ids at the end are annotated so that they are real
    dataclass fields: they can be overridden through the constructor and show
    up in ``dataclasses.fields`` / ``asdict``. They keep their previous
    defaults and remain readable as class attributes.
    """

    # Core transformer dimensions
    block_size: int = 2048   # maximum sequence length (also sizes the rotary tables)
    vocab_size: int = 50304
    n_layer: int = 24
    n_head: int = 10
    n_embd: int = 640
    dropout: float = 0.0
    ffn_dim: int = 640*4     # hidden width of the dense (non-MoE) MLP
    bias: bool = False       # whether linear layers carry a bias term

    # Multi-head latent attention
    d_c: int = 192           # width of the key/value latent
    d_c1: int = 192          # width of the query latent
    d_rotate: int = 64       # width of the rotary (positional) part per head
    theta: float = 10000.0   # rotary base frequency

    # Mixture of experts
    n_exp: int = 12          # number of routed experts
    top_k: int = 3           # experts selected per token
    expert_dim: int = 640    # hidden width of each routed expert
    shared_dim: int = 640    # hidden width of the shared expert
    stride: int = 2          # the first and last `stride` layers stay dense

    # Router losses, capacity and initialisation
    use_aux_loss: bool = True
    use_router_z_loss: bool = True
    use_noisy_top_k: bool = True
    aux_loss_weight: float = 0.01
    router_z_loss_weight: float = 0.001
    train_capacity: float = 1.25   # capacity factor while training
    eval_capacity: float = 2.0     # capacity factor while evaluating
    min_capacity: int = 4          # lower bound on slots per expert
    use_switch_tfm_init: bool = True
    switch_tfm_init_scale: float = 1.0
    router_use_full_prec: bool = True   # run the router with autocast disabled

    # Training hyperparameters
    batch_size: int = 8
    grad_acc: int = 128
    num_train_epochs: int = 1
    learning_rate: float = 3e-4
    weight_decay: float = 0.1
    betas: tuple = (0.9, 0.95)
    warm_up: int = 5000

    # Special token ids
    eos_token_id: int = 0
    bos_token_id: int = 0
    pad_token_id: int = 0

class HybridOptimizer(torch.optim.Optimizer):
    """Drive several optimizers as if they were one.

    ``torch.optim.Optimizer.__init__`` is deliberately not called: this object
    only exposes the concatenated ``param_groups`` of the wrapped optimizers
    and forwards ``step`` / ``zero_grad`` / state (de)serialisation to them.
    """

    def __init__(self, optimizers):
        self.optimizers = optimizers
        # Same group dicts as the wrapped optimizers hold, in order.
        groups = []
        for wrapped in self.optimizers:
            groups += wrapped.param_groups
        self.param_groups = groups

    def step(self, closure=None):
        """Step every wrapped optimizer; return ``closure()`` if one is given."""
        result = closure() if closure is not None else None
        for wrapped in self.optimizers:
            wrapped.step()
        return result

    def zero_grad(self, set_to_none=True):
        """Clear gradients on every wrapped optimizer."""
        for wrapped in self.optimizers:
            wrapped.zero_grad(set_to_none=set_to_none)

    def state_dict(self):
        """Return a list with one state dict per wrapped optimizer, in order."""
        states = []
        for wrapped in self.optimizers:
            states.append(wrapped.state_dict())
        return states

    def load_state_dict(self, state_dict):
        """Restore each wrapped optimizer from the matching list entry."""
        for wrapped, saved in zip(self.optimizers, state_dict):
            wrapped.load_state_dict(saved)

class GPT(nn.Module):
    """Decoder-only language model with latent attention and optional MoE layers.

    The token embedding and the output head share one weight tensor. When
    ``config.n_exp != 1`` the first and last ``config.stride`` layers use a
    dense MLP and every layer in between uses a mixture-of-experts layer.
    """

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.can_return_loss = True
        self.accepts_loss_kwargs = False

        # Blocks are built before the embedding and the head, in layer order.
        # A layer is MoE only if experts are enabled and it lies in the
        # interior band [stride, n_layer - stride - 1].
        moe_enabled = config.n_exp != 1
        last_moe_layer = config.n_layer - config.stride - 1
        blocks = nn.ModuleList([
            Block(config, use_moe=moe_enabled and config.stride <= i <= last_moe_layer)
            for i in range(config.n_layer)
        ])

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            h = blocks,
            ln_f = nn.RMSNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: embedding and output head use the same parameter.
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """Return the total number of parameters.

        ``non_embedding`` is accepted for interface compatibility but is not
        used. ``parameters()`` yields the tied embedding/head weight once.
        """
        return sum(p.numel() for p in self.parameters())

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialise one sub-module in place; used with ``self.apply``.

        ``nn.Linear`` weights get a truncated normal (cut at two std) whose std
        is derived from the fan-in and the marker attributes on the layer;
        ``MLPExperts`` tensors are treated the same way per expert;
        ``nn.Embedding`` gets a normal with std 0.02. Other modules are left
        untouched.
        """
        scale = getattr(self.config, 'switch_tfm_init_scale', 1.0)
        n_layer = self.config.n_layer

        if isinstance(module, nn.Linear):
            # fan-in is the input dimension of the layer
            w_fan_in = module.weight.shape[-1]
            base_std = (scale / w_fan_in) ** 0.5

            if hasattr(module, 'router_marker'):
                # routers: fixed small std
                final_std = 0.01
            elif hasattr(module, 'output_proj_marker'):
                # layers writing into the residual stream: scaled down with depth
                final_std = base_std / math.sqrt(2 * n_layer)
            elif hasattr(module, 'is_attention'):
                # remaining attention projections: slightly damped
                final_std = base_std * 0.7
            else:
                # everything else (e.g. MLP up-projections, lm_head)
                final_std = base_std

            torch.nn.init.trunc_normal_(
                module.weight, mean=0.0, std=final_std, a=-2*final_std, b=2*final_std
            )

            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, MLPExperts):
            # up-projection (c_fc): shape [n_exp, fan_in, 2 * expert_dim]
            c_fc_fan_in = module.c_fc.shape[-2]
            final_fc_std = (scale / c_fc_fan_in) ** 0.5
            torch.nn.init.trunc_normal_(module.c_fc, std=final_fc_std, a=-2*final_fc_std, b=2*final_fc_std)

            # down-projection (c_proj): residual scaling, as for output projections
            c_proj_fan_in = module.c_proj.shape[-2]
            final_proj_std = ((scale / c_proj_fan_in) ** 0.5) / math.sqrt(2 * n_layer)
            torch.nn.init.trunc_normal_(module.c_proj, std=final_proj_std, a=-2*final_proj_std, b=2*final_proj_std)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, labels=None, attention_mask=None, **kwargs):
        """Run the model.

        Args:
            input_ids: ``[batch, t]`` token ids with ``t <= config.block_size``.
            labels: optional ``[batch, t]`` targets. They are shifted by one
                position internally; entries equal to -100 are ignored.
            attention_mask: accepted for interface compatibility; not used.
            **kwargs: ignored.

        Returns:
            ``CausalLMOutput``. With labels: ``logits`` for every position and
            ``loss`` = label-smoothed cross entropy plus the weighted router
            losses. Without labels: ``logits`` for the last position only
            (``[batch, 1, vocab]``) and ``loss=None``.
        """
        _, t = input_ids.size()
        assert t <= self.config.block_size, f"Sequence length {t} exceeds block size {self.config.block_size}"

        x = self.transformer.wte(input_ids)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        moe_active = self.config.n_exp > 1

        if labels is not None:
            logits = self.lm_head(x)

            # position i predicts token i + 1
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()

            loss_fct = nn.CrossEntropyLoss(
                ignore_index=-100,
                label_smoothing=0.1,
                reduction='mean'
            )

            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1)
            )

            if moe_active:
                if self.config.use_aux_loss:
                    loss = loss + self.config.aux_loss_weight * MANAGER.aggregate_aux_loss()
                    MANAGER.reset_aux_loss()

                if self.config.use_router_z_loss:
                    loss = loss + self.config.router_z_loss_weight * MANAGER.aggregate_router_z_loss()
                    MANAGER.reset_router_z_loss()
        else:
            # inference: only the last position is needed
            logits = self.lm_head(x[:, [-1], :])
            loss = None

            # The routers hand their losses to MANAGER on every forward pass.
            # Nothing consumes them on this path, so drop them here; otherwise
            # they pile up across inference calls (e.g. every generate() step)
            # and would still be pending at the next training step.
            if moe_active:
                if self.config.use_aux_loss:
                    MANAGER.reset_aux_loss()
                if self.config.use_router_z_loss:
                    MANAGER.reset_router_z_loss()

        return CausalLMOutput(loss=loss, logits=logits)

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Create an AdamW optimizer with two parameter groups.

        Trainable tensors with two or more dimensions whose name does not end
        in ``bias`` are weight-decayed; all other trainable tensors are not.
        The fused AdamW implementation is used when this torch build offers it
        and ``device_type == 'cuda'``.
        """
        # TODO: add expert config
        trainable = {pn: p for pn, p in self.named_parameters() if p.requires_grad}

        # the explicit "bias" name check covers bias terms that are not 1-D
        decay_params = [p for n, p in trainable.items() if (p.dim() >= 2 and not n.endswith('bias'))]
        nodecay_params = [p for n, p in trainable.items() if (p.dim() < 2 or n.endswith('bias'))]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Args:
            idx: ``[batch, t]`` prompt token ids.
            max_new_tokens: number of tokens to sample.
            temperature: logits are divided by this value before sampling.
            top_k: if given, only the ``top_k`` most likely tokens are kept.

        Returns:
            ``[batch, t + max_new_tokens]`` token ids.
        """
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens if it grew too long
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]

            outputs = self(idx_cond)
            logits = outputs.logits[:, -1, :] / temperature

            if top_k is not None:
                # mask out everything below the k-th largest logit
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            probs = F.softmax(logits, dim=-1)

            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx