"""Self-contained modeling file for trust_remote_code use.

This file merges mup_models.py and hf_wrapper.py into a single module with no
imports from looped_scaling.*. It is intended to be placed alongside a
config.json that sets ``auto_map`` / ``model_type = "loop-lm"`` so that
HuggingFace's ``from_pretrained(..., trust_remote_code=True)`` can load it
without requiring the looped_scaling package to be installed.

Supported model variants: "base" (MuTransformer), "looped" (LoopedTransformer),
"moe" (MoETransformer), "looped-moe" (LoopedMoETransformer).
"""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, einsum, reduce, repeat
from typing import Optional
from torch import Tensor
from torch.nn.functional import scaled_dot_product_attention as sdpa  # for flash attention
from torch.nn.functional import grouped_mm, silu
from transformers import PretrainedConfig, PreTrainedModel
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast

BASE_D_MODEL = 128
BASE_D_FF = 384

""" Standard Transformer and Components implemented with muP """


# ---------------------------------------------------------------------------
# Numerically stable softmax (inlined from looped_scaling/utils.py)
# ---------------------------------------------------------------------------

def softmax(logits: Tensor, dim: int) -> Tensor:
    logits = logits.float()
    # max over the softmax dimension, kept for broadcasting
    max_values = torch.max(logits, dim=dim, keepdim=True).values

    # subtract the max so the largest element is 0 (numerical stability)
    shifted = logits - max_values

    # exponentiate the shifted terms
    shifted_exps = torch.exp(shifted)

    # sum the exponentials over the softmax dimension
    shifted_exp_sums = torch.sum(shifted_exps, dim=dim, keepdim=True)

    # normalize
    return shifted_exps / shifted_exp_sums


# y = Wx (no bias terms!)
class Linear(nn.Module):
    def __init__(self, in_features, out_features, width_ratio, std_base, device=None, dtype=None):
        super().__init__()

        # Register parameter first so shape is always stored (required for HF meta-device loading)
        self.weight = nn.Parameter(torch.empty(out_features, in_features, dtype=dtype, device=device))

        # for muP, derive initial std deviation from given base model's std_deviation and width ratio
        std_scaled = std_base / math.sqrt(width_ratio)
        nn.init.trunc_normal_(self.weight, mean=0.0, std=std_scaled, a=-3*std_scaled, b=3*std_scaled)

    def forward(self, x: Tensor) -> Tensor:
        # PyTorch convention: d_in is the last dim of x on the input side
        # ("... d_in"), and the output dim comes last on the output side ("... d_out")
        return einsum(self.weight, x, "d_out d_in, ... d_in -> ... d_out")
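
# A quick numeric sanity check of the muP init (a sketch; values are approximate
# because trunc_normal clips at +-3 std):
#   base = Linear(128, 128, width_ratio=1.0, std_base=0.1)    # weight std ~= 0.1
#   wide = Linear(1024, 1024, width_ratio=8.0, std_base=0.1)  # weight std ~= 0.1/sqrt(8) ~= 0.035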

class Embedding(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, device=None, dtype=None):
        super().__init__()

        # Register parameter first so shape is always stored (required for HF meta-device loading)
        self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim, dtype=dtype, device=device))

        # normalize the embeddings to spec
        nn.init.trunc_normal_(self.weight, mean=0.0, std=1.0, a=-3, b=3)

    def forward(self, token_ids: Tensor) -> Tensor:
        # for every id, we need to pull the row vector associated
        return self.weight[token_ids]

class RMSNorm(nn.Module):
    def __init__(self, d_model: int, eps: float = 1e-5, device=None, dtype=None):
        super().__init__()

        # for muP no gain parameter on the rms
        self.d_model = d_model
        self.eps = eps

    def forward(self, x: Tensor) -> Tensor:
        # upcast input to torch.float32
        in_dtype = x.dtype
        x = x.to(torch.float32)

        # calculate the RMS scalar
        # scalar for every ex. in batch, for every emb in sequence
        mean_squared_sum = (1/self.d_model)*einsum(x, x, "... seq d, ... seq d -> ... seq")
        rms = torch.sqrt(mean_squared_sum + self.eps)

        # for muP, no gain on rms norm as is normally applied.
        rms_norm = einsum(x, 1/rms, "... seq d, ... seq -> ... seq d")

        # return result to original dtype
        return rms_norm.to(in_dtype)

class PositionwiseFeedforward(nn.Module):
    # SwiGLU(x) = W2(SiLU(W1x)⊙W3x)
    def __init__(self, d_model: int, d_ff: int, width_ratio: float, device=None, dtype=None):
        super().__init__()

        # for muP, calculate the base model's standard deviation
        w_std_base = math.sqrt(2/(BASE_D_MODEL+BASE_D_FF))  # same for all W because d_model+d_ff = d_ff+d_model

        # initialize parameters of SWiGLU FFN
        self.w1 = Linear(d_model, d_ff, width_ratio, w_std_base, device=device, dtype=dtype)
        self.w2 = Linear(d_ff, d_model, width_ratio, w_std_base, device=device, dtype=dtype)
        self.w3 = Linear(d_model, d_ff, width_ratio, w_std_base, device=device, dtype=dtype)

    def forward(self, x: Tensor) -> Tensor:
        # FFN = W2*(SiLU(W1*X) dot W3X)
        silu_in = self.w1(x)
        silu_out = silu(silu_in)  # silu_in * torch.sigmoid(silu_in)
        gate = self.w3(x)
        gated_prod = silu_out * gate
        final_prod = self.w2(gated_prod)
        return final_prod

class RotaryPositionalEmbedding(nn.Module):
    def __init__(self, theta: float, d_k: int, max_seq_len: int, device=None, dtype=None):
        """
        theta: float Θ value for the RoPE
        d_k: int dimension of query and key vectors
        max_seq_len: int Maximum sequence length that will be inputted
        device: torch.device | None = None Device to store the buffer on
        """
        super().__init__()
        rotations = torch.empty(max_seq_len, d_k//2, 2, 2, device=device, dtype=dtype)

        # initialize one 2x2 rotation matrix per (position, feature-pair)
        for i in range(max_seq_len):
            for k in range(d_k//2):
                angle = i/(theta**(2*k/d_k))
                rot = torch.tensor([[math.cos(angle), -math.sin(angle)],
                                    [math.sin(angle), math.cos(angle)]])
                rotations[i, k, :] = rot

        self.register_buffer("rotations", rotations, persistent=True)


    def forward(self, x: Tensor, token_positions: Tensor) -> Tensor:
        """
        self.rotations shape: (seq_dim, feature_dim, 2, 2)
        x: tensor of shape (..., seq_dim, feature_dim)
        token_positions: tensor of shape (..., seq_dim)
        """
        # get the correct rotation matrices
        # by default, 0'th dim of array_indexed is index dim, last dim of indices is feature dim
        rot = self.rotations[token_positions].to(dtype=x.dtype)  # match activation dtype (buffer is float32, activations may be bfloat16)

        # rearrange by every two elements along feature dim of input x
        x_pairs = rearrange(x, "... seq_dim (feature_dim i) -> ... seq_dim feature_dim i", i=2)

        # apply rotations to these. for each pairwise position is A@x->y : (ixj)@(j,)->(i,)
        y_pairs = einsum(rot, x_pairs, "... seq_dim feature_dim i j, ... seq_dim feature_dim j -> ... seq_dim feature_dim i")

        # reshape y_pairs back to original shape
        y = rearrange(y_pairs, "... seq_dim feature_dim i -> ... seq_dim (feature_dim i)")

        return y
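
# Usage sketch (assumed shapes): rotate per-head queries of shape (batch, heads, seq, d_k)
#   rope = RotaryPositionalEmbedding(theta=10000.0, d_k=64, max_seq_len=1024)
#   pos = torch.arange(seq_len).unsqueeze(0)  # (1, seq) broadcasts over batch/heads
#   q_rot = rope(q_heads, pos)                # same shape as q_heads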

def scaled_dot_product_attention(
        Q: Tensor,
        K: Tensor,
        V: Tensor,
        mask: Optional[Tensor] = None,
        ) -> Tensor:
    """
    Given key (K), query (Q), and value (V) tensors, return
    the output of your scaled dot product attention implementation.

    Args:
        let m be seq length of inputs, n be seq length of outputs
        d_k is look-up dim, d_v is value dim
        Q (Float[Tensor, "batch ... n d_k"]): Query tensor
        K (Float[Tensor, "batch ... m d_k"]): Key tensor
        V (Float[Tensor, "batch ... m d_v"]): Values tensor
        mask (Float[Tensor, " ... n m"] | None): Mask tensor
    Returns:
        Float[Tensor, " ... n d_v"]: Output of SDPA
    """

    # get the key feature dim (should be last dim of Q and K)
    d_k = Q.shape[-1]
    assert d_k == K.shape[-1]

    # calculate the weighted scores (similarity product). for muP, scale by d_k not sqrt(d_k)
    scores = einsum(Q, K, "... n d_k, ... m d_k -> ... n m") / d_k

    # apply the mask if there is one
    if mask is not None:
        bool_mask = mask.bool()  # compatible if somehow, input is mask bool or if float
        attn_mask = torch.where(bool_mask, 0.0, float('-inf')).to(scores.dtype)
        scores = scores + attn_mask

    # compute the attention weights
    weights = softmax(scores, dim=-1)  # the softmax should be taken over the m inputs at an i'th output pos.

    # return weights@V
    return einsum(weights, V, "... n m, ... m d_v -> ... n d_v")

class MultiheadSelfAttention(nn.Module):
    """
    Args:
        d_model (int): Dimensionality of the feedforward input and output.
        num_heads (int): Number of heads to use in multi-headed attention.
        max_seq_len (int): Maximum sequence length to pre-cache if your implementation does that.
        q_proj_weight (Float[Tensor, "d_k d_in"]): Weights for the Q projection
        k_proj_weight (Float[Tensor, "d_k d_in"]): Weights for the K projection
        v_proj_weight (Float[Tensor, "d_k d_in"]): Weights for the V projection
        o_proj_weight (Float[Tensor, "d_model d_v"]): Weights for the output projection
        in_features (Float[Tensor, "... sequence_length d_in"]): Tensor to run your implementation on.

    Returns:
        Float[Tensor, " ... sequence_length d_out"]: Tensor with the output of running your optimized, batched multi-headed attention
        implementation with the given QKV projection weights and input features.
    """
    def __init__(self, d_model: int, num_heads: int, max_seq_len: int = None, theta: float = None, width_ratio: float = 1.0, device=None, dtype=None):
        super().__init__()

        # initialize the multi-head self attention weights as 1 large matrix (which will be sliced)
        assert d_model % num_heads == 0, f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"

        self.d_model = d_model
        self.num_heads = num_heads

        # for muP, calculate standard deviation of base model
        attn_std_base = math.sqrt(2/(BASE_D_MODEL+BASE_D_MODEL))

        # for muP, initialize the Wq,Wk,Wv,Wo linear weights with width_ratio and base model's stddev
        self.q_proj = Linear(d_model, d_model, width_ratio, attn_std_base, device=device, dtype=dtype)
        self.k_proj = Linear(d_model, d_model, width_ratio, attn_std_base, device=device, dtype=dtype)
        self.v_proj = Linear(d_model, d_model, width_ratio, attn_std_base, device=device, dtype=dtype)
        self.output_proj = Linear(d_model, d_model, width_ratio, attn_std_base, device=device, dtype=dtype)

        # # Removed for torch sdpa, uncomment if using normal code
        # if max_seq_len:
        #     causal_mask = torch.tril(torch.ones(max_seq_len, max_seq_len, dtype=dtype, device=device))
        #     self.register_buffer("causal_mask", causal_mask, persistent=False)
        # else:
        #     self.register_buffer("causal_mask", None, persistent=False)

        assert theta is None or max_seq_len is not None, "max_seq_len must be provided when theta is given for multi-head self attention with RoPE."

        if theta:
            d_k = d_model//num_heads
            self.rope = RotaryPositionalEmbedding(theta, d_k, max_seq_len, device, dtype)
        else:
            self.rope = None

    def forward(self, x: Tensor, token_positions: Optional[Tensor] = None) -> Tensor:
        # get Q, K, V matrices
        Q = self.q_proj(x)  # output shape is [batch seq d_model]
        K = self.k_proj(x)
        V = self.v_proj(x)

        # # create causal mask interpreting the second-to-last dim as the seq dim
        # if self.causal_mask is None:
        #     seq_dim = x.shape[-2]
        #     cmask = torch.tril(torch.ones(seq_dim, seq_dim, dtype=x.dtype, device=x.device))
        # else:
        #     # Slice the pre-computed mask to match actual sequence length (could be < than max_seq_len)
        #     seq_dim = x.shape[-2]
        #     cmask = self.causal_mask[:seq_dim, :seq_dim]

        # get slice size for multi-head self attention
        d_k = self.d_model // self.num_heads
        d_v = self.d_model // self.num_heads

        q_heads = rearrange(Q, "... seq (heads d_k) -> ... heads seq d_k", d_k=d_k)
        k_heads = rearrange(K, "... seq (heads d_k) -> ... heads seq d_k", d_k=d_k)

        # apply RoPE to q_heads and k_heads
        if self.rope:
            seq_dim = x.shape[-2]  # x is (b,s,d)
            if token_positions is None:
                token_positions = torch.arange(seq_dim, device=x.device)
                token_positions = rearrange(token_positions, "seq -> 1 seq")  # 1 seq allows broadcast across batch dim

            q_heads = self.rope(q_heads, token_positions)
            k_heads = self.rope(k_heads, token_positions)

        v_heads = rearrange(V, "... seq (heads d_v) -> ... heads seq d_v", d_v=d_v)

        #mha_heads = scaled_dot_product_attention(q_heads, k_heads, v_heads, cmask)
        mha_heads = sdpa(q_heads, k_heads, v_heads, is_causal=True, scale=1.0/d_k)  # muP: attention scaled by 1/d_k rather than 1/sqrt(d_k)
        mha = rearrange(mha_heads, "... heads seq d_v -> ... seq (heads d_v)")

        # apply o_proj_weight to the concatenated multi-head attention product
        out = self.output_proj(mha)

        return out

class PrenormBlock(nn.Module):
    def __init__(self,
                 d_model: int,
                 num_heads: int,
                 d_ff: int,
                 max_seq_len: int,
                 theta: float,
                 width_ratio: float,
                 device=None,
                 dtype=None):
        super().__init__()
        # norm layer
        self.ln1 = RMSNorm(d_model, device=device, dtype=dtype)
        # mhsa with rope
        self.attn = MultiheadSelfAttention(d_model, num_heads, max_seq_len, theta, width_ratio, device, dtype)
        # add step
        # norm layer
        self.ln2 = RMSNorm(d_model, device=device, dtype=dtype)
        # positionwise feed forward
        self.ffn = PositionwiseFeedforward(d_model, d_ff, width_ratio, device, dtype)
        # add to output

    def forward(self, x: Tensor, token_positions: Optional[Tensor] = None) -> Tensor:

        # first Tx operation, Norm + MHSA w/ RoPE
        norm1_out = self.ln1(x)
        # we may have to define token_positions if it is not given
        attn_out = self.attn(norm1_out, token_positions)

        # ensure no broadcasting, elementwise addition on [batch seq d_model]
        assert(x.shape == attn_out.shape)
        resid1_out = attn_out + x

        # second Tx operation, Norm + SwiGLU
        norm2_out = self.ln2(resid1_out)
        ffn_out = self.ffn(norm2_out)

        # ensure no broadcasting, elementwise addition
        assert(ffn_out.shape == resid1_out.shape)
        final_out = resid1_out + ffn_out
        return final_out

class MuTransformer(nn.Module):
    def __init__(
            self, vocab_size: int,
            context_length: int,
            d_model: int,
            num_layers: int,
            num_heads: int,
            d_ff: int,
            rope_theta: float,
            width_ratio: float = 1.0,
            weight_tying: bool = False,
            device=None, dtype=None):
        super().__init__()
        self.token_embeddings = Embedding(vocab_size, d_model, device=device, dtype=dtype)
        self.layers = nn.ModuleList([PrenormBlock(d_model, num_heads, d_ff, context_length, rope_theta, width_ratio, device, dtype) for _ in range(num_layers)])
        self.ln_final = RMSNorm(d_model, device=device, dtype=dtype)
        self.weight_tying = weight_tying
        if weight_tying:
            self.lm_head = self.token_embeddings.weight
        else:
            std_base_lm_head = math.sqrt(2/(BASE_D_MODEL+vocab_size))
            self.lm_head = Linear(d_model, vocab_size, width_ratio=width_ratio, std_base=std_base_lm_head, device=device, dtype=dtype)
        self.width_ratio = width_ratio

    def forward(self, x: Tensor) -> Tensor:
        # 1. token embed step, no muP alpha_in
        x = self.token_embeddings(x)

        # 2. prenorm blocks step
        for layer in self.layers:
            x = layer(x)

        # 3. Final norm
        x = self.ln_final(x)

        # 4. unembed layer, muP implemented as scaling on init variance and lr of lm_head, not output scaling
        if self.weight_tying:
            x = einsum(x, self.lm_head, "... s d, v d -> ... s v")/self.width_ratio
        else:
            x = self.lm_head(x)

        # 5. return output, no muP alpha_out
        return x
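
# Construction sketch (hypothetical hyperparameters): a width-ratio-1 base model
#   model = MuTransformer(vocab_size=50257, context_length=1024, d_model=128,
#                         num_layers=4, num_heads=2, d_ff=384, rope_theta=10000.0)
#   logits = model(torch.randint(0, 50257, (2, 16)))  # -> (2, 16, 50257)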

""" Looped Language Models implemented with MuP """

class LoopedStack(nn.Module):
    def __init__(
            self,
            context_length: int,
            d_model: int,
            num_layers_in_stack: int,
            num_heads: int,
            d_ff: int,
            rope_theta: float,
            width_ratio: float = 1.0,
            mixture_of_experts: bool = False,
            num_experts: Optional[int] = None,
            num_active: Optional[int] = None,
            device=None, dtype=None):
        super().__init__()
        if mixture_of_experts:
            # self.layers = nn.ModuleList([MoEPrenormBlock(d_model, num_heads, d_ff, num_experts, num_active,
            #                                              context_length, rope_theta, width_ratio, device, dtype)
            #                              for _ in range(num_layers_in_stack)])
            self.layers = nn.ModuleList([GroupedMoEPrenormBlock(d_model, num_heads, d_ff, num_experts, num_active,
                                                                context_length, rope_theta, width_ratio, device, dtype)
                                         for _ in range(num_layers_in_stack)])
        else:
            self.layers = nn.ModuleList([PrenormBlock(d_model, num_heads, d_ff, context_length, rope_theta,
                                                      width_ratio, device, dtype) for _ in range(num_layers_in_stack)])
        self.mixture_of_experts = mixture_of_experts

    def forward(self, x: Tensor):  # returns x, or (x, lb_total, lz_total) when mixture_of_experts
        # prenorm blocks step
        if self.mixture_of_experts:
            lb_total = 0
            lz_total = 0
            # sum up load balancing and z-losses across each layer
            for layer in self.layers:
                x, lb, lz = layer(x)
                lb_total += lb
                lz_total += lz
            return x, lb_total, lz_total
        else:
            for layer in self.layers:
                x = layer(x)
            return x

class LoopedTransformer(nn.Module):
    def __init__(
            self,
            vocab_size: int,
            context_length: int,
            d_model: int,
            num_layers_in_stack: int,
            num_stacks: int,
            num_heads: int,
            d_ff: int,
            rope_theta: float,
            width_ratio: float = 1.0,
            weight_tying: bool = False,
            device=None, dtype=None):
        super().__init__()
        self.num_stacks = num_stacks

        self.token_embeddings = Embedding(vocab_size, d_model, device=device, dtype=dtype)
        self.stack = LoopedStack(context_length, d_model, num_layers_in_stack, num_heads, d_ff, rope_theta, width_ratio, device=device, dtype=dtype)
        self.ln_final = RMSNorm(d_model, device=device, dtype=dtype)
        self.weight_tying = weight_tying
        self.width_ratio = width_ratio

        if weight_tying:
            self.lm_head = self.token_embeddings.weight
        else:
            std_base_lm_head = math.sqrt(2/(BASE_D_MODEL+vocab_size))
            self.lm_head = Linear(d_model, vocab_size, width_ratio, std_base_lm_head, device=device, dtype=dtype)

    def forward(self, x: Tensor) -> Tensor:
        # token embed step
        x = self.token_embeddings(x)

        # repeated calls to stack
        for i in range(self.num_stacks):
            x = self.stack(x)

        # final norm
        x = self.ln_final(x)

        # Vocab projection or lm_head
        if self.weight_tying:
            x = einsum(x, self.lm_head, "... s d, v d -> ... s v")/self.width_ratio
        else:
            x = self.lm_head(x)

        return x
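
# For example, num_layers_in_stack=8 with num_stacks=2 applies 16 layer passes
# per forward pass while storing parameters for only the 8 shared layers.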

""" Mixture-of-Experts Implementation in muP """

# Router Class
class Router(nn.Module):
    def __init__(self, d_model: int, num_experts: int, num_active=None, width_ratio: float = 1.0, device=None, dtype=None):
        super().__init__()
        # the router is a single linear gate; muP init derives its std from the base model dims
        std_base = math.sqrt(2/(BASE_D_MODEL+num_experts))
        self.gate = Linear(d_model, num_experts, width_ratio, std_base, device=device, dtype=dtype)  # adjusted for muP
        self.num_active = num_active

    def forward(self, x: Tensor):
        # returns scores, top_k_scores, top_k_indices
        logits = self.gate(x)  # should be shape (batch, seq, n_routers)

        # probs
        probs = softmax(logits, dim=-1)

        # get top_k
        top_scores, top_experts = torch.topk(probs, k=self.num_active, dim=-1)

        # renormalize the top scores so weighted sum of expert products can be taken
        score_sums = torch.sum(top_scores, dim=-1, keepdim=True)  # (batch, seq, 1)
        top_scores = top_scores/score_sums

        return logits, probs, top_scores, top_experts
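
# Routing sketch (hypothetical sizes): top-2 of 8 experts on a (2, 16, 1024) batch
#   router = Router(d_model=1024, num_experts=8, num_active=2, width_ratio=8.0)
#   logits, probs, top_scores, top_experts = router(x)
#   # probs: (2, 16, 8); top_scores / top_experts: (2, 16, 2); each row of
#   # top_scores sums to 1 after renormalization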

class MoEPrenormBlock(nn.Module):
    def __init__(self, d_model: int, num_heads: int, d_ff: int, num_experts: int, num_active: int,
                  max_seq_len: int, theta: float, width_ratio: float = 1.0, device=None, dtype=None):
        super().__init__()
        # norm layer before mHSA+RoPE
        self.ln1 = RMSNorm(d_model, device=device, dtype=dtype)

        # mhsa with rope
        self.attn = MultiheadSelfAttention(d_model, num_heads, max_seq_len, theta, width_ratio, device, dtype)

        # norm layer before position-wise feedforward
        self.ln2 = RMSNorm(d_model, device=device, dtype=dtype)

        # router
        self.router = Router(d_model, num_experts, num_active, width_ratio=width_ratio, device=device, dtype=dtype)

        # save MoE hyperparams
        self.num_experts = num_experts
        self.num_active = num_active

        # initialize MoE FFNs as a module list
        d_ff_expert = d_ff // num_active
        self.experts = nn.ModuleList([PositionwiseFeedforward(d_model, d_ff_expert, width_ratio, device, dtype) for _ in range(num_experts)])  # adjusted for muP

    def forward(self, x: Tensor, token_positions: Optional[Tensor] = None) -> tuple[Tensor, Tensor, Tensor]:
        # input dims
        batch, seq, dim = x.shape

        # first Tx operation, Norm + MHSA w/ RoPE
        norm1_out = self.ln1(x)
        # we may have to define token_positions if it is not given
        attn_out = self.attn(norm1_out, token_positions)

        # ensure no broadcasting, elementwise addition on [batch seq d_model]
        assert(x.shape == attn_out.shape)
        resid1_out = attn_out + x

        # prenorm before position-wise feedforward
        norm2_out = self.ln2(resid1_out)

        # get scores from Router. returns shape (batch,seq,k)
        logits, probs, top_scores, top_experts = self.router(norm2_out)  # logits and probs are (batch, seq, n_routers)
        expert_mean_probs = torch.mean(probs, dim=(0, 1))  # take mean across batch and seq dims

        # apply mixture of experts
        experts_out = torch.zeros_like(norm2_out)  # copies shape, device and dtype
        total_tokens_assigned = batch*seq*self.num_active
        lb_sum = 0

        for expert_idx in range(self.num_experts):
            # get masks for expert selection
            expert_mask = (top_experts == expert_idx)
            embed_mask = expert_mask.any(dim=-1)  # if any of the k is expert, we want to transform embed
            if not embed_mask.any(): continue
            pi = expert_mean_probs[expert_idx].item()
            fi = (expert_mask.sum().item())/total_tokens_assigned  # fraction of routed tokens assigned to this expert
            lb_sum += fi*pi

            # extract embeds and weights for activated experts
            weights = top_scores[expert_mask]  # (num_embeds)
            expert_embeds = norm2_out[embed_mask]  # (num_embeds, hidden_dim)

            # forward for the correct experts
            expert_out = self.experts[expert_idx](expert_embeds)  # Vanilla Implementation

            # map back to experts output
            experts_out[embed_mask] += weights.unsqueeze(-1)*expert_out  # broadcast elementwise multiply by hidden dim

        # calculate batch's load balancing loss
        lb = self.num_experts*lb_sum

        # calculate batch's router z loss
        logsumexp = torch.logsumexp(logits.float(), dim=-1)
        lz = torch.mean(logsumexp ** 2)

        # ensure no broadcasting, elementwise addition
        assert(experts_out.shape == resid1_out.shape)
        final_out = resid1_out + experts_out
        return final_out, lb, lz
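
# Note: lb = num_experts * sum_e f_e * p_e is minimized at 1.0 under perfectly
# uniform routing (f_e = p_e = 1/E): with E=4, lb = 4 * (4 * 1/16) = 1.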


class GroupedMoEPrenormBlock(nn.Module):
    @staticmethod
    def _init_expert_weights(num_experts, in_features, out_features, width_ratio, std_base, device, dtype) -> nn.Parameter:
        w = torch.empty(num_experts, in_features, out_features, device=device, dtype=dtype)  # (batch, in, out)
        std_scaled = std_base / math.sqrt(width_ratio)
        nn.init.trunc_normal_(w, mean=0.0, std=std_scaled, a=-3*std_scaled, b=3*std_scaled)
        return nn.Parameter(w)

    def __init__(self, d_model: int, num_heads: int, d_ff: int, num_experts: int, num_active: int,
                  max_seq_len: int, theta: float, width_ratio: float = 1.0, device=None, dtype=None):
        super().__init__()
        # norm layer before mHSA+RoPE
        self.ln1 = RMSNorm(d_model, device=device, dtype=dtype)

        # mhsa with rope
        self.attn = MultiheadSelfAttention(d_model, num_heads, max_seq_len, theta, width_ratio, device, dtype)

        # norm layer before position-wise feedforward
        self.ln2 = RMSNorm(d_model, device=device, dtype=dtype)

        # router
        self.router = Router(d_model, num_experts, num_active, width_ratio=width_ratio, device=device, dtype=dtype)

        # save MoE hyperparams
        self.num_experts = num_experts
        self.num_active = num_active

        # initialize MoE FFNs as a module list
        d_ff_expert = d_ff // num_active

        # expose and stack the MoE SwiGLU weights for all experts; registering them directly as parameters lets the muP optimizer scale their learning rate by width_ratio
        w_std_base = math.sqrt(2 / (BASE_D_MODEL + BASE_D_FF))
        self.experts_w1 = self._init_expert_weights(num_experts, d_model, d_ff_expert, width_ratio, w_std_base, device, dtype)
        self.experts_w2 = self._init_expert_weights(num_experts, d_ff_expert, d_model, width_ratio, w_std_base, device, dtype)
        self.experts_w3 = self._init_expert_weights(num_experts, d_model, d_ff_expert, width_ratio, w_std_base, device, dtype)

    def forward(self, x: Tensor, token_positions: Optional[Tensor] = None) -> tuple[Tensor, Tensor, Tensor]:
        batch, seq, dim = x.shape
        total_tokens = batch * seq

        # first Tx operation, Norm + MHSA w/ RoPE
        norm1_out = self.ln1(x)
        attn_out = self.attn(norm1_out, token_positions)

        assert(x.shape == attn_out.shape)
        resid1_out = attn_out + x

        # prenorm before position-wise feedforward
        norm2_out = self.ln2(resid1_out)

        # get scores from Router. returns shape (batch, seq, k)
        logits, probs, top_scores, top_experts = self.router(norm2_out)

        # flatten to 2D for grouped_mm
        x_flat = rearrange(norm2_out, 'b s d -> (b s) d')                         # (total_tokens, d_model)
        flat_expert_ids = rearrange(top_experts, 'b s k -> (b s k)')               # (total_tokens * k,)
        flat_scores = rearrange(top_scores, 'b s k -> (b s k)')                    # (total_tokens * k,)
        flat_positions = torch.arange(total_tokens, device=x.device)               # (total_tokens)
        flat_token_ids = repeat(flat_positions, 'n -> (n k)', k=self.num_active)   # (total_tokens * k)

        # sort by expert
        sort_indices = flat_expert_ids.argsort(stable=True)
        sorted_expert_ids = flat_expert_ids[sort_indices]
        sorted_token_ids = flat_token_ids[sort_indices]
        sorted_scores = flat_scores[sort_indices]
        sorted_x = x_flat[sorted_token_ids]                                        # (total_tokens * k, d_model)

        # build offs (cumulative token counts per expert)
        counts = torch.bincount(sorted_expert_ids, minlength=self.num_experts)
        offs = counts.cumsum(0).to(torch.int32)                                    # (num_experts,)
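        # e.g. counts = [3, 0, 5] -> offs = [3, 3, 8]: expert 0 owns sorted rows
        # 0:3, expert 1 owns none, expert 2 owns rows 3:8; these cumulative end
        # offsets are the group layout grouped_mm expects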

        # grouped SwiGLU: W2(SiLU(W1 x) dot W3 x)
        h1 = grouped_mm(sorted_x, self.experts_w1, offs=offs)
        h3 = grouped_mm(sorted_x, self.experts_w3, offs=offs)
        gated = silu(h1) * h3
        expert_out = grouped_mm(gated, self.experts_w2, offs=offs)                # (total_tokens * k, d_model)

        # weight by router scores and scatter-add back
        expert_out = einsum(expert_out, sorted_scores, 'n d, n -> n d')
        output_flat = torch.zeros(total_tokens, dim, device=x.device, dtype=expert_out.dtype)
        output_flat.index_add_(0, sorted_token_ids, expert_out)

        # reshape back to (batch, seq, d_model)
        experts_out = rearrange(output_flat, '(b s) d -> b s d', b=batch, s=seq)

        # aux losses
        fi = counts.float() / (total_tokens * self.num_active)
        pi = reduce(probs, 'b s e -> e', 'mean')
        lb = self.num_experts * einsum(fi, pi, 'e, e ->')

        logsumexp = torch.logsumexp(logits.float(), dim=-1)
        lz = reduce(logsumexp ** 2, '... -> ', 'mean')

        # residual connection
        assert(experts_out.shape == resid1_out.shape)
        final_out = resid1_out + experts_out
        return final_out, lb, lz


# MoE Implementation
class MoETransformer(nn.Module):
    def __init__(
            self, vocab_size: int,
            context_length: int,
            d_model: int,
            num_layers: int,
            num_heads: int,
            d_ff: int,
            num_experts: int,
            num_active: int,
            rope_theta: float,
            width_ratio: float = 1.0,
            device=None, dtype=None):
        super().__init__()
        self.token_embeddings = Embedding(vocab_size, d_model, device=device, dtype=dtype)
        self.num_layers = num_layers
        # self.layers = nn.ModuleList([MoEPrenormBlock(d_model, num_heads, d_ff, num_experts, num_active,
        #                                              context_length, rope_theta, width_ratio, device, dtype) for _ in range(num_layers)])
        self.layers = nn.ModuleList([GroupedMoEPrenormBlock(d_model, num_heads, d_ff, num_experts, num_active,
                                                            context_length, rope_theta, width_ratio, device, dtype) for _ in range(num_layers)])
        self.ln_final = RMSNorm(d_model, device=device, dtype=dtype)

        # only non-tied embeddings now
        std_base_lm_head = math.sqrt(2/(BASE_D_MODEL+vocab_size))
        self.lm_head = Linear(d_model, vocab_size, width_ratio=width_ratio, std_base=std_base_lm_head, device=device, dtype=dtype)

    def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        # collect aux losses
        lb_total = 0
        lz_total = 0

        # 1. token embed step
        x = self.token_embeddings(x)

        # 2. prenorm blocks step
        for layer in self.layers:
            x, lb, lz = layer(x)
            lb_total += lb
            lz_total += lz

        # 3. Final norm
        x = self.ln_final(x)

        # 4. Vocab projection or lm_head
        x = self.lm_head(x)

        # calculate average layer aux loss
        lb_avg = lb_total / self.num_layers
        lz_avg = lz_total / self.num_layers

        return x, lb_avg, lz_avg

class LoopedMoETransformer(nn.Module):
    def __init__(
            self, vocab_size: int,
            context_length: int,
            d_model: int,
            num_layers_in_stack: int,
            num_stacks: int,
            num_heads: int,
            d_ff: int,
            num_experts: int,
            num_active: int,
            rope_theta: float,
            width_ratio: float,
            device=None, dtype=None):
        super().__init__()
        self.stack_depth = num_stacks
        self.total_layers = num_stacks*num_layers_in_stack
        self.token_embeddings = Embedding(vocab_size, d_model, device=device, dtype=dtype)
        self.stack = LoopedStack(context_length, d_model, num_layers_in_stack, num_heads,
                                 d_ff, rope_theta, width_ratio, mixture_of_experts=True,
                                 num_experts=num_experts, num_active=num_active,
                                 device=device, dtype=dtype)  # parameters for loop with MoE
        self.ln_final = RMSNorm(d_model, device=device, dtype=dtype)

        # scale lm head
        std_base_lm_head = math.sqrt(2/(BASE_D_MODEL+vocab_size))
        self.lm_head = Linear(d_model, vocab_size, width_ratio=width_ratio, std_base=std_base_lm_head, device=device, dtype=dtype)


    def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        # collect aux losses
        lb_total = 0
        lz_total = 0

        # token embed step
        x = self.token_embeddings(x)

        # repeated calls to stack
        for i in range(self.stack_depth):
            x, lb, lz = self.stack(x)
            lb_total += lb
            lz_total += lz

        # final norm
        x = self.ln_final(x)

        # Vocab projection or lm_head
        x = self.lm_head(x)

        # calculate aux loss averages
        lb_avg = lb_total / self.total_layers
        lz_avg = lz_total / self.total_layers

        return x, lb_avg, lz_avg


# ---------------------------------------------------------------------------
# HuggingFace wrapper (from hf_wrapper.py)
# ---------------------------------------------------------------------------

class LoopLMConfig(PretrainedConfig):
    """Config for all four loop-lm model variants."""

    model_type = "loop-lm"

    def __init__(
        self,
        # which of the four architectures to use
        model_variant: str = "base",        # "base" | "looped" | "moe" | "looped-moe"
        # shared
        vocab_size: int = 50257,
        context_length: int = 1024,
        d_model: int = 1024,
        num_heads: int = 16,
        d_ff: int = 2752,
        rope_theta: float = 10000.0,
        width_ratio: float = 8.0,           # d_model / base_d_model (128); set at training time
        # base + moe only
        num_layers: int = 16,
        # base + looped only
        weight_tying: bool = False,
        # looped + looped-moe only
        num_layers_in_stack: int = 8,
        num_stacks: int = 2,
        # moe + looped-moe only
        num_experts: int = 8,
        num_active: int = 2,
        # aux loss weights — used when forward() is called with labels
        lb_loss_factor: float = 0.01,
        lz_loss_factor: float = 0.001,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.model_variant = model_variant
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.rope_theta = rope_theta
        self.width_ratio = width_ratio
        self.num_layers = num_layers
        self.weight_tying = weight_tying
        self.num_layers_in_stack = num_layers_in_stack
        self.num_stacks = num_stacks
        self.num_experts = num_experts
        self.num_active = num_active
        self.lb_loss_factor = lb_loss_factor
        self.lz_loss_factor = lz_loss_factor
        # lm-evaluation-harness looks for this attribute to cap sequence length
        self.max_length = context_length


class LoopLMForCausalLM(PreTrainedModel, GenerationMixin):
    """Causal LM wrapper over all four looped-scaling variants.

    Implements the HuggingFace PreTrainedModel interface so you can:
      - Upload/download via push_to_hub / from_pretrained
      - Run lm-evaluation-harness evals
      - Fine-tune with TRL's SFTTrainer / DPOTrainer
    """

    config_class = LoopLMConfig
    # tell HF which parameter holds the output logits for generation
    _keys_to_ignore_on_load_missing = []

    def __init__(self, config: LoopLMConfig):
        super().__init__(config)
        self.model = self._build_inner_model(config)
        self.post_init()

    # ------------------------------------------------------------------
    # Model construction
    # ------------------------------------------------------------------

    def _build_inner_model(self, config: LoopLMConfig):
        kw = dict(
            vocab_size=config.vocab_size,
            context_length=config.context_length,
            d_model=config.d_model,
            num_heads=config.num_heads,
            d_ff=config.d_ff,
            rope_theta=config.rope_theta,
            width_ratio=config.width_ratio,
            # device=None so weights are placed on CPU; caller uses .to(device)
        )
        v = config.model_variant
        if v == "base":
            return MuTransformer(
                **kw,
                num_layers=config.num_layers,
                weight_tying=config.weight_tying,
            )
        elif v == "looped":
            return LoopedTransformer(
                **kw,
                num_layers_in_stack=config.num_layers_in_stack,
                num_stacks=config.num_stacks,
                weight_tying=config.weight_tying,
            )
        elif v == "moe":
            return MoETransformer(
                **kw,
                num_layers=config.num_layers,
                num_experts=config.num_experts,
                num_active=config.num_active,
            )
        elif v == "looped-moe":
            return LoopedMoETransformer(
                **kw,
                num_layers_in_stack=config.num_layers_in_stack,
                num_stacks=config.num_stacks,
                num_experts=config.num_experts,
                num_active=config.num_active,
            )
        else:
            raise ValueError(f"Unknown model_variant: {v!r}. Choose from: base, looped, moe, looped-moe")

    # ------------------------------------------------------------------
    # Embedding access (required by some HF utilities)
    # ------------------------------------------------------------------

    def get_input_embeddings(self):
        return self.model.token_embeddings

    def set_input_embeddings(self, value):
        self.model.token_embeddings = value

    # ------------------------------------------------------------------
    # Forward
    # ------------------------------------------------------------------

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,   # causal mask is handled internally
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """
        Args:
            input_ids: (batch, seq)
            attention_mask: ignored — models use a built-in causal mask
            labels: (batch, seq) token ids; if provided, returns cross-entropy loss.
                    For MoE variants, aux losses (lb + lz) are added to the CE loss.
        """
        is_moe = self.config.model_variant in ("moe", "looped-moe")

        if is_moe:
            logits, lb, lz = self.model(input_ids)
        else:
            logits = self.model(input_ids)
            lb = lz = 0.0

        loss = None
        if labels is not None:
            # shift so that tokens < n predict token n (HF causal-LM convention);
            # F.cross_entropy ignores the default -100 padding label
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            ce_loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )
            aux = self.config.lb_loss_factor * lb + self.config.lz_loss_factor * lz
            loss = (ce_loss + aux) if self.training else ce_loss

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
        )
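
    # Training-step sketch (assumes a pre-tokenized LongTensor `batch`):
    #   out = model(input_ids=batch, labels=batch)
    #   out.loss.backward()  # CE plus weighted lb/lz aux terms for MoE variants in training mode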

    # ------------------------------------------------------------------
    # Generation support (no KV cache — generation is correct but slow)
    # ------------------------------------------------------------------

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {"input_ids": input_ids}
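
# Generation sketch: generate() is inherited from GenerationMixin; with no KV
# cache each step re-runs the full forward, so decoding is correct but slow.
#   ids = model.generate(input_ids, max_new_tokens=32, do_sample=False)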