firdavsus committed on Apr 16

Commit

943bd92

verified ·

1 Parent(s): a14a7af

Upload folder using huggingface_hub

Browse files

Files changed (16) hide show

.gitattributes +3 -0
.ipynb_checkpoints/load-checkpoint.py +7 -0
LLM_2.py +709 -0
full_diagnostics.png +3 -0
load.py +7 -0
optimizer.pt +3 -0
pytorch_model.bin +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer_config.json +215 -0
trainer_state.json +0 -0
training_args.bin +3 -0
training_curves_with_eval.png +3 -0
weight_histograms.png +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+full_diagnostics.png filter=lfs diff=lfs merge=lfs -text
+training_curves_with_eval.png filter=lfs diff=lfs merge=lfs -text
+weight_histograms.png filter=lfs diff=lfs merge=lfs -text

.ipynb_checkpoints/load-checkpoint.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from huggingface_hub import login, upload_folder
+login()
+upload_folder(folder_path=".", repo_id="firdavsus/LLM_D4", repo_type="model")

LLM_2.py ADDED Viewed

	@@ -0,0 +1,709 @@

+import math
+import inspect
+from dataclasses import dataclass
+from contextlib import nullcontext
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Tuple
+import inspect
+from transformers.modeling_outputs import CausalLMOutput
+from manager import MANAGER
+torch.manual_seed(101)
+def precompute_freqs_cis(config):
+    # We now return cos and sin directly instead of a complex polar tensor
+    freqs = 1.0 / (config.theta ** (torch.arange(0, config.d_rotate, 2)[: (config.d_rotate // 2)].float() / config.d_rotate))
+    t = torch.arange(config.block_size, device=freqs.device)
+    freqs = torch.outer(t, freqs).float() # [seq_len, d_rotate/2]
+    # Cos and Sin are what Inductor can easily optimize
+    cos = torch.cos(freqs)
+    sin = torch.sin(freqs)
+    # Repeat along the last dimension to match the d_rotate size
+    # [seq_len, d_rotate/2] -> [seq_len, d_rotate]
+    cos = torch.repeat_interleave(cos, 2, dim=-1)
+    sin = torch.repeat_interleave(sin, 2, dim=-1)
+    return cos, sin
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    # x: [..., d_rotate]
+    # Split into [x1, x2, x3, x4...] -> x1, x2 are pairs
+    # We use the interleaving pattern: [-x2, x1, -x4, x3...]
+    x1 = x[..., 0::2]
+    x2 = x[..., 1::2]
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)
+def apply_rotary_emb(xq, xk, freqs_cos, freqs_sin):
+    # Reshape freqs for broadcasting: [seq_len, d_rotate] -> [1, seq_len, 1, d_rotate]
+    # This matches (batch, seq, head, dim)
+    cos = freqs_cos[:xq.shape[1]].view(1, xq.shape[1], 1, xq.shape[-1])
+    sin = freqs_sin[:xq.shape[1]].view(1, xq.shape[1], 1, xq.shape[-1])
+    # The RoPE formula: x_out = x * cos + rotate_half(x) * sin
+    xq_out = (xq * cos) + (rotate_half(xq) * sin)
+    xk_out = (xk * cos) + (rotate_half(xk) * sin)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+class MultiHeadLatentAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.d_model = config.n_embd
+        self.num_head = config.n_head
+        self.d_head = self.d_model // self.num_head
+        self.d_c = config.d_c
+        self.d_c1 = config.d_c1
+        self.d_rotate = config.d_rotate
+        # ==========================================
+        # FUSION 1: All Projections from 'x'
+        # Replaces DQ_proj, DKV_proj, and RK_proj
+        # ==========================================
+        self.W_down = nn.Linear(
+            self.d_model,
+            self.d_c1 + self.d_c + self.d_rotate,
+            bias=config.bias
+        )
+        self.W_down.is_attention = True
+        # ==========================================
+        # FUSION 2: All Q Up-Projections from 'C_Q'
+        # Replaces UQ_proj and RQ_proj
+        # ==========================================
+        self.W_up_q = nn.Linear(
+            self.d_c1,
+            self.d_model + (self.num_head * self.d_rotate),
+            bias=config.bias
+        )
+        self.W_up_q.is_attention = True
+        # ==========================================
+        # FUSION 3: All KV Up-Projections from 'C_KV'
+        # Replaces UK_proj and UV_proj (STILL STRICTLY SEPARATE WEIGHTS)
+        # ==========================================
+        self.W_up_kv = nn.Linear(
+            self.d_c,
+            self.d_model + self.d_model, # d_model for K, d_model for V
+            bias=config.bias
+        )
+        self.W_up_kv.is_attention = True
+        self.q_norm = nn.RMSNorm(self.d_c1)
+        self.kv_norm = nn.RMSNorm(self.d_c)
+        # Output projection and Regularization
+        self.output_proj = nn.Linear(self.d_model, self.d_model, bias=config.bias)
+        self.output_proj.output_proj_marker = True
+        self.output_proj.is_attention = True
+        self.dropout = nn.Dropout(config.dropout)
+        self.attn_dropout_p = config.dropout
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+        cos, sin = precompute_freqs_cis(config)
+        self.register_buffer("freqs_cos", cos, persistent=False)
+        self.register_buffer("freqs_sin", sin, persistent=False)
+    def forward(self, x):
+        batch_size, seq_len, _ = x.size()
+        # ---------------------------------------------------------
+        # 1. KERNEL 1: Down-project everything at once
+        # ---------------------------------------------------------
+        down_out = self.W_down(x)
+        # Split into the 3 exact latents your math requires
+        C_Q, C_KV, K_rotate = down_out.split(
+            [self.d_c1, self.d_c, self.d_rotate], dim=-1
+        )
+        C_Q = self.q_norm(C_Q)
+        C_KV = self.kv_norm(C_KV)
+        # ---------------------------------------------------------
+        # 2. KERNEL 2: Up-project Query content and RoPE
+        # ---------------------------------------------------------
+        q_up_out = self.W_up_q(C_Q)
+        Q_state, Q_rotate = q_up_out.split(
+            [self.d_model, self.num_head * self.d_rotate], dim=-1
+        )
+        Q_state = Q_state.view(batch_size, seq_len, self.num_head, self.d_head)
+        Q_rotate = Q_rotate.view(batch_size, seq_len, self.num_head, self.d_rotate)
+        # ---------------------------------------------------------
+        # 3. KERNEL 3: Up-project Key and Value content independently
+        # ---------------------------------------------------------
+        kv_up_out = self.W_up_kv(C_KV)
+        K_state, V_state = kv_up_out.split(
+            [self.d_model, self.d_model], dim=-1
+        )
+        K_state = K_state.view(batch_size, seq_len, self.num_head, self.d_head)
+        V_state = V_state.view(batch_size, seq_len, self.num_head, self.d_head)
+        # Prepare shared RoPE Key
+        K_rotate = K_rotate.view(batch_size, seq_len, 1, self.d_rotate).expand(-1, -1, self.num_head, -1)
+        # ---------------------------------------------------------
+        # 4. Apply RoPE, Concatenate, and Attention
+        # ---------------------------------------------------------
+        Q_rotate, K_rotate = apply_rotary_emb(
+            Q_rotate,
+            K_rotate,
+            self.freqs_cos,
+            self.freqs_sin
+        )
+        Q = torch.cat([Q_state, Q_rotate], dim=-1).transpose(1, 2)
+        K = torch.cat([K_state, K_rotate], dim=-1).transpose(1, 2)
+        V = V_state.transpose(1, 2)
+        if self.flash:
+            att_output = F.scaled_dot_product_attention(
+                Q, K, V,
+                dropout_p=self.attn_dropout_p if self.training else 0.0,
+                is_causal=True
+            )
+        else:
+            scaler = 1.0 / math.sqrt(self.d_head + self.d_rotate)
+            att_matrix = (Q @ K.transpose(-2, -1)) * scaler
+            mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device)).view(1, 1, seq_len, seq_len)
+            att_matrix = att_matrix.masked_fill(mask == 0, float('-inf'))
+            att_score = self.dropout(F.softmax(att_matrix, dim=-1))
+            att_output = att_score @ V
+        att_output = att_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
+        return self.output_proj(att_output)
+class Router(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # router settings
+        self.top_k = config.top_k
+        self.n_exp = config.n_exp
+        assert self.top_k >= 1 and self.top_k <= config.n_exp
+        self.use_noisy_top_k = config.use_noisy_top_k
+        self.train_capacity = config.train_capacity
+        self.eval_capacity = config.eval_capacity
+        self.min_capacity = config.min_capacity
+        self.router_use_full_prec = config.router_use_full_prec
+        # auxiliary / load balancing loss settings
+        self.use_aux_loss = config.use_aux_loss
+        self.use_router_z_loss = config.use_router_z_loss
+        # linear projection for (noisy) softmax gating
+        # no bias is used, see page 4 eq (4) in (https://arxiv.org/abs/1701.06538)
+        self.w_g = nn.Linear(config.n_embd, config.n_exp, bias=False)
+        self.w_g.router_marker = True
+        self.w_noise = nn.Linear(config.n_embd, config.n_exp, bias=False) if self.use_noisy_top_k else None
+    def forward(self, x):
+        # optionally run the router in full precision to avoid instability during training
+        # see discussion on pg. 9 here: https://arxiv.org/abs/2101.03961
+        # setting enabled to False in autocast automatically puts everything in float32
+        device_type = 'cuda' if torch.cuda.is_available() else 'cpu' # for later use in torch.autocast
+        ctx = nullcontext() if not self.router_use_full_prec else torch.amp.autocast(device_type=device_type, enabled=False)
+        with ctx:
+            B, T, _ = x.size()
+            num_tokens = B * T
+            # eq (4) in (https://arxiv.org/abs/1701.06538)
+            logits = self.w_g(x)  # [B, T, n_exp]
+            if self.use_noisy_top_k:
+                # optionally add noise into the router
+                noise = F.softplus(self.w_noise(x))
+                noise *= torch.randn_like(noise)
+                logits += noise
+            # router z loss, computed on logits (before softmax)
+            # this loss prevents router logits from becoming too large
+            if self.use_router_z_loss:
+                z_loss = self.compute_router_z_loss(logits)
+                MANAGER.add_router_z_loss(z_loss)
+            # find top k experts for each token
+            top_k_logits, top_k_indices = logits.topk(self.top_k, dim=-1) # [B, T, k]
+            # normalize expert probabilities
+            # Question: should we normalize over all experts or just top-k?
+            # we choose to normalize over top-k, other option is commented out below
+            # Shazeer et al (https://arxiv.org/abs/1701.06538) does only topk
+            # see page 4 eq (3)-(5), the code for this is commented out below
+            router_probs = torch.full_like(logits, float('-inf'))  # [B, T, n_exp]
+            router_probs.scatter_(-1, top_k_indices, top_k_logits)
+            router_probs = F.softmax(router_probs, dim=-1)
+            # # normalize all router logits (not just top-k) via softmax
+            router_probs = F.softmax(logits, dim=-1)
+            # compute auxiliary load balancing loss
+            # this loss encourages equal probability assigned to each expert
+            # and equal load balancing of tokens assigned to each expert
+            if self.use_aux_loss:
+                aux_loss = self.compute_aux_loss(router_probs, top_k_indices)
+                MANAGER.add_aux_loss(aux_loss)
+            # compute expert capacity
+            exp_capacity = self.get_capacity(num_tokens)
+            # make a multi-hot mask of chosen experts, size [B, T, n_exp]
+            # entries are 0 if expert not chosen and 1 if expert chosen
+            exp_mask = F.one_hot(top_k_indices, num_classes=self.n_exp)  # [B, T, k, n_exp]
+            exp_mask = exp_mask.view(num_tokens, self.top_k, self.n_exp)  # [B * T, k, n_exp]
+            exp_mask = exp_mask.permute(1, 0, 2) # [k, B * T, n_exp]
+            # compute cumulative sum of each token over experts, this stores
+            # the index of each token within the batch of each expert
+            # NOTE: cumsum should count all top-1 first, top-2 second, etc.
+            # so that we prioritize top experts when dropping tokens (this is
+            # done by putting k dimension first for the reshape operation)
+            exp_rank = exp_mask.reshape(self.top_k * num_tokens, self.n_exp)  # [k * B * T, n_exp]
+            exp_rank = torch.cumsum(exp_rank, dim=0) - 1  # cumulative sum of expert selections [k * B * T, n_exp]
+            exp_rank = exp_rank.reshape(self.top_k, num_tokens, self.n_exp)  # [k, B * T, n_exp]
+            # mask out (set to zero) entries that go beyond expert capacity
+            # compute amount of used capacity by taking a sum over mask
+            exp_mask *= torch.lt(exp_rank, exp_capacity) # [k, B * T, n_exp]
+            used_capacity = torch.sum(exp_mask, dim=(0, 1)) # [n_exp]
+            # mask rank to only include tokens that are selected
+            # perform a sum so each row only contains index of token
+            # for the expert that is selected in that row
+            # result is a matrix that contains the position of each token
+            # in the batch of its corresponding expert
+            exp_rank = torch.sum(exp_mask * exp_rank, dim=-1)  # [k, B * T]
+            # mask probabilities to only include selected experts
+            router_probs = router_probs.view(num_tokens, self.n_exp)[None, :] # [1, B * T, n_exp]
+            exp_weights = exp_mask * router_probs # [k, B * T, n_exp]
+            # convert rank into one-hot vectors over the available capacity
+            # stores the position of each token within the capacity of the selected expert
+            exp_rank_sc = F.one_hot(exp_rank, num_classes=exp_capacity) # [k, B * T, exp_capacity]
+            # create a vector that stores, for each token, the weight of selected
+            # experts at token's position in the capacity of that expert
+            # size of tensor is [B * T, n_exp, exp_capacity]
+            cb_weight = torch.sum(exp_weights.unsqueeze(3) * exp_rank_sc.unsqueeze(2), dim=0)
+            sec_mask = cb_weight.bool() # binary mask of selected experts for each token
+            return used_capacity, cb_weight, sec_mask
+    def compute_aux_loss(self, expert_probs: torch.Tensor, indices: torch.Tensor):
+        """
+        Computes Switch Transformer auxiliary loss (https://arxiv.org/abs/2101.03961)
+        See equations (4)-(6) on page 7
+        """
+        # equation (5): compute ratio of tokens allocated to each expert
+        # total number of tokens is defined as total tokens in batch * k
+        # (k = 1) for the Switch Transformer
+        with torch.no_grad():
+            one_hot_indices = F.one_hot(indices, num_classes=self.n_exp)  # [B, T, k, n_exp]
+            one_hot_indices = torch.sum(one_hot_indices.float(), dim=2)  # [B, T, n_exp] (sum over k dimension)
+            tokens_per_expert = torch.mean(one_hot_indices.float(), dim=(0, 1))
+        # equation (6): compute ratio of router probability allocated to each expert
+        prob_per_expert = torch.mean(expert_probs.float(), dim=(0, 1))
+        # equation (4): take a scaled dot product between prob/token allocation vectors
+        # multiply the result by the number of experts
+        return self.n_exp * torch.sum(prob_per_expert * tokens_per_expert)
+    def compute_router_z_loss(self, logits: torch.Tensor):
+        """
+        Computes ST-MoE router z loss (https://arxiv.org/abs/2202.08906)
+        See equation (5) on page 7
+        """
+        # exponentiate logits, sum logits of each expert, take log, and square
+        # code below is the same as:
+        # > z_loss = torch.exp(logits)
+        # > z_loss = torch.sum(z_loss, dim=-1)
+        # > z_loss = torch.log(z_loss) ** 2.0
+        z_loss = torch.logsumexp(logits, dim=-1) ** 2.0  # [B, T, n_exp]
+        # sum over all tokens and divide by total number of tokens
+        return torch.mean(z_loss)
+    def get_capacity(self, tokens_per_batch):
+        # expert capacity is given by (tokens_per_batch / num_experts) * capacity_factor
+        # see eq (3) in Switch Transformer (https://arxiv.org/abs/2101.03961)
+        capacity_factor = self.train_capacity if self.training else self.eval_capacity
+        capacity = math.floor(self.top_k * capacity_factor * tokens_per_batch / self.n_exp)
+        capacity += capacity % 2
+        capacity = max(capacity, self.min_capacity)
+        assert capacity > 0
+        return int(capacity)
+# FEEDFORWARD
+class MLP(nn.Module):
+    def __init__(self, config, ffn_dim=None):
+        super().__init__()
+        if ffn_dim==None:
+            ffn_dim = config.ffn_dim
+        self.fc1 = nn.Linear(config.n_embd, 2 * ffn_dim, bias=config.bias)
+        self.fc1.is_swiglu = True
+        self.swish = nn.SiLU()
+        self.fc2 = nn.Linear(ffn_dim, config.n_embd, bias=config.bias)
+        self.fc2.output_proj_marker = True
+        self.dropout1 = nn.Dropout(config.dropout)
+        self.dropout2 = nn.Dropout(config.dropout)
+        # nn.init.xavier_uniform_(self.fc1.weight, gain=math.sqrt(2.0))
+        # nn.init.xavier_uniform_(self.fc2.weight, gain=1.0)
+    def forward(self, x):
+        x = self.fc1(x)
+        # Inline SwiGLU: Split the doubled dimension and apply gate
+        x, gate = x.chunk(2, dim=-1)
+        x = x * self.swish(gate)
+        x = self.dropout1(x)
+        x = self.fc2(x)
+        return self.dropout2(x)
+class MLPExperts(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.n_exp = config.n_exp
+        self.n_embd = config.n_embd
+        self.bias = config.bias
+        self.c_fc = nn.Parameter(torch.empty(self.n_exp, self.n_embd, 2 * config.expert_dim))
+        self.c_proj = nn.Parameter(torch.empty(self.n_exp, config.expert_dim, self.n_embd))
+        self.swish = nn.SiLU()
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x):
+        x = torch.bmm(x, self.c_fc)
+        x, gate = x.chunk(2, dim=-1)
+        x = x * self.swish(gate)
+        x = torch.bmm(x, self.c_proj)
+        return self.dropout(x)
+class MOELayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.router = Router(config) # (noisy) top k router
+        self.experts = MLPExperts(config) # group of MLPs (experts)
+        self.shared_expert = MLP(config, ffn_dim=config.shared_dim)
+    def forward(self, x: torch.Tensor):
+        B, T, n_embd = x.size()
+        num_tokens = (B * T)
+        shared_out = self.shared_expert(x)
+        used_capacity, exp_weight, exp_mask = self.router(x)
+        x = x.view(num_tokens, n_embd)
+        # [n_exp, exp_capacity, B * T] * [B * T, n_embd] -> [n_exp, exp_capacity, n_embd]
+        exp_batches = exp_mask.permute(1, 2, 0).type_as(x) @ x
+        exp_out = self.experts(exp_batches) # [n_exp, exp_capacity, n_embd]
+        # aggregate expert outputs based on router weights
+        # eq (2) on page 4 of ST-MoE (https://arxiv.org/abs/2202.08906)
+        # similar equations are used for other MoE papers
+        exp_weight = exp_weight.view(num_tokens, -1) # [B * T, n_exp * exp_capacity]
+        exp_out = exp_out.view(-1, n_embd) # [n_exp * exp_capacity, n_embd]
+        output = exp_weight @ exp_out # [B * T, n_embd]
+        moe_out = output.view(B, T, n_embd)
+        return moe_out + shared_out
+class Block(nn.Module):
+    def __init__(self, config, use_moe=False):
+        super().__init__()
+        self.ln_1 = nn.RMSNorm(config.n_embd)
+        self.attn = MultiHeadLatentAttention(config)
+        self.ln_2 = nn.RMSNorm(config.n_embd)
+        if use_moe:
+            self.mlp = MOELayer(config)
+        else:
+            self.mlp = MLP(config)
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+@dataclass
+class GPTConfig:
+    block_size: int = 2048
+    vocab_size: int = 50304
+    n_layer: int = 24
+    n_head: int = 10
+    n_embd: int = 640
+    dropout: float = 0.0
+    ffn_dim: int = 640*4
+    bias: bool = False
+    # MLA - High Efficiency
+    d_c: int = 192
+    d_c1: int = 192
+    d_rotate: int = 64
+    theta: float = 10000.0
+    # MoE - Maximally Smart
+    n_exp: int = 12
+    top_k: int = 3
+    expert_dim: int = 640
+    shared_dim: int = 640
+    stride: int = 2
+    # Stability (Standard Production Settings)
+    use_aux_loss: bool = True
+    use_router_z_loss: bool = True
+    use_noisy_top_k: bool = True
+    aux_loss_weight: float = 0.01
+    router_z_loss_weight: float = 0.001
+    train_capacity: float = 1.25
+    eval_capacity: float = 2.0
+    min_capacity: int = 4
+    use_switch_tfm_init: bool = True
+    switch_tfm_init_scale: float = 1.0
+    router_use_full_prec: bool = True
+    # Training Hyperparameters
+    batch_size: int = 8
+    grad_acc: int = 128
+    num_train_epochs: int = 1
+    learning_rate: float = 3e-4
+    weight_decay: float = 0.1
+    betas: tuple = (0.9, 0.95)
+    warm_up: int = 5000
+    eos_token_id = 0
+    bos_token_id = 0
+    pad_token_id = 0
+class HybridOptimizer(torch.optim.Optimizer):
+        def __init__(self, optimizers):
+            self.optimizers = optimizers
+            self.param_groups = []
+            for opt in self.optimizers:
+                self.param_groups.extend(opt.param_groups)
+        def step(self, closure=None):
+            loss = None
+            if closure is not None:
+                loss = closure()
+            for opt in self.optimizers:
+                opt.step()
+            return loss
+        def zero_grad(self, set_to_none=True):
+            for opt in self.optimizers:
+                opt.zero_grad(set_to_none=set_to_none)
+        def state_dict(self):
+            return [opt.state_dict() for opt in self.optimizers]
+        def load_state_dict(self, state_dict):
+            for opt, sd in zip(self.optimizers, state_dict):
+                opt.load_state_dict(sd)
+class GPT(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.vocab_size is not None
+        assert config.block_size is not None
+        self.config = config
+        self.can_return_loss = True
+        self.accepts_loss_kwargs = False
+        if config.n_exp == 1:
+            blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
+        else:
+            blocks = []
+            for i in range(config.n_layer):
+                use_moe = False if (i < config.stride or i > config.n_layer - config.stride-1)  else True
+                blocks.append(Block(config, use_moe=use_moe))
+            blocks = nn.ModuleList(blocks)
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            h = blocks,
+            ln_f = nn.RMSNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.transformer.wte.weight = self.lm_head.weight
+        self.apply(self._init_weights)
+        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
+    def get_num_params(self, non_embedding=True):
+        n_params = sum(p.numel() for p in self.parameters())
+        return n_params
+    @torch.no_grad()
+    def _init_weights(self, module):
+        # Setup base configuration
+        scale = self.config.switch_tfm_init_scale if hasattr(self.config, 'switch_tfm_init_scale') else 1.0
+        n_layer = self.config.n_layer
+        if isinstance(module, nn.Linear):
+            # Calculate standard fan-in (input dimension)
+            w_fan_in = module.weight.shape[-1]
+            base_std = (scale / w_fan_in) ** 0.5
+            # Determine specific scaling per layer type
+            if hasattr(module, 'router_marker'):
+                # Small std for routers ensures balanced initial expert distribution
+                final_std = 0.01
+            elif hasattr(module, 'output_proj_marker'):
+                # Residual scaling: keeps variance from exploding in deep networks
+                final_std = base_std / math.sqrt(2 * n_layer)
+            elif hasattr(module, 'is_attention'):
+                # Attn weights often benefit from a slight dampener
+                final_std = base_std * 0.7
+            else:
+                # Standard hidden/up-projections
+                final_std = base_std
+            # Apply truncated normal initialization
+            torch.nn.init.trunc_normal_(
+                module.weight, mean=0.0, std=final_std, a=-2*final_std, b=2*final_std
+            )
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        # Handling custom Parameter-based MLPExperts
+        elif isinstance(module, MLPExperts):
+            # UP-PROJECTION (c_fc)
+            c_fc_fan_in = module.c_fc.shape[-2]
+            final_fc_std = (scale / c_fc_fan_in) ** 0.5
+            torch.nn.init.trunc_normal_(module.c_fc, std=final_fc_std, a=-2*final_fc_std, b=2*final_fc_std)
+            # DOWN-PROJECTION (c_proj)
+            c_proj_fan_in = module.c_proj.shape[-2]
+            # Residual scaling for MoE outputs
+            final_proj_std = ((scale / c_proj_fan_in) ** 0.5) / math.sqrt(2 * n_layer)
+            torch.nn.init.trunc_normal_(module.c_proj, std=final_proj_std, a=-2*final_proj_std, b=2*final_proj_std)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        # elif isinstance(module, nn.RMSNorm):
+        #     # Initializing to 0.01 as requested
+        #     # Note: 1.0 is standard, 0.01 will significantly dampen initial signal
+        #     torch.nn.init.constant_(module.weight, 1.0)
+    def forward(self, input_ids, labels=None, attention_mask=None, **kwargs):
+        _, t = input_ids.size()
+        assert t <= self.config.block_size, f"Sequence length {t} exceeds block size {self.config.block_size}"
+        x = self.transformer.wte(input_ids)
+        for block in self.transformer.h:
+            x = block(x)
+        x = self.transformer.ln_f(x)
+        if labels is not None:
+            logits = self.lm_head(x)
+            shift_logits = logits[:, :-1, :].contiguous()
+            shift_labels = labels[:, 1:].contiguous()
+            # print("\n\nlabel: ", shift_labels, "\ninput: ", input_ids)
+            loss_fct = nn.CrossEntropyLoss(
+                ignore_index=-100,
+                label_smoothing=0.1,
+                reduction='mean'
+            )
+            main_loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1)
+            )
+            loss = main_loss
+            if self.config.n_exp > 1:
+                if self.config.use_aux_loss:
+                    loss += self.config.aux_loss_weight * MANAGER.aggregate_aux_loss()
+                    MANAGER.reset_aux_loss()
+                if self.config.use_router_z_loss:
+                    loss += self.config.router_z_loss_weight * MANAGER.aggregate_router_z_loss()
+                    MANAGER.reset_router_z_loss()
+        else:
+            logits = self.lm_head(x[:, [-1], :])
+            loss = None
+        return CausalLMOutput(loss=loss, logits=logits)
+    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
+        # TODO: add expert config
+        # start with all of the candidate parameters
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        # filter out those that do not require grad
+        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
+        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
+        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
+        # add an extra check for "bias" string to account for bias terms in MoE layers
+        decay_params = [p for n, p in param_dict.items() if (p.dim() >= 2 and not n.endswith('bias'))]
+        nodecay_params = [p for n, p in param_dict.items() if (p.dim() < 2 or n.endswith('bias'))]
+        optim_groups = [
+            {'params': decay_params, 'weight_decay': weight_decay},
+            {'params': nodecay_params, 'weight_decay': 0.0}
+        ]
+        num_decay_params = sum(p.numel() for p in decay_params)
+        num_nodecay_params = sum(p.numel() for p in nodecay_params)
+        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+        # Create AdamW optimizer and use the fused version if it is available
+        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+        use_fused = fused_available and device_type == 'cuda'
+        extra_args = dict(fused=True) if use_fused else dict()
+        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
+        print(f"using fused AdamW: {use_fused}")
+        return optimizer
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+        for _ in range(max_new_tokens):
+            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+            # Correctly unpack the dataclass output
+            outputs = self(idx_cond)
+            logits = outputs.logits[:, -1, :] / temperature
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            probs = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim=1)
+        return idx

full_diagnostics.png ADDED Viewed

Git LFS Details

SHA256: 138d2ab19d71b3409861ca53a592139bc2abf00bdd81d159b5480b940f84c73c
Pointer size: 131 Bytes
Size of remote file: 706 kB

load.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from huggingface_hub import login, upload_folder
+login()
+upload_folder(folder_path=".", repo_id="firdavsus/LLM_D4", repo_type="model")

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f0add8314988fa54def48bd806d136a4c2fd890195571b0937efafadfa56b61
+size 1027863691

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:671e16a44344b218958d6bea35956ab658a5cf6e1df653fcd89ba32555fea3fa
+size 513935755

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d652bae1a3c19ced43ea6aa1b59b7afd879cb5f065bc818a85dfcd440c8e7d85
+size 14645

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85b4acf0da0ea190f06b9e0da812d798a2d827423827367c72b771685bde7cee
+size 1465

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": "<|endoftext|>"
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc1189daa1d8edc864213640a8c99b39c6c20b57d961e66a0efd1b89958b652f
+size 5841

training_curves_with_eval.png ADDED Viewed

Git LFS Details

SHA256: dd4f439debee6d335e719327b3b630981875fc4a2ca232a61653bf16a89c9c7e
Pointer size: 131 Bytes
Size of remote file: 267 kB

weight_histograms.png ADDED Viewed

Git LFS Details

SHA256: 45df541c5eba2a1651eebaf31521dd77d8d73cbc465873a5826a0c7227d9c6f3
Pointer size: 131 Bytes
Size of remote file: 422 kB