talphaidze committed on
Commit
1b8457b
·
verified ·
1 Parent(s): 898c171

Update modeling.py

Browse files
Files changed (1) hide show
  1. modeling.py +396 -103
modeling.py CHANGED
@@ -1,142 +1,420 @@
1
  from transformers import PreTrainedModel
2
- from .configuration import MoLMConfig
 
 
 
 
 
 
3
  import torch
4
  import torch.nn as nn
5
  from torch.nn import functional as F
 
6
  from transformers.utils import ModelOutput
7
- from .gpt import GPTBase
8
- from typing import Optional, List
9
- from dataclasses import dataclass
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  @dataclass
13
  class Output(ModelOutput):
14
  logits: torch.FloatTensor = None
15
  loss: Optional[torch.FloatTensor] = None
16
- expert_losses: Optional[List] = None
17
- loss_to_log: Optional[float] = None
18
 
 
 
19
 
20
- class MoLM(PreTrainedModel):
21
- config_class = MoLMConfig
22
 
23
- def __init__(self, config, expert_weights=None, dropout=0.1, use_router=False):
24
- """
25
- Constructor for the MoLM (Mixture of Language Models) class.
 
26
 
27
- :param config: The configuration of the model (should be a PretrainedConfig object)
28
- :param expert_weights: (Optional) A list of weights for each expert to load pre-trained weights (should match the number of experts)
29
- :param dropout: Dropout rate for the model
30
- :param use_router: Flag to indicate whether to use routing (currently not implemented)
31
- """
32
- super(MoLM, self).__init__(config)
33
-
34
- # Number of experts
35
- self.num_experts = config.num_experts
36
- print(f"Number of experts: {self.num_experts}")
37
- print(f"Expert configurations: {config.expert_configs}")
38
- assert len(config.expert_configs) == self.num_experts, "Number of expert configurations must match num_experts in config."
39
- self.expert_configs = config.expert_configs
40
-
41
- # Flag for routing (not implemented yet)
42
- self.use_router = use_router
43
-
44
- # Initialize experts using the provided configurations
45
- self.experts = nn.ModuleList([GPTBase(config=self.expert_configs[i]) for i in range(self.num_experts)])
46
-
47
- # Load pre-trained weights if provided
48
- if expert_weights is not None:
49
- for i, expert in enumerate(self.experts):
50
- expert.load_state_dict(expert_weights[i], strict=False)
51
- expert.transformer.wte.weight = torch.nn.Parameter(expert.transformer.wte.weight.clone())
52
- for param in expert.parameters():
53
- param.requires_grad = False
54
-
55
- def forward(self, input_ids, attention_mask=None, targets=None, date=None, masking_enabled=True, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  """
57
- Forward pass for the MoLM model, passing input through all experts and averaging their outputs.
58
-
59
- :param input_ids: Input token IDs (batch_size, seq_len)
60
- :param attention_mask: Attention mask (batch_size, seq_len)
61
- :param targets: Target labels for calculating loss (batch_size, seq_len)
62
- :param date: A tensor indicating which experts to use. Each sample in the batch can have a different date.
63
- :param masking_enabled: Whether or not to perform expert masking (True/False)
64
- :param kwargs: Additional arguments
65
- :return: The averaged output of all active experts up to the specified date for each sample in the batch
66
  """
67
- device = input_ids.device
68
- b, t = input_ids.size()
 
 
69
 
70
- # Ensure the sequence length doesn't exceed the configured block size
71
- assert t <= self.config.sequence_length, f"Cannot forward sequence of length {t}, block size is only {self.config.sequence_length}"
 
 
 
 
 
72
 
73
- # If date is None, set a default value (e.g., 6 for all samples)
 
 
 
 
 
 
74
  if date is None:
 
75
  date = torch.full((1, b), 6, dtype=torch.long, device=device).squeeze(0)
76
- elif isinstance(date, int):
77
- # If date is an integer, set it for all samples in the batch
78
  date = (date - 2013) // 2 + 1
79
  date = torch.full((1, b), date, dtype=torch.long, device=device).squeeze(0)
80
- elif isinstance(date, torch.Tensor):
81
- # Ensure the tensor has the correct shape (batch_size,)
82
- assert date.size(0) == b, "The size of date tensor must match the batch size."
83
- date = date.to(device)
84
 
85
- # Get outputs from each expert
86
- expert_outputs = []
87
- expert_losses = []
 
 
 
88
 
89
- # Track the number of active experts for each sample in the batch
90
- active_experts_count = torch.zeros(b, dtype=torch.long, device=device)
 
 
91
 
92
- # Pass input through each expert
93
- for i, expert in enumerate(self.experts):
94
- # Masking logic based on date (for each sample in the batch)
95
- expert_mask = date >= i # Mask experts where date < i (i.e., deactivate them)
 
 
 
96
 
97
- # Expand the expert_mask to match the logits shape (batch_size, 1, 1)
98
- expert_mask_expanded = expert_mask.unsqueeze(-1).unsqueeze(-1).float()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- expert_output = expert(input_ids, targets=targets, date=date, get_logits=True, **kwargs)
 
 
 
 
 
 
 
 
 
 
101
 
102
- logits = expert_output["logits"]
103
- loss_to_log = expert_output["loss_to_log"]
104
 
105
- # Mask out the outputs for deactivated experts
106
- logits = logits * expert_mask_expanded # Apply the mask (zero out logits for inactive experts)
 
 
 
 
 
107
 
108
- # Only append logits from active experts
109
- expert_outputs.append(logits)
110
- expert_losses.append(loss_to_log)
 
111
 
112
- # Update active expert count for each sample
113
- active_experts_count += expert_mask.long() # Ensure type consistency by converting `expert_mask` to Long
 
 
 
114
 
115
- # Stack the logits and calculate the mean for each sample across the active experts
116
- expert_outputs = torch.stack(expert_outputs, dim=0) # Shape: (num_experts, batch_size, seq_len, vocab_size)
117
-
118
- # Calculate the sum across the active experts for each sample and then average
119
- summed_logits = torch.sum(expert_outputs, dim=0) # Sum across active experts
120
- combined_logits = summed_logits / active_experts_count.unsqueeze(-1).unsqueeze(-1) # Divide by the number of active experts
 
 
 
 
 
 
 
 
 
121
 
122
- # Calculate the loss if targets are provided
123
- if targets is not None:
124
- loss = F.cross_entropy(combined_logits.view(-1, combined_logits.size(-1)), targets.view(-1), ignore_index=-1)
125
- loss_to_log = loss.item()
126
- else:
127
- loss = None
128
- loss_to_log = None
129
 
130
- return Output(
131
- logits=combined_logits,
132
- loss=loss,
133
- loss_to_log=loss_to_log,
134
- expert_losses=expert_losses
 
 
 
 
 
 
135
  )
136
 
 
 
 
 
 
137
 
138
  @torch.no_grad()
139
- def generate(self, input_ids, max_new_tokens, date=None, temperature=1.0, top_k=None):
140
  """
141
  Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
142
  the sequence max_new_tokens times, feeding the predictions back into the model each time.
@@ -165,13 +443,13 @@ class MoLM(PreTrainedModel):
165
  # append sampled index to the running sequence and continue
166
  idx = torch.cat((idx, idx_next), dim=1)
167
  # check if we hit the end of the sequence
168
- if idx_next.item() == 50526:
169
  break
170
 
171
  return idx
172
 
173
  @torch.no_grad()
174
- def generate_from_string(self, in_str, max_new_tokens, date=None, temperature=1.0, top_k=None):
175
  idx = (
176
  torch.tensor(
177
  self.tokenizer.encode(in_str, allowed_special={"<|endoftext|>"})
@@ -185,4 +463,19 @@ class MoLM(PreTrainedModel):
185
  .to("cpu")
186
  .numpy()
187
  )
188
- return self.tokenizer.decode(out_idx)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from transformers import PreTrainedModel
2
+ from configuration import MoEGPTConfig
3
+ # importa anche MoE, MaskedMoE, TimeDependantMoE ecc.
4
+ import math
5
+ import inspect
6
+ from typing import Optional, Dict, Any
7
+ from dataclasses import dataclass
8
+ import tiktoken
9
  import torch
10
  import torch.nn as nn
11
  from torch.nn import functional as F
12
+ from huggingface_hub import PyTorchModelHubMixin
13
  from transformers.utils import ModelOutput
 
 
 
14
 
15
 
16
+ from moe import (
17
+ #ExpertChoiceMoE,
18
+ MaskedMoE,
19
+ TimeDependantMoE,
20
+ MoE,
21
+ )
22
+
23
+ from aux_losses import (
24
+ entropy_reg,
25
+ load_balancing_loss,
26
+ router_z_loss,
27
+ )
28
+
29
@dataclass
class Output(ModelOutput):
    """Result container for MoEGPTForCausalLM.forward.

    Fields:
        logits: LM logits over the vocabulary (or None if get_logits=False).
        loss: cross-entropy loss when targets were supplied, else None.
        aux_losses: per-name router auxiliary losses (may be empty).
        router_logits: stacked per-layer router logits, or None without MoE.
    """

    logits: torch.FloatTensor = None
    loss: Optional[torch.FloatTensor] = None
    aux_losses: Optional[Dict[str, torch.FloatTensor]] = None
    router_logits: Optional[torch.FloatTensor] = None

    def __repr__(self):
        fields = (self.logits, self.loss, self.aux_losses, self.router_logits)
        return "Output(logits={}, loss={}, aux_losses={}, router_logits={})".format(*fields)
44
 
45
class LayerNorm(nn.Module):
    """Layer normalization with an optional bias term.

    torch.nn.LayerNorm cannot disable only its bias, hence this wrapper:
    the scale parameter is always present, the shift parameter is optional.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, x):
        # default eps of 1e-5, normalized over the trailing `ndim` dims
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
55
+
56
+ class CausalSelfAttention(nn.Module):
57
+ def __init__(self, config):
58
+ super().__init__()
59
+ assert config.n_embd % config.n_head == 0
60
+ # key, query, value projections for all heads, but in a batch
61
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
62
+ # output projection
63
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
64
+ # regularization
65
+ self.attn_dropout = nn.Dropout(config.dropout)
66
+ self.resid_dropout = nn.Dropout(config.dropout)
67
+ self.n_head = config.n_head
68
+ self.n_embd = config.n_embd
69
+ self.dropout = config.dropout
70
+ # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
71
+ self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
72
+ if not self.flash:
73
+ print(
74
+ "WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0"
75
+ )
76
+ # causal mask to ensure that attention is only applied to the left in the input sequence
77
+ self.register_buffer(
78
+ "bias",
79
+ torch.tril(
80
+ torch.ones(config.sequence_length, config.sequence_length)
81
+ ).view(1, 1, config.sequence_length, config.sequence_length),
82
+ )
83
+
84
+ def forward(self, x):
85
+ # batch size, sequence length, embedding dimensionality (n_embd)
86
+ (
87
+ B,
88
+ T,
89
+ C,
90
+ ) = x.size()
91
+
92
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
93
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
94
+ # (B, T, nh, hs)
95
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
96
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
97
+
98
+ # (B, nh, T, hs)
99
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
100
+
101
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
102
+ if self.flash:
103
+ # efficient attention using Flash Attention CUDA kernels
104
+ y = torch.nn.functional.scaled_dot_product_attention(
105
+ q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True
106
+ )
107
+ else:
108
+ # manual implementation of attention
109
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
110
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
111
+ att = F.softmax(att, dim=-1)
112
+ att = self.attn_dropout(att)
113
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
114
+ y = (
115
+ y.transpose(1, 2).contiguous().view(B, T, C)
116
+ ) # re-assemble all head outputs side by side
117
+
118
+ # output projection
119
+ y = self.resid_dropout(self.c_proj(y))
120
+ return y
121
+
122
+
123
+ class MLP(nn.Module):
124
+ def __init__(self, config):
125
+ super().__init__()
126
+ self.dim_exp_factor = int(config.mlp_dim_exp_factor * 4)
127
+
128
+ self.c_fc = nn.Linear(
129
+ config.n_embd, self.dim_exp_factor * config.n_embd, bias=config.bias
130
+ )
131
+ self.c_proj = nn.Linear(
132
+ self.dim_exp_factor * config.n_embd, config.n_embd, bias=config.bias
133
+ )
134
+ self.dropout = nn.Dropout(config.dropout)
135
+ self.activation = nn.GELU()
136
+
137
+ def forward(self, x):
138
+ x = self.c_fc(x)
139
+ x = self.activation(x)
140
+ x = self.c_proj(x)
141
+ x = self.dropout(x)
142
+ # need to return same type as the MoE block, but in this case it's empty
143
+ return x, {}
144
+
145
+
146
class Block(nn.Module):
    """One transformer block: causal self-attention plus a feed-forward
    (dense MLP or MoE), each in a pre-LayerNorm residual connection.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.moe_config = config.moe_routing
        if config.moe:
            if config.moe_routing == "standard_gating":
                self.mlp = MoE(config, MLP)
            elif config.moe_routing == "masked":
                self.mlp = TimeDependantMoE(config, MLP)
            else:
                # BUG FIX: previously read `config.routing`, which does not
                # exist and raised AttributeError instead of this message.
                raise ValueError(f"Unknown routing: {config.moe_routing}")
        else:
            self.mlp = MLP(config)

    def forward(self, x, date, *args, **kwargs):
        """Return (hidden states, routing-info dict; empty for dense MLP)."""
        x = x + self.attn(self.ln_1(x, *args, **kwargs))
        if self.moe_config == "masked":
            # time-dependent MoE needs the per-sample date for expert masking
            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs), date)
        else:
            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs))
        x = x + x_
        return x, logits_and_experts
173
+
174
+
175
+ class MoEGPTForCausalLM(PreTrainedModel):
176
+ config_class = MoEGPTConfig
177
    def __init__(self, config):
        """Build the MoE-GPT: token/position embeddings, transformer blocks,
        and an LM head weight-tied to the token embeddings.

        :param config: MoEGPTConfig providing vocab_size, sequence_length,
            n_embd, n_layer, dropout, bias, moe_routing, etc.
        """
        super().__init__(config)
        assert config.vocab_size is not None
        assert config.sequence_length is not None
        self.config = config
        # GPT-2 BPE tokenizer, used by generate_from_string()
        self.tokenizer = tiktoken.get_encoding("gpt2")
        self.base_model_prefix = "timoe"

        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.vocab_size, config.n_embd),
                wpe=nn.Embedding(config.sequence_length, config.n_embd),
                drop=nn.Dropout(config.dropout),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=LayerNorm(config.n_embd, bias=config.bias),
            )
        )

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = (
            self.lm_head.weight
        )  # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith("c_proj.weight"):
                torch.nn.init.normal_(
                    p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer)
                )
            if pn.endswith("router.weight"):
                # special scaled init to moe router?
                # NOTE(review): normalizes the router weight to sum to 1 along
                # one dim, then rescales back to the pre-normalization std —
                # assumes MoE blocks expose a `router` submodule; confirm in moe.py.
                with torch.no_grad():
                    dim = 1 if config.moe_routing == "standard_gating" else 0
                    std = p.std()
                    p.div_(p.sum(dim=dim, keepdim=True))
                    p.mul_(std / p.std())
219
+
220
+ def get_router_losses(self, logits, selected_experts, eval=False):
221
+ # logits: (b * seq_len, n_experts)
222
+ # selected_experts: (b * seq_len, topk)
223
+ if eval: # eval mode, compute all losses
224
+ return {
225
+ "moe_entropy_loss": entropy_reg(logits),
226
+ "moe_aux_loss": load_balancing_loss(logits, selected_experts),
227
+ "moe_z_loss": router_z_loss(logits),
228
+ }
229
+ if self.config.moe_router_loss == "entropy":
230
+ return {
231
+ "moe_entropy_loss": entropy_reg(logits),
232
+ }
233
+ elif self.config.moe_router_loss == "load_balancing_only":
234
+ return {
235
+ "moe_aux_loss": load_balancing_loss(logits, selected_experts),
236
+ }
237
+ elif self.config.moe_router_loss == "load_balancing_z_loss":
238
+ return {
239
+ "moe_aux_loss": load_balancing_loss(logits, selected_experts),
240
+ "moe_z_loss": router_z_loss(logits),
241
+ }
242
+ return {}
243
+
244
+ def get_num_params(self, non_embedding=True):
245
  """
246
+ Return the number of parameters in the model.
247
+ For non-embedding count (default), the position embeddings get subtracted.
248
+ The token embeddings would too, except due to the parameter sharing these
249
+ params are actually used as weights in the final layer, so we include them.
 
 
 
 
 
250
  """
251
+ n_params = sum(p.numel() for p in self.parameters())
252
+ if non_embedding:
253
+ n_params -= self.transformer.wpe.weight.numel()
254
+ return n_params
255
 
256
+ def _init_weights(self, module):
257
+ if isinstance(module, nn.Linear):
258
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
259
+ if module.bias is not None:
260
+ torch.nn.init.zeros_(module.bias)
261
+ elif isinstance(module, nn.Embedding):
262
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
263
 
264
+ def forward(self, idx, date=None, targets=None, attention_mask=None, get_logits=True, moe=False):
265
+ device = idx.device
266
+ b, t = idx.size()
267
+ assert (
268
+ t <= self.config.sequence_length
269
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.sequence_length}"
270
+ # shape (1, t)
271
  if date is None:
272
+ # set all the date to 6
273
  date = torch.full((1, b), 6, dtype=torch.long, device=device).squeeze(0)
274
+ else:
 
275
  date = (date - 2013) // 2 + 1
276
  date = torch.full((1, b), date, dtype=torch.long, device=device).squeeze(0)
277
+ pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
 
 
 
278
 
279
+ # forward the GPT model itself
280
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
281
+ pos_emb = self.transformer.wpe(
282
+ pos
283
+ ) # position embeddings of shape (1, t, n_embd)
284
+ x = self.transformer.drop(tok_emb + pos_emb)
285
 
286
+ # router logits is a list for each layer's routing, each of shape (b * seq_len, n_experts)
287
+ router_logits = []
288
+ # experts is a list for each layer's selected experts, shape (b * seq_len, topk)
289
+ experts = []
290
 
291
+ # forward pass through all the transformer blocks
292
+ for block in self.transformer.h:
293
+ x, logits_and_experts = block(x, date)
294
+ if len(logits_and_experts) > 0:
295
+ router_logits.append(logits_and_experts["router_logits"])
296
+ experts.append(logits_and_experts["selected_experts"])
297
+ x = self.transformer.ln_f(x)
298
 
299
+ # aux_losses is a dict with keys for different auxiliary losses
300
+ aux_losses = {}
301
+
302
+ if targets is not None:
303
+ # if we are given some desired targets also calculate the loss
304
+ logits = self.lm_head(x)
305
+ loss = F.cross_entropy(
306
+ logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
307
+ )
308
+ if moe and (self.config.moe_routing == "standard_gating" or self.config.moe_routing == "masked"):
309
+ # calculate the router losses per layer
310
+ for logit, expert_choice in zip(router_logits, experts):
311
+ router_losses = self.get_router_losses(
312
+ logit, expert_choice, eval=not self.training
313
+ )
314
+ for k, v in router_losses.items():
315
+ aux_losses[k] = aux_losses.get(k, 0.0) + v
316
+ if self.training:
317
+ loss += (
318
+ v
319
+ * getattr(self.config, k + "_factor")
320
+ / self.config.n_layer
321
+ )
322
+ else:
323
+ # inference-time mini-optimization: only forward the lm_head on the very last position
324
+ logits = self.lm_head(
325
+ #x[:, [-1], :]
326
+ x
327
+ ) # note: using list [-1] to preserve the time dim
328
+ loss = None
329
+ logits = logits if get_logits else None
330
+ router_logits = (
331
+ torch.stack(router_logits, dim=0) if len(router_logits) > 0 else None
332
+ )
333
+ # return {
334
+ # "logits": logits,
335
+ # "loss": loss,
336
+ # "aux_losses": aux_losses,
337
+ # "router_logits": router_logits,
338
+ # }
339
+ return Output(logits = logits, loss = loss, aux_losses = aux_losses, router_logits = router_logits)
340
 
341
+ def crop_sequence_length(self, sequence_length):
342
+ # model surgery to decrease the block size if necessary
343
+ # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
344
+ # but want to use a smaller block size for some smaller, simpler model
345
+ assert sequence_length <= self.config.sequence_length
346
+ self.config.sequence_length = sequence_length
347
+ self.transformer.wpe.weight = nn.Parameter(
348
+ self.transformer.wpe.weight[:sequence_length]
349
+ )
350
+ for block in self.transformer.h:
351
+ block.attn.bias = block.attn.bias[:, :, :sequence_length, :sequence_length]
352
 
 
 
353
 
354
+ def get_parameter_group_specs(self):
355
+ """
356
+ This long function is unfortunately doing something very simple and is being very defensive:
357
+ We are separating out all parameters of the model into two buckets: those that will experience
358
+ weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
359
+ We are then returning the PyTorch optimizer object.
360
+ """
361
 
362
+ # separate out all parameters to those that will and won't experience regularizing weight decay
363
+ decay = set()
364
+ no_decay = set()
365
+ whitelist_weight_modules = (torch.nn.Linear,)
366
 
367
+ BLACKLIST_WEIGHT_MODULES = (
368
+ torch.nn.LayerNorm,
369
+ LayerNorm,
370
+ torch.nn.Embedding,
371
+ )
372
 
373
+ for mn, m in self.named_modules():
374
+ for pn, p in m.named_parameters():
375
+ fpn = "%s.%s" % (mn, pn) if mn else pn # full param name
376
+ # random note: because named_modules and named_parameters are recursive
377
+ # we will see the same tensors p many many times. but doing it this way
378
+ # allows us to know which parent module any tensor p belongs to...
379
+ if pn.endswith("bias"):
380
+ # all biases will not be decayed
381
+ no_decay.add(fpn)
382
+ elif pn.endswith("weight") and isinstance(m, whitelist_weight_modules):
383
+ # weights of whitelist modules will be weight decayed
384
+ decay.add(fpn)
385
+ elif pn.endswith("weight") and isinstance(m, BLACKLIST_WEIGHT_MODULES):
386
+ # weights of blacklist modules will NOT be weight decayed
387
+ no_decay.add(fpn)
388
 
389
+ # subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they
390
+ # will appear in the no_decay and decay sets respectively after the above.
391
+ # In addition, because named_parameters() doesn't return duplicates, it
392
+ # will only return the first occurence, key'd by 'transformer.wte.weight', below.
393
+ # so let's manually remove 'lm_head.weight' from decay set. This will include
394
+ # this tensor into optimization via transformer.wte.weight only, and not decayed.
395
+ decay.remove("lm_head.weight")
396
 
397
+ # validate that we considered every parameter
398
+ param_dict = {pn: p for pn, p in self.named_parameters()}
399
+ inter_params = decay & no_decay
400
+ union_params = decay | no_decay
401
+ assert (
402
+ len(inter_params) == 0
403
+ ), "parameters %s made it into both decay/no_decay sets!" % (str(inter_params),)
404
+ assert (
405
+ len(param_dict.keys() - union_params) == 0
406
+ ), "parameters %s were not separated into either decay/no_decay set!" % (
407
+ str(param_dict.keys() - union_params),
408
  )
409
 
410
+ # create the pytorch optimizer object
411
+ return [
412
+ {"params": sorted(list(decay))},
413
+ {"params": sorted(list(no_decay)), "weight_decay": 0.0},
414
+ ]
415
 
416
  @torch.no_grad()
417
+ def generate(self, input_ids, max_new_tokens, date = None, temperature=1.0, top_k=None):
418
  """
419
  Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
420
  the sequence max_new_tokens times, feeding the predictions back into the model each time.
 
443
  # append sampled index to the running sequence and continue
444
  idx = torch.cat((idx, idx_next), dim=1)
445
  # check if we hit the end of the sequence
446
+ if idx_next.item() == self.tokenizer.eot_token:
447
  break
448
 
449
  return idx
450
 
451
  @torch.no_grad()
452
+ def generate_from_string(self, in_str, max_new_tokens, date = None, temperature=1.0, top_k=None):
453
  idx = (
454
  torch.tensor(
455
  self.tokenizer.encode(in_str, allowed_special={"<|endoftext|>"})
 
463
  .to("cpu")
464
  .numpy()
465
  )
466
+ return self.tokenizer.decode(out_idx).split(in_str)[-1]
467
+
468
+
469
+ def get_input_embeddings(self):
470
+ return self.transformer.wte
471
+
472
+ def set_input_embeddings(self, new_embeddings):
473
+ self.transformer.wte = new_embeddings
474
+ # reset the lm_head to use the new embeddings
475
+ # this is necessary because the lm_head is tied to the input embeddings
476
+ self.lm_head = nn.Linear(
477
+ self.config.n_embd, new_embeddings.weight.shape[0] , bias=False
478
+ )
479
+ #self.transformer.wte.weight = (
480
+ # self.lm_head.weight
481
+ #)