Upload custom config and model files
- .gitattributes +2 -0
- README.md +10 -0
- __init__.py +2 -0
- aux_losses.py +88 -0
- config.json +89 -0
- configuration.py +51 -0
- merges.txt +0 -0
- modeling.py +465 -0
- moe.py +134 -0
- special_tokens_map.json +5 -0
- tokenizer.json +0 -0
- tokenizer_config.json +20 -0
- vocab.json +0 -0
- wandb/debug-internal.log +8 -0
- wandb/debug.log +22 -0
- wandb/run-20250410_080613-kly9kjv7/files/config.yaml +41 -0
- wandb/run-20250410_080613-kly9kjv7/files/output.log +76 -0
- wandb/run-20250410_080613-kly9kjv7/files/requirements.txt +208 -0
- wandb/run-20250410_080613-kly9kjv7/files/wandb-metadata.json +57 -0
- wandb/run-20250410_080613-kly9kjv7/files/wandb-summary.json +1 -0
- wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log +14 -0
- wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log +16 -0
- wandb/run-20250410_080613-kly9kjv7/logs/debug.log +23 -0
- wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb +3 -0
- wandb/run-20250410_080940-pqshro55/files/output.log +17 -0
- wandb/run-20250410_080940-pqshro55/files/requirements.txt +208 -0
- wandb/run-20250410_080940-pqshro55/files/wandb-metadata.json +57 -0
- wandb/run-20250410_080940-pqshro55/logs/debug-core.log +8 -0
- wandb/run-20250410_080940-pqshro55/logs/debug-internal.log +8 -0
- wandb/run-20250410_080940-pqshro55/logs/debug.log +22 -0
- wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,10 @@
---
tags:
- model_hub_mixin
- pytorch_model_hub_mixin
---

This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
- Code: [More Information Needed]
- Paper: [More Information Needed]
- Docs: [More Information Needed]
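Since config.json in this commit maps the transformers Auto classes to the custom configuration.py and modeling.py via "auto_map", loading a checkpoint from this repo needs trust_remote_code=True. A minimal loading sketch (the repo id is a placeholder, not part of this commit):

from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "<user>/<repo>"  # hypothetical repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)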
__init__.py
ADDED
@@ -0,0 +1,2 @@
from .configuration import MoEGPTConfig
from .modeling import MoEGPTForCausalLM
aux_losses.py
ADDED
@@ -0,0 +1,88 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


def log_mean(x, dim):
    return torch.logsumexp(x, dim=dim) - torch.log(
        torch.tensor(x.shape[dim], dtype=torch.float32)
    )


def entropy_reg(logits: torch.Tensor, mean_over_batch: bool = True):
    """Entropy regularization for the router."""

    entropy_l = lambda l: -(l * l.exp()).sum(-1)
    # softmax over experts
    # logits: [batch_size * sequence_length, num_experts]
    logprobs = F.log_softmax(logits, dim=-1)
    if mean_over_batch:
        # take mean probability over batch
        logprobs = log_mean(logprobs, 0)

    return -entropy_l(logprobs).mean()


# two losses below are adapted from
# https://github.com/google/flaxformer/blob/b725bd2a51d70e866d819c92de166fbf24425e6a/flaxformer/architectures/moe/routing.py
def load_balancing_loss(logits: torch.Tensor, expert_indices: torch.Tensor) -> float:
    """Computes auxiliary load balancing loss as in Switch Transformer.

    See Switch Transformer (https://arxiv.org/abs/2101.03961). This function
    implements the loss function presented in equations (4) - (6). It aims to
    penalize those cases where the routing between experts is unbalanced.

    Args:
      logits: logits assigned to each expert per token. Shape:
        <float32>[batch_size * sequence_length, num_experts].
      expert_indices: <int>[batch_size * sequence_length, num_selected_experts]
        indices identifying the top num_selected_experts for a given token.

    Returns:
      The auxiliary loss.
    """
    # num_token = batch_size * sequence_length
    num_token, num_experts = logits.shape

    # Shape: [batch_size * sequence_length, num_selected_experts, num_experts].
    expert_mask = F.one_hot(expert_indices, num_experts)
    # For a given token, determine if it was routed to a given expert.
    # Shape: [batch_size * sequence_length, num_experts]
    expert_mask, _ = torch.max(expert_mask, dim=-2)

    # shape [num_experts]
    tokens_per_expert = torch.mean(expert_mask, dim=0, dtype=torch.float32)

    # compute router probability per expert in log space for numerical stability
    logprobs = F.log_softmax(logits, dim=-1)
    # take mean probability over batch
    # shape [num_experts]
    logprobs = log_mean(logprobs, dim=0)
    router_prob_per_expert = torch.exp(logprobs)
    return (
        torch.mean(  # mean over experts
            tokens_per_expert * router_prob_per_expert,
            dtype=torch.float32,
        )
        * num_experts
    )


def router_z_loss(router_logits: torch.Tensor) -> float:
    """Compute router z-loss.

    The router z-loss was introduced in Designing Effective Sparse Expert Models
    (https://arxiv.org/abs/2202.08906). It encourages router logits to remain
    small in an effort to improve stability.

    Args:
      router_logits: <float>[batch_size * sequence_length, num_experts]
        router logits

    Returns:
      Scalar router z-loss.
    """
    num_tokens, _ = router_logits.shape
    log_z = torch.logsumexp(router_logits, dim=-1)
    z_loss = log_z**2
    return torch.sum(z_loss, dtype=torch.float32) / (num_tokens)
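A quick sanity check of the three losses on random router outputs; a sketch, assuming aux_losses.py is importable as a top-level module (shapes follow the docstrings above):

import torch
from aux_losses import entropy_reg, load_balancing_loss, router_z_loss

num_tokens, num_experts, top_k = 8, 4, 2
logits = torch.randn(num_tokens, num_experts)
# top-k expert indices per token, as a router would select them
_, selected_experts = torch.topk(logits, top_k, dim=-1)

print(entropy_reg(logits))                            # entropy regularizer (scalar)
print(load_balancing_loss(logits, selected_experts))  # near 1.0 when routing is balanced
print(router_z_loss(logits))                          # grows with logit magnitude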
config.json
ADDED
@@ -0,0 +1,89 @@
{
  "return_dict": true,
  "output_hidden_states": false,
  "output_attentions": false,
  "torchscript": false,
  "torch_dtype": null,
  "use_bfloat16": false,
  "tf_legacy_loss": false,
  "pruned_heads": {},
  "tie_word_embeddings": true,
  "chunk_size_feed_forward": 0,
  "is_encoder_decoder": false,
  "is_decoder": false,
  "cross_attention_hidden_size": null,
  "add_cross_attention": false,
  "tie_encoder_decoder": false,
  "max_length": 20,
  "min_length": 0,
  "do_sample": false,
  "early_stopping": false,
  "num_beams": 1,
  "num_beam_groups": 1,
  "diversity_penalty": 0.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "typical_p": 1.0,
  "repetition_penalty": 1.0,
  "length_penalty": 1.0,
  "no_repeat_ngram_size": 0,
  "encoder_no_repeat_ngram_size": 0,
  "bad_words_ids": null,
  "num_return_sequences": 1,
  "output_scores": false,
  "return_dict_in_generate": false,
  "forced_bos_token_id": null,
  "forced_eos_token_id": null,
  "remove_invalid_values": false,
  "exponential_decay_length_penalty": null,
  "suppress_tokens": null,
  "begin_suppress_tokens": null,
  "architectures": [
    "MoEGPTForCausalLM"
  ],
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "tokenizer_class": null,
  "prefix": null,
  "bos_token_id": null,
  "pad_token_id": null,
  "eos_token_id": null,
  "sep_token_id": null,
  "decoder_start_token_id": null,
  "task_specific_params": null,
  "problem_type": null,
  "_name_or_path": "",
  "_attn_implementation_autoset": false,
  "transformers_version": "4.51.0",
  "batch_size": 16,
  "vocab_size": 50304,
  "n_embd": 768,
  "n_layer": 12,
  "n_head": 12,
  "sequence_length": 1024,
  "moe": true,
  "moe_routing": "standard_gating",
  "moe_num_experts": 6,
  "moe_num_experts_per_tok": 2,
  "moe_softmax_order": "softmax_topk",
  "moe_router_loss": "load_balancing_z_loss",
  "moe_aux_loss_factor": 0.01,
  "moe_z_loss_factor": 1.0,
  "mlp_dim_exp_factor": 1.0,
  "dropout": 0.0,
  "bias": false,
  "auto_map": {
    "AutoConfig": "configuration.MoEGPTConfig",
    "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
    "AutoTokenizer": "GPT2TokenizerFast"
  },
  "model_type": "moegpt"
}
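For context, the MoE keys above say each token is routed to 2 of 6 experts with 4x-wide expert MLPs. A back-of-the-envelope count of the expert weights this implies (a sketch; it ignores attention, embeddings, and the router):

n_embd, exp_factor = 768, 4                       # "n_embd"; "mlp_dim_exp_factor" of 1.0 gives the usual 4x MLP
per_expert = 2 * n_embd * (exp_factor * n_embd)   # c_fc + c_proj weight matrices
print(per_expert)        # 4718592 parameters per expert
print(6 * per_expert)    # stored across "moe_num_experts": 6
print(2 * per_expert)    # active per token, "moe_num_experts_per_tok": 2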
configuration.py
ADDED
@@ -0,0 +1,51 @@
from transformers import PretrainedConfig

class MoEGPTConfig(PretrainedConfig):
    model_type = "moegpt"

    def __init__(
        self,
        vocab_size=50304,
        n_embd=768,
        n_layer=12,
        n_head=12,
        sequence_length=1024,
        moe=False,
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,
        moe_z_loss_factor=1.0,
        mlp_dim_exp_factor=1.0,
        dropout=0.0,
        bias=False,
        architectures=["MoEGPTForCausalLM"],
        auto_map={
            "AutoConfig": "configuration.MoEGPTConfig",
            "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
            "AutoTokenizer": "GPT2TokenizerFast"
        },
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
        self.architectures = architectures
        self.auto_map = auto_map
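A config matching the values in config.json above could then be built directly; a minimal sketch:

config = MoEGPTConfig(
    moe=True,
    moe_num_experts=6,        # config.json overrides the default of 4
    moe_num_experts_per_tok=2,
)
print(config.model_type)      # "moegpt"
print(config.moe_num_experts, config.moe_num_experts_per_tok)  # 6 2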
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
modeling.py
ADDED
@@ -0,0 +1,465 @@
from transformers import PreTrainedModel
from .configuration import MoEGPTConfig
# also imports MoE, MaskedMoE, TimeDependantMoE, etc.
import math
import inspect
from typing import Optional, Dict, Any
from dataclasses import dataclass
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from huggingface_hub import PyTorchModelHubMixin
from transformers.utils import ModelOutput


from .moe import (
    #ExpertChoiceMoE,
    MaskedMoE,
    TimeDependantMoE,
    MoE,
)

from .aux_losses import (
    entropy_reg,
    load_balancing_loss,
    router_z_loss,
)

# class Output(ModelOutput):
#     def __init__(self, logits, loss=None, aux_losses=None, router_logits=None):
#         self.logits = logits
#         self.loss = loss
#         self.aux_losses = aux_losses
#         self.router_logits = router_logits
@dataclass
class Output(ModelOutput):
    logits: torch.FloatTensor = None
    loss: Optional[torch.FloatTensor] = None
    aux_losses: Optional[Dict[str, torch.FloatTensor]] = None
    router_logits: Optional[torch.FloatTensor] = None

    def __repr__(self):
        return f"Output(logits={self.logits}, loss={self.loss}, aux_losses={self.aux_losses}, router_logits={self.router_logits})"


class LayerNorm(nn.Module):
    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)


class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
        if not self.flash:
            print(
                "WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0"
            )
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer(
                "bias",
                torch.tril(
                    torch.ones(config.sequence_length, config.sequence_length)
                ).view(1, 1, config.sequence_length, config.sequence_length),
            )

    def forward(self, x):
        # batch size, sequence length, embedding dimensionality (n_embd)
        (
            B,
            T,
            C,
        ) = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, nh, hs)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True
            )
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = (
            y.transpose(1, 2).contiguous().view(B, T, C)
        )  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dim_exp_factor = int(config.mlp_dim_exp_factor * 4)

        self.c_fc = nn.Linear(
            config.n_embd, self.dim_exp_factor * config.n_embd, bias=config.bias
        )
        self.c_proj = nn.Linear(
            self.dim_exp_factor * config.n_embd, config.n_embd, bias=config.bias
        )
        self.dropout = nn.Dropout(config.dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        x = self.c_fc(x)
        x = self.activation(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        # need to return same type as the MoE block, but in this case it's empty
        return x, {}


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.moe_config = config.moe_routing
        if config.moe:
            if config.moe_routing == "standard_gating":
                self.mlp = MoE(config, MLP)
            elif config.moe_routing == "masked":
                self.mlp = TimeDependantMoE(config, MLP)
            #elif config.moe_routing == "expert_choice":
            #    self.mlp = ExpertChoiceMoE(config, MLP)
            else:
                raise ValueError(f"Unknown routing: {config.moe_routing}")
        else:
            self.mlp = MLP(config)

    def forward(self, x, date, *args, **kwargs):
        x = x + self.attn(self.ln_1(x, *args, **kwargs))
        if self.moe_config == "masked":
            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs), date)
        else:
            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs))
        x = x + x_
        return x, logits_and_experts


class MoEGPTForCausalLM(PreTrainedModel):
    config_class = MoEGPTConfig

    def __init__(self, config):
        super().__init__(config)
        assert config.vocab_size is not None
        assert config.sequence_length is not None
        self.config = config
        self.tokenizer = tiktoken.get_encoding("gpt2")

        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.vocab_size, config.n_embd),
                wpe=nn.Embedding(config.sequence_length, config.n_embd),
                drop=nn.Dropout(config.dropout),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=LayerNorm(config.n_embd, bias=config.bias),
            )
        )

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = (
            self.lm_head.weight
        )  # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith("c_proj.weight"):
                torch.nn.init.normal_(
                    p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer)
                )
            if pn.endswith("router.weight"):
                # special scaled init to moe router?
                with torch.no_grad():
                    dim = 1 if config.moe_routing == "standard_gating" else 0
                    std = p.std()
                    p.div_(p.sum(dim=dim, keepdim=True))
                    p.mul_(std / p.std())

    def get_router_losses(self, logits, selected_experts, eval=False):
        # logits: (b * seq_len, n_experts)
        # selected_experts: (b * seq_len, topk)
        if eval:  # eval mode, compute all losses
            return {
                "moe_entropy_loss": entropy_reg(logits),
                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
                "moe_z_loss": router_z_loss(logits),
            }
        if self.config.moe_router_loss == "entropy":
            return {
                "moe_entropy_loss": entropy_reg(logits),
            }
        elif self.config.moe_router_loss == "load_balancing_only":
            return {
                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
            }
        elif self.config.moe_router_loss == "load_balancing_z_loss":
            return {
                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
                "moe_z_loss": router_z_loss(logits),
            }
        return {}

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, date=None, targets=None, get_logits=True, moe=False):
        device = idx.device
        b, t = idx.size()
        assert (
            t <= self.config.sequence_length
        ), f"Cannot forward sequence of length {t}, block size is only {self.config.sequence_length}"
        # shape (1, t)
        if date is None:
            # set all the date to 6
            date = torch.full((1, b), 6, dtype=torch.long, device=device).squeeze(0)
        else:
            date = (date - 2013) // 2 + 1
            date = torch.full((1, b), date, dtype=torch.long, device=device).squeeze(0)
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(
            pos
        )  # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)

        # router logits is a list for each layer's routing, each of shape (b * seq_len, n_experts)
        router_logits = []
        # experts is a list for each layer's selected experts, shape (b * seq_len, topk)
        experts = []

        # forward pass through all the transformer blocks
        for block in self.transformer.h:
            x, logits_and_experts = block(x, date)
            if len(logits_and_experts) > 0:
                router_logits.append(logits_and_experts["router_logits"])
                experts.append(logits_and_experts["selected_experts"])
        x = self.transformer.ln_f(x)

        # aux_losses is a dict with keys for different auxiliary losses
        aux_losses = {}

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
            )
            if moe and (self.config.moe_routing == "standard_gating" or self.config.moe_routing == "masked"):
                # calculate the router losses per layer
                for logit, expert_choice in zip(router_logits, experts):
                    router_losses = self.get_router_losses(
                        logit, expert_choice, eval=not self.training
                    )
                    for k, v in router_losses.items():
                        aux_losses[k] = aux_losses.get(k, 0.0) + v
                        if self.training:
                            loss += (
                                v
                                * getattr(self.config, k + "_factor")
                                / self.config.n_layer
                            )
        else:
            # inference-time path: the last-position-only mini-optimization
            # (x[:, [-1], :], which would preserve the time dim) is disabled,
            # so the lm_head is applied to the full sequence
            logits = self.lm_head(
                #x[:, [-1], :]
                x
            )
            loss = None
        logits = logits if get_logits else None
        router_logits = (
            torch.stack(router_logits, dim=0) if len(router_logits) > 0 else None
        )
        # return {
        #     "logits": logits,
        #     "loss": loss,
        #     "aux_losses": aux_losses,
        #     "router_logits": router_logits,
        # }
        return Output(logits=logits, loss=loss, aux_losses=aux_losses, router_logits=router_logits)

    def crop_sequence_length(self, sequence_length):
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert sequence_length <= self.config.sequence_length
        self.config.sequence_length = sequence_length
        self.transformer.wpe.weight = nn.Parameter(
            self.transformer.wpe.weight[:sequence_length]
        )
        for block in self.transformer.h:
            block.attn.bias = block.attn.bias[:, :, :sequence_length, :sequence_length]

    def get_parameter_group_specs(self):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear,)

        BLACKLIST_WEIGHT_MODULES = (
            torch.nn.LayerNorm,
            LayerNorm,
            torch.nn.Embedding,
        )

        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = "%s.%s" % (mn, pn) if mn else pn  # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith("bias"):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith("weight") and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith("weight") and isinstance(m, BLACKLIST_WEIGHT_MODULES):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they
        # will appear in the no_decay and decay sets respectively after the above.
        # In addition, because named_parameters() doesn't return duplicates, it
        # will only return the first occurrence, key'd by 'transformer.wte.weight', below.
        # so let's manually remove 'lm_head.weight' from decay set. This will include
        # this tensor into optimization via transformer.wte.weight only, and not decayed.
        decay.remove("lm_head.weight")

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert (
            len(inter_params) == 0
        ), "parameters %s made it into both decay/no_decay sets!" % (str(inter_params),)
        assert (
            len(param_dict.keys() - union_params) == 0
        ), "parameters %s were not separated into either decay/no_decay set!" % (
            str(param_dict.keys() - union_params),
        )

        # create the pytorch optimizer object
        return [
            {"params": sorted(list(decay))},
            {"params": sorted(list(no_decay)), "weight_decay": 0.0},
        ]

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens, date=None, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        idx = input_ids
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at sequence_length
            idx_cond = (
                idx
                if idx.size(1) <= self.config.sequence_length
                else idx[:, -self.config.sequence_length :]
            )
            # forward the model to get the logits for the index in the sequence
            logits = self(idx_cond, date, get_logits=True).logits
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("Inf")
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
            # check if we hit the end of the sequence
            if idx_next.item() == self.tokenizer.eot_token:
                break

        return idx

    @torch.no_grad()
    def generate_from_string(self, in_str, max_new_tokens, date=None, temperature=1.0, top_k=None):
        idx = (
            torch.tensor(
                self.tokenizer.encode(in_str, allowed_special={"<|endoftext|>"})
            )
            .view(1, -1)
            .to(self.lm_head.weight.device)
        )
        out_idx = (
            self.generate(idx, max_new_tokens, date, temperature, top_k)
            .view(-1)
            .to("cpu")
            .numpy()
        )
        return self.tokenizer.decode(out_idx).split(in_str)[-1]
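Once instantiated, the model can be sampled through the generate helper above; a minimal sketch with a CPU-sized config and an untrained model, so the output tokens are meaningless:

import torch
config = MoEGPTConfig(n_layer=2, n_head=2, n_embd=64, sequence_length=128, moe=True)
model = MoEGPTForCausalLM(config).eval()

idx = torch.randint(0, config.vocab_size, (1, 8))       # dummy prompt tokens
out = model.generate(idx, max_new_tokens=5, date=2023)  # date is remapped internally
print(out.shape)  # (1, <=13): prompt plus up to 5 sampled tokens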
moe.py
ADDED
@@ -0,0 +1,134 @@
"""
Simple MoE routing implementations that replace the MLP block in a standard transformer.
References:
1) Mistral Source for Mixtral MoEs:
https://github.com/mistralai/mistral-src
2) ST-MoE:
https://arxiv.org/abs/2202.08906
3) Our notepad of MoE resources:
https://docs.google.com/document/d/1NuQ5jr7V-Jv1ui7p4KrxO_JTz-7bpYcYMmh49EeJ-QA/edit?usp=sharing
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import bisect


class MoE(nn.Module):
    """
    Simplest MoE implementation with a linear router and softmax over experts.

    Note that in this implementation, we simply loop over the experts and
    aggregate the results. This is not the most efficient way to do it, but
    it also avoids the large memory overhead _and_ has no token dropping
    (because we do not need the capacity factor).
    """

    def __init__(self, config, mlp):
        super().__init__()
        assert config.moe_num_experts > 0
        self.experts = nn.ModuleList(
            [mlp(config=config) for _ in range(config.moe_num_experts)]
        )
        self.router = nn.Linear(config.n_embd, config.moe_num_experts, bias=False)
        self.top_k = config.moe_num_experts_per_tok
        self.softmax_order = config.moe_softmax_order

    def forward(self, inputs: torch.Tensor):
        # [batch_size * sequence_length, n_embd]
        inputs_squashed = inputs.view(-1, inputs.shape[-1])
        # [batch_size * sequence_length, num_experts]
        router_logits = self.router(inputs_squashed)

        # note that selected experts will be the same for all orders:
        # softmax doesn't change top-k, but the weights are different
        if self.softmax_order == "softmax_topk":
            all_probs = F.softmax(router_logits, dim=1)
            weights, selected_experts = torch.topk(all_probs, self.top_k)
        elif self.softmax_order == "topk_softmax":
            weights, selected_experts = torch.topk(router_logits, self.top_k)
            weights = F.softmax(weights, dim=-1)
        else:
            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")

        results = torch.zeros_like(inputs_squashed)
        # naive looping over experts
        for i, expert in enumerate(self.experts):
            batch_idx, nth_expert = torch.where(selected_experts == i)
            output, _ = expert(inputs_squashed[batch_idx])
            results[batch_idx] += weights[batch_idx, nth_expert, None] * output

        # return results and router logits (for aux loss calculation later)
        return results.view_as(inputs), {
            "router_logits": router_logits,
            "selected_experts": selected_experts,
        }


class DummyExpert(nn.Module):
    def __init__(self, output_size: int):
        super().__init__()
        self._output_size = output_size

    def forward(self, inputs: torch.Tensor):
        out = torch.zeros((self._output_size,), device=inputs.device)
        return out, {}


class MaskedMoE(MoE):
    def __init__(self, config, mlp):
        super().__init__(config, mlp)
        self._sequence_length = config.sequence_length
        self.experts.append(DummyExpert(config.n_embd))
        self.router = nn.Linear(config.n_embd, config.moe_num_experts + 1, bias=False)

    def forward(self, inputs: torch.Tensor, mask: torch.Tensor):
        seq_len = inputs.shape[1]
        inputs_squashed = inputs.view(-1, inputs.shape[-1])
        router_logits = self.router(inputs_squashed)
        mask = torch.cat(
            (mask, torch.ones((mask.shape[0], 1), device=mask.device)),
            dim=1
        )
        mask = mask.repeat_interleave(seq_len, dim=0)
        router_logits = router_logits * mask

        # note that selected experts will be the same for all orders:
        # softmax doesn't change top-k, but the weights are different
        if self.softmax_order == "softmax_topk":
            all_probs = F.softmax(router_logits, dim=1)
            weights, selected_experts = torch.topk(all_probs, self.top_k)
        elif self.softmax_order == "topk_softmax":
            weights, selected_experts = torch.topk(router_logits, self.top_k)
            weights = F.softmax(weights, dim=-1)
        else:
            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")

        results = torch.zeros_like(inputs_squashed)
        # naive looping over experts
        for i, expert in enumerate(self.experts):
            batch_idx, nth_expert = torch.where(selected_experts == i)
            output, _ = expert(inputs_squashed[batch_idx])
            results[batch_idx] += weights[batch_idx, nth_expert, None] * output

        # return results and router logits (for aux loss calculation later)
        return results.view_as(inputs), {
            "router_logits": router_logits,
            "selected_experts": selected_experts,
        }


class TimeDependantMoE(nn.Module):
    def __init__(self, config, mlp):
        super().__init__()
        self._num_experts = config.moe_num_experts
        self._mask_moe = MaskedMoE(config, mlp)

    def forward(self, x, date):
        mask_date = torch.zeros(x.shape[0], self._num_experts).to(x.device)
        range_tensor = torch.arange(self._num_experts).unsqueeze(0).to(x.device)
        mask_date = (range_tensor < date.unsqueeze(1)).float()
        return self._mask_moe(x, mask_date)
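The (output, router-info) contract of MoE can be exercised with a stub expert; a sketch (TinyExpert and the SimpleNamespace config are stand-ins for illustration, not part of this repo):

import torch
from types import SimpleNamespace

class TinyExpert(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc = torch.nn.Linear(config.n_embd, config.n_embd)
    def forward(self, x):
        return self.fc(x), {}  # same (output, dict) contract as MLP

cfg = SimpleNamespace(n_embd=16, moe_num_experts=4,
                      moe_num_experts_per_tok=2,
                      moe_softmax_order="softmax_topk")
moe = MoE(cfg, TinyExpert)
x = torch.randn(2, 8, cfg.n_embd)        # (batch, seq_len, n_embd)
out, aux = moe(x)
print(out.shape)                         # torch.Size([2, 8, 16])
print(aux["router_logits"].shape)        # torch.Size([16, 4]): one row per token
print(aux["selected_experts"].shape)     # torch.Size([16, 2]): top-2 experts per token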
special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,20 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "50256": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": {},
  "model_max_length": 1024,
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
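These tokenizer files are the standard GPT-2 set (vocab.json, merges.txt, tokenizer.json), so the tokenizer loads through the usual API; a sketch with a placeholder repo id:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<user>/<repo>")  # hypothetical repo id
print(tokenizer.eos_token, tokenizer.model_max_length)      # <|endoftext|> 1024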
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.
wandb/debug-internal.log
ADDED
@@ -0,0 +1,8 @@
{"time":"2025-04-10T08:09:40.545238228Z","level":"INFO","msg":"using version","core version":"0.19.1"}
{"time":"2025-04-10T08:09:40.545267808Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-core.log"}
{"time":"2025-04-10T08:09:40.655286034Z","level":"INFO","msg":"created new stream","id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655321987Z","level":"INFO","msg":"stream: started","id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655358183Z","level":"INFO","msg":"sender: started","stream_id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655363065Z","level":"INFO","msg":"writer: Do: started","stream_id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655386983Z","level":"INFO","msg":"handler: started","stream_id":"pqshro55"}
{"time":"2025-04-10T08:09:40.92678924Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log
ADDED
@@ -0,0 +1,22 @@
2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Configure stats pid to 18854
2025-04-10 08:09:40,529 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from environment variables
2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug.log
2025-04-10 08:09:40,531 INFO MainThread:18854 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():644] calling init triggers
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
config: {}
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():680] starting backend
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():684] sending inform_init request
2025-04-10 08:09:40,538 INFO MainThread:18854 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2025-04-10 08:09:40,539 INFO MainThread:18854 [wandb_init.py:init():697] backend started and connected
2025-04-10 08:09:40,540 INFO MainThread:18854 [wandb_init.py:init():790] updated telemetry
2025-04-10 08:09:40,553 INFO MainThread:18854 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
2025-04-10 08:09:40,912 INFO MainThread:18854 [wandb_init.py:init():874] starting run threads in backend
2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_console_start():2374] atexit reg
2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2224] redirect: wrap_raw
2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2289] Wrapping output streams.
2025-04-10 08:09:41,235 INFO MainThread:18854 [wandb_run.py:_redirect():2314] Redirects installed.
2025-04-10 08:09:41,238 INFO MainThread:18854 [wandb_init.py:init():916] run started, returning control to user process
wandb/run-20250410_080613-kly9kjv7/files/config.yaml
ADDED
@@ -0,0 +1,41 @@
_wandb:
  value:
    cli_version: 0.19.1
    m: []
    python_version: 3.10.16
    t:
      "1":
      - 1
      - 5
      - 11
      - 41
      - 49
      - 51
      - 53
      - 55
      - 71
      - 98
      - 100
      "2":
      - 1
      - 5
      - 11
      - 41
      - 49
      - 51
      - 53
      - 55
      - 71
      - 98
      - 100
      "3":
      - 13
      - 23
      - 55
      "4": 3.10.16
      "5": 0.19.1
      "6": 4.51.0
      "8":
      - 5
      "12": 0.19.1
      "13": linux-x86_64
wandb/run-20250410_080613-kly9kjv7/files/output.log
ADDED
@@ -0,0 +1,76 @@
2025-04-10:08:06:24 INFO [__main__:422] Selected Tasks: ['commonsense_qa', 'hellaswag', 'lambada', 'openbookqa', 'sciq']
2025-04-10:08:06:24 INFO [evaluator:180] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-10:08:06:24 INFO [evaluator:218] Initializing hf model, with arguments: {'pretrained': 'robinfaro/GPT2-1B-base', 'trust_remote_code': True, 'force_download': True}
2025-04-10:08:06:24 INFO [models.huggingface:136] Using device 'cuda:0'
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 2.95MB/s]
configuration.py: 100%|███████████████████| 1.65k/1.65k [00:00<00:00, 5.80MB/s]
2025-04-10:08:06:25 INFO [models.huggingface:504] Model type cannot be determined. Using default model type 'causal'
2025-04-10:08:06:26 INFO [models.huggingface:377] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 1.60MB/s]
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 2.83MB/s]
configuration.py: 100%|███████████████████| 1.65k/1.65k [00:00<00:00, 3.04MB/s]
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 2.81MB/s]
modeling.py: 100%|███████████████████| 19.6k/19.6k [00:00<00:00, 31.7MB/s]
aux_losses.py: 100%|███████████████████| 3.15k/3.15k [00:00<00:00, 6.84MB/s]
moe.py: 100%|███████████████████| 5.39k/5.39k [00:00<00:00, 12.2MB/s]
model.safetensors: 100%|███████████████████| 6.23G/6.23G [00:19<00:00, 316MB/s]
Some weights of MoEGPTForCausalLM were not initialized from the model checkpoint at robinfaro/GPT2-1B-base and are newly initialized: ['transformer.wte.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
generation_config.json: 100%|███████████████████| 69.0/69.0 [00:00<00:00, 157kB/s]
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/commonsense_qa.py'
2025-04-10:08:07:11 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/commonsense_qa.py'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/.huggingface.yaml'
2025-04-10:08:07:12 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/.huggingface.yaml'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/dataset_infos.json'
2025-04-10:08:07:13 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/dataset_infos.json'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/openbookqa.py'
2025-04-10:08:07:29 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/openbookqa.py'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/.huggingface.yaml'
2025-04-10:08:07:29 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/.huggingface.yaml'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/dataset_infos.json'
2025-04-10:08:07:32 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/dataset_infos.json'
2025-04-10:08:07:38 INFO [api.task:426] Building contexts for sciq on rank 0...
100%|███████████████████| 1000/1000 [00:01<00:00, 846.01it/s]
2025-04-10:08:07:39 INFO [api.task:426] Building contexts for openbookqa on rank 0...
100%|███████████████████| 500/500 [00:00<00:00, 2241.62it/s]
2025-04-10:08:07:39 INFO [api.task:426] Building contexts for lambada_openai on rank 0...
100%|███████████████████| 5153/5153 [00:08<00:00, 614.90it/s]
2025-04-10:08:07:48 INFO [api.task:426] Building contexts for lambada_standard on rank 0...
100%|███████████████████| 5153/5153 [00:08<00:00, 612.52it/s]
2025-04-10:08:07:56 INFO [api.task:426] Building contexts for hellaswag on rank 0...
100%|███████████████████| 10042/10042 [00:03<00:00, 2514.18it/s]
2025-04-10:08:08:01 INFO [api.task:426] Building contexts for commonsense_qa on rank 0...
100%|███████████████████| 1221/1221 [00:02<00:00, 574.82it/s]
2025-04-10:08:08:04 INFO [evaluator:542] Running loglikelihood requests
Traceback (most recent call last):
  File "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval", line 8, in <module>
    sys.exit(cli_evaluate())
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/__main__.py", line 432, in cli_evaluate
    results = evaluator.simple_evaluate(
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/utils.py", line 439, in _wrapper
    return fn(*args, **kwargs)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/evaluator.py", line 333, in simple_evaluate
    results = evaluate(
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/utils.py", line 439, in _wrapper
    return fn(*args, **kwargs)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/evaluator.py", line 553, in evaluate
    resps = getattr(lm, reqtype)(cloned_reqs)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/api/model.py", line 378, in loglikelihood
    context_enc, continuation_enc = self._encode_pair(context, continuation)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/api/model.py", line 359, in _encode_pair
    context_enc = self.tok_encode(context)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/models/huggingface.py", line 811, in tok_encode
    encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2654, in encode
    encoded_inputs = self.encode_plus(
  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3073, in encode_plus
    return self._encode_plus(
  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py", line 126, in _encode_plus
|
| 69 |
+
return super()._encode_plus(*args, **kwargs)
|
| 70 |
+
File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 613, in _encode_plus
|
| 71 |
+
batched_output = self._batch_encode_plus(
|
| 72 |
+
File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py", line 116, in _batch_encode_plus
|
| 73 |
+
return super()._batch_encode_plus(*args, **kwargs)
|
| 74 |
+
File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 539, in _batch_encode_plus
|
| 75 |
+
encodings = self._tokenizer.encode_batch(
|
| 76 |
+
KeyboardInterrupt
|
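Note: the repeated "[Errno 13] Permission denied" messages above are non-fatal; huggingface_hub only failed to write its ".no_exist" cache markers into the shared /mloscratch/hf_cache directory and continued. A minimal sketch of the usual workaround, redirecting the cache to a writable location before any Hugging Face library is imported (the /tmp path below is a placeholder, not taken from this repo):

import os

# Placeholder path: any directory the evaluation job can write to.
os.environ["HF_HOME"] = "/tmp/hf_cache"

# Import only after HF_HOME is set so the new cache location takes effect.
from datasets import load_dataset

dataset = load_dataset("tau/commonsense_qa", split="validation")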
wandb/run-20250410_080613-kly9kjv7/files/requirements.txt
ADDED
@@ -0,0 +1,208 @@
+wcwidth==0.2.13
+pure_eval==0.2.3
+ptyprocess==0.7.0
+traitlets==5.14.3
+tornado==6.4.1
+pyzmq==26.2.0
+Pygments==2.18.0
+psutil==6.0.0
+prompt_toolkit==3.0.47
+platformdirs==4.3.6
+pexpect==4.9.0
+parso==0.8.4
+nest-asyncio==1.6.0
+executing==2.1.0
+exceptiongroup==1.2.2
+decorator==5.1.1
+debugpy==1.8.5
+matplotlib-inline==0.1.7
+jupyter_core==5.7.2
+jedi==0.19.1
+comm==0.2.2
+asttokens==2.4.1
+stack-data==0.6.3
+jupyter_client==8.6.3
+ipython==8.27.0
+ipykernel==6.29.5
+mpmath==1.3.0
+MarkupSafe==2.1.5
+Jinja2==3.1.4
+wheel==0.45.1
+asttokens==3.0.0
+debugpy==1.8.13
+decorator==5.2.1
+exceptiongroup==1.2.2
+executing==2.1.0
+nest_asyncio==1.6.0
+packaging==24.2
+parso==0.8.4
+pickleshare==0.7.5
+platformdirs==4.3.6
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.1
+setuptools==75.8.2
+six==1.17.0
+tornado==6.4.2
+traitlets==5.14.3
+typing_extensions==4.12.2
+wcwidth==0.2.13
+zipp==3.21.0
+comm==0.2.2
+importlib_metadata==8.6.1
+jedi==0.19.2
+jupyter_core==5.7.2
+matplotlib-inline==0.1.7
+pexpect==4.9.0
+pip==25.0.1
+prompt_toolkit==3.0.50
+python-dateutil==2.9.0.post0
+pyzmq==26.2.1
+stack_data==0.6.3
+ipython==8.33.0
+jupyter_client==8.6.3
+ipykernel==6.29.5
+pytz==2025.1
+lit==18.1.8
+xxhash==3.5.0
+urllib3==2.3.0
+tzdata==2025.1
+tqdm==4.67.1
+smmap==5.0.2
+setproctitle==1.3.5
+regex==2024.11.6
+PyYAML==6.0.2
+pydantic_core==2.27.2
+pyarrow==19.0.1
+protobuf==5.29.3
+propcache==0.3.0
+nvidia-nvtx-cu11==11.7.91
+nvidia-nccl-cu11==2.14.3
+nvidia-curand-cu11==10.2.10.91
+nvidia-cufft-cu11==10.9.0.58
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cublas-cu11==11.10.3.66
+numpy==1.26.4
+networkx==3.4.2
+multidict==6.1.0
+idna==3.10
+fsspec==2024.9.0
+frozenlist==1.5.0
+filelock==3.17.0
+docker-pycreds==0.4.0
+dill==0.3.8
+cmake==3.31.6
+click==8.1.8
+charset-normalizer==3.4.1
+certifi==2025.1.31
+attrs==25.1.0
+async-timeout==5.0.1
+annotated-types==0.7.0
+aiohappyeyeballs==2.4.8
+yarl==1.18.3
+sentry-sdk==2.22.0
+requests==2.32.3
+pydantic==2.10.6
+pandas==2.2.3
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cudnn-cu11==8.5.0.96
+multiprocess==0.70.16
+gitdb==4.0.12
+aiosignal==1.3.2
+tiktoken==0.8.0
+GitPython==3.1.44
+aiohttp==3.11.13
+wandb==0.19.1
+datasets==3.1.0
+nvidia-cusparse-cu11==11.7.5.86
+triton==3.2.0
+nvidia-cusparselt-cu12==0.6.2
+sympy==1.13.1
+nvidia-nvtx-cu12==12.4.127
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nccl-cu12==2.21.5
+nvidia-curand-cu12==10.3.5.147
+nvidia-cufft-cu12==11.2.1.3
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cusolver-cu12==11.6.1.9
+torch==2.6.0
+jmespath==1.0.1
+botocore==1.37.8
+s3transfer==0.11.4
+boto3==1.37.8
+asciitree==0.3.3
+numcodecs==0.13.1
+fasteners==0.19
+zarr==2.18.3
+widgetsnbextension==4.0.13
+jupyterlab_widgets==3.0.13
+ipywidgets==8.1.5
+pyparsing==3.2.2
+pillow==11.1.0
+kiwisolver==1.4.8
+fonttools==4.56.0
+cycler==0.12.1
+contourpy==1.3.1
+matplotlib==3.10.1
+safetensors==0.5.3
+torchvision==0.21.0
+timm==1.0.15
+word2number==1.1
+sqlitedict==2.1.0
+zstandard==0.23.0
+threadpoolctl==3.6.0
+tcolorpy==0.1.7
+tabulate==0.9.0
+scipy==1.15.2
+pybind11==2.13.6
+portalocker==3.1.1
+pathvalidate==3.2.3
+numexpr==2.10.2
+more-itertools==10.6.0
+lxml==5.3.2
+jsonlines==4.0.0
+joblib==1.4.2
+colorama==0.4.6
+chardet==5.2.0
+absl-py==2.2.2
+tqdm-multiprocess==0.0.11
+scikit-learn==1.6.1
+sacrebleu==2.5.1
+nltk==3.9.1
+mbstrdecoder==1.1.4
+huggingface-hub==0.30.1
+typepy==1.3.4
+tokenizers==0.21.1
+rouge_score==0.1.2
+transformers==4.51.0
+accelerate==1.6.0
+peft==0.15.1
+DataProperty==1.1.0
+tabledata==1.3.4
+evaluate==0.4.3
+pytablewriter==1.2.1
+lm_eval==0.4.8
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.2
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
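The list above is the pip environment wandb captured automatically for the run; several packages appear more than once with different pins (asttokens, debugpy, more-itertools), which is typical of a layered conda/pip environment dump rather than a hand-curated requirements file. A small sketch, assuming the list is saved locally as requirements.txt, for checking an environment against these pins:

from importlib.metadata import PackageNotFoundError, version

# Compare installed package versions against the pinned list above.
with open("requirements.txt") as fh:
    for line in fh:
        name, _, pinned = line.strip().partition("==")
        if not name:
            continue  # skip blank lines
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed (pinned {pinned})")
        else:
            if installed != pinned:
                print(f"{name}: installed {installed}, pinned {pinned}")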
wandb/run-20250410_080613-kly9kjv7/files/wandb-metadata.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.16",
+  "startedAt": "2025-04-10T08:06:13.632140Z",
+  "args": [
+    "--model",
+    "hf",
+    "--model_args",
+    "pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
+    "--tasks",
+    "commonsense_qa,openbookqa,hellaswag,lambada,sciq",
+    "--device",
+    "cuda:0",
+    "--batch_size",
+    "32",
+    "--output_path",
+    "outputs/evaluation/base_GPT",
+    "--wandb_args",
+    "project=lm-evaluation,name=base_GPT_intial_weights",
+    "--log_samples"
+  ],
+  "program": "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval",
+  "git": {
+    "remote": "https://github.com/robinfaro/time-moe.git",
+    "commit": "209a56c7746e576430987b33efaad3213c829355"
+  },
+  "email": "robin.faro@epfl.ch",
+  "root": "/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling",
+  "host": "interact-0-0",
+  "executable": "/mloscratch/homes/faro/conda/envs/thesis/bin/python3.10",
+  "cpu_count": 36,
+  "cpu_count_logical": 72,
+  "gpu": "Tesla V100-SXM2-32GB",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "6399114346496",
+      "used": "4521100476416"
+    }
+  },
+  "memory": {
+    "total": "404270809088"
+  },
+  "cpu": {
+    "count": 36,
+    "countLogical": 72
+  },
+  "gpu_nvidia": [
+    {
+      "name": "Tesla V100-SXM2-32GB",
+      "memoryTotal": "34359738368",
+      "cudaCores": 5120,
+      "architecture": "Volta"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
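The "args" array above records the exact lm-evaluation-harness invocation behind this run. As a sketch, the same evaluation expressed through the harness's Python API (the traceback in output.log shows the CLI delegating to simple_evaluate; argument names assume the lm_eval 0.4.8 pinned in requirements.txt):

import lm_eval

# Sketch of the recorded CLI arguments as a simple_evaluate call.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
    tasks=["commonsense_qa", "openbookqa", "hellaswag", "lambada", "sciq"],
    device="cuda:0",
    batch_size=32,
    log_samples=True,
)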
wandb/run-20250410_080613-kly9kjv7/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"_wandb":{"runtime":128}}
wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log
ADDED
@@ -0,0 +1,14 @@
+{"time":"2025-04-10T08:06:13.116700026Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp0n5clybj/port-17759.txt","pid":17759,"debug":false,"disable-analytics":false}
+{"time":"2025-04-10T08:06:13.116732345Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2025-04-10T08:06:13.117250294Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":17759}
+{"time":"2025-04-10T08:06:13.117259873Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44099,"Zone":""}}
+{"time":"2025-04-10T08:06:13.301381011Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:06:13.632552814Z","level":"INFO","msg":"handleInformInit: received","streamId":"kly9kjv7","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:06:13.753061117Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"kly9kjv7","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.003469384Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.003813875Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-04-10T08:08:22.003802722Z","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.003987056Z","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.938316834Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.938355013Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.938384806Z","level":"INFO","msg":"server is closed"}
wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log
ADDED
@@ -0,0 +1,16 @@
+{"time":"2025-04-10T08:06:13.634947325Z","level":"INFO","msg":"using version","core version":"0.19.1"}
+{"time":"2025-04-10T08:06:13.634964986Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log"}
+{"time":"2025-04-10T08:06:13.752969451Z","level":"INFO","msg":"created new stream","id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.753045008Z","level":"INFO","msg":"stream: started","id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.753098809Z","level":"INFO","msg":"handler: started","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.753089038Z","level":"INFO","msg":"writer: Do: started","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.75312827Z","level":"INFO","msg":"sender: started","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:14.101026755Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-04-10T08:08:22.003825637Z","level":"INFO","msg":"stream: closing","id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.003921768Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-04-10T08:08:22.004950039Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-04-10T08:08:22.715736266Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-04-10T08:08:22.936723989Z","level":"INFO","msg":"handler: closed","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.93681582Z","level":"INFO","msg":"sender: closed","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.936820808Z","level":"INFO","msg":"writer: Close: closed","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.938157108Z","level":"INFO","msg":"stream: closed","id":"kly9kjv7"}
wandb/run-20250410_080613-kly9kjv7/logs/debug.log
ADDED
@@ -0,0 +1,23 @@
+2025-04-10 08:06:13,618 INFO MainThread:17759 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Configure stats pid to 17759
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug.log
+2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log
+2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:init():644] calling init triggers
+2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
+config: {}
+2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():680] starting backend
+2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():684] sending inform_init request
+2025-04-10 08:06:13,630 INFO MainThread:17759 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-10 08:06:13,631 INFO MainThread:17759 [wandb_init.py:init():697] backend started and connected
+2025-04-10 08:06:13,634 INFO MainThread:17759 [wandb_init.py:init():790] updated telemetry
+2025-04-10 08:06:13,651 INFO MainThread:17759 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
+2025-04-10 08:06:14,086 INFO MainThread:17759 [wandb_init.py:init():874] starting run threads in backend
+2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_console_start():2374] atexit reg
+2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_redirect():2224] redirect: wrap_raw
+2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_redirect():2289] Wrapping output streams.
+2025-04-10 08:06:14,447 INFO MainThread:17759 [wandb_run.py:_redirect():2314] Redirects installed.
+2025-04-10 08:06:14,450 INFO MainThread:17759 [wandb_init.py:init():916] run started, returning control to user process
+2025-04-10 08:08:22,004 WARNING MsgRouterThr:17759 [router.py:message_loop():75] message_loop has been closed
wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3470404bd4c37d163a54c1a86cb7beac6b443ebd05b979578d2951589ecbc317
+size 185481
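Per the .gitattributes rules in this commit, the binary .wandb run file is stored through Git LFS, so the diff shows only its three-line pointer: the pointer-spec version, the SHA-256 oid of the real object, and its size in bytes (185481, about 185 kB). After cloning, git lfs pull resolves the pointer; a sketch of fetching the file directly with huggingface_hub instead, where "user/repo" is a placeholder since this page does not name the repository:

from huggingface_hub import hf_hub_download

# "user/repo" is a placeholder for this model repository's id.
local_path = hf_hub_download(
    repo_id="user/repo",
    filename="wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb",
)
print(local_path)  # local path to the resolved binary, not the LFS pointer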
wandb/run-20250410_080940-pqshro55/files/output.log
ADDED
@@ -0,0 +1,17 @@
+2025-04-10:08:09:50 INFO [__main__:422] Selected Tasks: ['commonsense_qa', 'hellaswag', 'lambada', 'openbookqa', 'sciq']
+2025-04-10:08:09:50 INFO [evaluator:180] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+2025-04-10:08:09:50 INFO [evaluator:218] Initializing hf model, with arguments: {'pretrained': 'robinfaro/GPT2-1B-base', 'trust_remote_code': True, 'force_download': True}
+2025-04-10:08:09:50 INFO [models.huggingface:136] Using device 'cuda:0'
+2025-04-10:08:09:51 INFO [models.huggingface:504] Model type cannot be determined. Using default model type 'causal'
+2025-04-10:08:09:52 INFO [models.huggingface:377] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
+config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 2.01MB/s]
+config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 1.73MB/s]
+configuration.py: 100%|██████████████████████████████████████████████████████████████████████████████| 1.65k/1.65k [00:00<00:00, 3.86MB/s]
+config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 1.80MB/s]
+modeling.py: 100%|███████████████████████████████████████████████████████████████████████████████████| 19.6k/19.6k [00:00<00:00, 32.6MB/s]
+moe.py: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5.39k/5.39k [00:00<00:00, 12.5MB/s]
+aux_losses.py: 100%|█████████████████████████████████████████████████████████████████████████████████| 3.15k/3.15k [00:00<00:00, 7.68MB/s]
+model.safetensors.index.json: 100%|██████████████████████████████████████████████████████████████████| 22.2k/22.2k [00:00<00:00, 15.7MB/s]
+model-00002-of-00002.safetensors: 100%|██████████████████████████████████████████████████████████████| 1.55G/1.55G [00:30<00:00, 50.3MB/s]
+model-00001-of-00002.safetensors: 29%|██████████████████▏ | 1.47G/5.00G [00:30<01:39, 35.5MB/s]
+model-00001-of-00002.safetensors: 83%|███████████████████████████████████████████████████▊ | 4.17G/5.00G [02:14<00:44, 18.5MB/s]
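This second run re-downloads the custom code files (configuration.py, modeling.py, moe.py, aux_losses.py) and both safetensors shards because the recorded model args include force_download=True; the log breaks off mid-download of the 5.00 GB first shard. A sketch of the same load outside the harness, via the standard transformers API the harness wraps:

from transformers import AutoModelForCausalLM, AutoTokenizer

# trust_remote_code=True is needed because the checkpoint ships its own
# configuration.py / modeling.py; force_download=True re-fetches every file,
# which is what produces the download bars in the log above.
model = AutoModelForCausalLM.from_pretrained(
    "robinfaro/GPT2-1B-base",
    trust_remote_code=True,
    force_download=True,
)
tokenizer = AutoTokenizer.from_pretrained("robinfaro/GPT2-1B-base")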
wandb/run-20250410_080940-pqshro55/files/requirements.txt
ADDED
@@ -0,0 +1,208 @@
+wcwidth==0.2.13
+pure_eval==0.2.3
+ptyprocess==0.7.0
+traitlets==5.14.3
+tornado==6.4.1
+pyzmq==26.2.0
+Pygments==2.18.0
+psutil==6.0.0
+prompt_toolkit==3.0.47
+platformdirs==4.3.6
+pexpect==4.9.0
+parso==0.8.4
+nest-asyncio==1.6.0
+executing==2.1.0
+exceptiongroup==1.2.2
+decorator==5.1.1
+debugpy==1.8.5
+matplotlib-inline==0.1.7
+jupyter_core==5.7.2
+jedi==0.19.1
+comm==0.2.2
+asttokens==2.4.1
+stack-data==0.6.3
+jupyter_client==8.6.3
+ipython==8.27.0
+ipykernel==6.29.5
+mpmath==1.3.0
+MarkupSafe==2.1.5
+Jinja2==3.1.4
+wheel==0.45.1
+asttokens==3.0.0
+debugpy==1.8.13
+decorator==5.2.1
+exceptiongroup==1.2.2
+executing==2.1.0
+nest_asyncio==1.6.0
+packaging==24.2
+parso==0.8.4
+pickleshare==0.7.5
+platformdirs==4.3.6
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.1
+setuptools==75.8.2
+six==1.17.0
+tornado==6.4.2
+traitlets==5.14.3
+typing_extensions==4.12.2
+wcwidth==0.2.13
+zipp==3.21.0
+comm==0.2.2
+importlib_metadata==8.6.1
+jedi==0.19.2
+jupyter_core==5.7.2
+matplotlib-inline==0.1.7
+pexpect==4.9.0
+pip==25.0.1
+prompt_toolkit==3.0.50
+python-dateutil==2.9.0.post0
+pyzmq==26.2.1
+stack_data==0.6.3
+ipython==8.33.0
+jupyter_client==8.6.3
+ipykernel==6.29.5
+pytz==2025.1
+lit==18.1.8
+xxhash==3.5.0
+urllib3==2.3.0
+tzdata==2025.1
+tqdm==4.67.1
+smmap==5.0.2
+setproctitle==1.3.5
+regex==2024.11.6
+PyYAML==6.0.2
+pydantic_core==2.27.2
+pyarrow==19.0.1
+protobuf==5.29.3
+propcache==0.3.0
+nvidia-nvtx-cu11==11.7.91
+nvidia-nccl-cu11==2.14.3
+nvidia-curand-cu11==10.2.10.91
+nvidia-cufft-cu11==10.9.0.58
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cublas-cu11==11.10.3.66
+numpy==1.26.4
+networkx==3.4.2
+multidict==6.1.0
+idna==3.10
+fsspec==2024.9.0
+frozenlist==1.5.0
+filelock==3.17.0
+docker-pycreds==0.4.0
+dill==0.3.8
+cmake==3.31.6
+click==8.1.8
+charset-normalizer==3.4.1
+certifi==2025.1.31
+attrs==25.1.0
+async-timeout==5.0.1
+annotated-types==0.7.0
+aiohappyeyeballs==2.4.8
+yarl==1.18.3
+sentry-sdk==2.22.0
+requests==2.32.3
+pydantic==2.10.6
+pandas==2.2.3
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cudnn-cu11==8.5.0.96
+multiprocess==0.70.16
+gitdb==4.0.12
+aiosignal==1.3.2
+tiktoken==0.8.0
+GitPython==3.1.44
+aiohttp==3.11.13
+wandb==0.19.1
+datasets==3.1.0
+nvidia-cusparse-cu11==11.7.5.86
+triton==3.2.0
+nvidia-cusparselt-cu12==0.6.2
+sympy==1.13.1
+nvidia-nvtx-cu12==12.4.127
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nccl-cu12==2.21.5
+nvidia-curand-cu12==10.3.5.147
+nvidia-cufft-cu12==11.2.1.3
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cusolver-cu12==11.6.1.9
+torch==2.6.0
+jmespath==1.0.1
+botocore==1.37.8
+s3transfer==0.11.4
+boto3==1.37.8
+asciitree==0.3.3
+numcodecs==0.13.1
+fasteners==0.19
+zarr==2.18.3
+widgetsnbextension==4.0.13
+jupyterlab_widgets==3.0.13
+ipywidgets==8.1.5
+pyparsing==3.2.2
+pillow==11.1.0
+kiwisolver==1.4.8
+fonttools==4.56.0
+cycler==0.12.1
+contourpy==1.3.1
+matplotlib==3.10.1
+safetensors==0.5.3
+torchvision==0.21.0
+timm==1.0.15
+word2number==1.1
+sqlitedict==2.1.0
+zstandard==0.23.0
+threadpoolctl==3.6.0
+tcolorpy==0.1.7
+tabulate==0.9.0
+scipy==1.15.2
+pybind11==2.13.6
+portalocker==3.1.1
+pathvalidate==3.2.3
+numexpr==2.10.2
+more-itertools==10.6.0
+lxml==5.3.2
+jsonlines==4.0.0
+joblib==1.4.2
+colorama==0.4.6
+chardet==5.2.0
+absl-py==2.2.2
+tqdm-multiprocess==0.0.11
+scikit-learn==1.6.1
+sacrebleu==2.5.1
+nltk==3.9.1
+mbstrdecoder==1.1.4
+huggingface-hub==0.30.1
+typepy==1.3.4
+tokenizers==0.21.1
+rouge_score==0.1.2
+transformers==4.51.0
+accelerate==1.6.0
+peft==0.15.1
+DataProperty==1.1.0
+tabledata==1.3.4
+evaluate==0.4.3
+pytablewriter==1.2.1
+lm_eval==0.4.8
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.2
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
wandb/run-20250410_080940-pqshro55/files/wandb-metadata.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.16",
+  "startedAt": "2025-04-10T08:09:40.539738Z",
+  "args": [
+    "--model",
+    "hf",
+    "--model_args",
+    "pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
+    "--tasks",
+    "commonsense_qa,openbookqa,hellaswag,lambada,sciq",
+    "--device",
+    "cuda:0",
+    "--batch_size",
+    "32",
+    "--output_path",
+    "outputs/evaluation/base_GPT",
+    "--wandb_args",
+    "project=lm-evaluation,name=base_GPT_intial_weights",
+    "--log_samples"
+  ],
+  "program": "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval",
+  "git": {
+    "remote": "https://github.com/robinfaro/time-moe.git",
+    "commit": "209a56c7746e576430987b33efaad3213c829355"
+  },
+  "email": "robin.faro@epfl.ch",
+  "root": "/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling",
+  "host": "interact-0-0",
+  "executable": "/mloscratch/homes/faro/conda/envs/thesis/bin/python3.10",
+  "cpu_count": 36,
+  "cpu_count_logical": 72,
+  "gpu": "Tesla V100-SXM2-32GB",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "6399114346496",
+      "used": "4521100247040"
+    }
+  },
+  "memory": {
+    "total": "404270809088"
+  },
+  "cpu": {
+    "count": 36,
+    "countLogical": 72
+  },
+  "gpu_nvidia": [
+    {
+      "name": "Tesla V100-SXM2-32GB",
+      "memoryTotal": "34359738368",
+      "cudaCores": 5120,
+      "architecture": "Volta"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
wandb/run-20250410_080940-pqshro55/logs/debug-core.log
ADDED
@@ -0,0 +1,8 @@
+{"time":"2025-04-10T08:09:40.025636927Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp6h_8vy1g/port-18854.txt","pid":18854,"debug":false,"disable-analytics":false}
+{"time":"2025-04-10T08:09:40.025665392Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2025-04-10T08:09:40.026203437Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":18854}
+{"time":"2025-04-10T08:09:40.026201522Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36677,"Zone":""}}
+{"time":"2025-04-10T08:09:40.209295737Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:40920"}
+{"time":"2025-04-10T08:09:40.542511639Z","level":"INFO","msg":"handleInformInit: received","streamId":"pqshro55","id":"127.0.0.1:40920"}
+{"time":"2025-04-10T08:09:40.655328707Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"pqshro55","id":"127.0.0.1:40920"}
+{"time":"2025-04-10T08:12:11.07420118Z","level":"INFO","msg":"Parent process exited, terminating service process."}
wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
ADDED
@@ -0,0 +1,8 @@
+{"time":"2025-04-10T08:09:40.545238228Z","level":"INFO","msg":"using version","core version":"0.19.1"}
+{"time":"2025-04-10T08:09:40.545267808Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-core.log"}
+{"time":"2025-04-10T08:09:40.655286034Z","level":"INFO","msg":"created new stream","id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655321987Z","level":"INFO","msg":"stream: started","id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655358183Z","level":"INFO","msg":"sender: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655363065Z","level":"INFO","msg":"writer: Do: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655386983Z","level":"INFO","msg":"handler: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.92678924Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250410_080940-pqshro55/logs/debug.log
ADDED
@@ -0,0 +1,22 @@
+2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
+2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Configure stats pid to 18854
+2025-04-10 08:09:40,529 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug.log
+2025-04-10 08:09:40,531 INFO MainThread:18854 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():644] calling init triggers
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
+config: {}
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():680] starting backend
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():684] sending inform_init request
+2025-04-10 08:09:40,538 INFO MainThread:18854 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-10 08:09:40,539 INFO MainThread:18854 [wandb_init.py:init():697] backend started and connected
+2025-04-10 08:09:40,540 INFO MainThread:18854 [wandb_init.py:init():790] updated telemetry
+2025-04-10 08:09:40,553 INFO MainThread:18854 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
+2025-04-10 08:09:40,912 INFO MainThread:18854 [wandb_init.py:init():874] starting run threads in backend
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_console_start():2374] atexit reg
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2224] redirect: wrap_raw
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2289] Wrapping output streams.
+2025-04-10 08:09:41,235 INFO MainThread:18854 [wandb_run.py:_redirect():2314] Redirects installed.
+2025-04-10 08:09:41,238 INFO MainThread:18854 [wandb_init.py:init():916] run started, returning control to user process
wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f21ede142e38539e967b0af5849784cab3e5a00323d4038d8d9a0921dd277b3e
+size 262144