robinfaro committed on
Commit 2966e5f · verified · 1 Parent(s): 91636eb

Upload custom config and model files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,10 @@
+---
+tags:
+- model_hub_mixin
+- pytorch_model_hub_mixin
+---
+
+This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+- Code: [More Information Needed]
+- Paper: [More Information Needed]
+- Docs: [More Information Needed]
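
For context, a minimal sketch (not part of the commit) of the `PyTorchModelHubMixin` flow the card refers to; `MyModel` and `"user/my-model"` are hypothetical names used only for illustration:

```python
# Minimal sketch of the PyTorchModelHubMixin save/push flow (hypothetical model).
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin


class MyModel(nn.Module, PyTorchModelHubMixin):
    def __init__(self, hidden: int = 8):
        super().__init__()
        self.fc = nn.Linear(hidden, hidden)


model = MyModel(hidden=8)
model.save_pretrained("my-model")      # writes config.json + model.safetensors
# model.push_to_hub("user/my-model")   # uploads files, producing commits like this one
```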
__init__.py ADDED
@@ -0,0 +1,2 @@
+from .configuration import MoEGPTConfig
+from .modeling import MoEGPTForCausalLM
aux_losses.py ADDED
@@ -0,0 +1,88 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def log_mean(x, dim):
+    return torch.logsumexp(x, dim=dim) - torch.log(
+        torch.tensor(x.shape[dim], dtype=torch.float32)
+    )
+
+
+def entropy_reg(logits: torch.Tensor, mean_over_batch: bool = True):
+    """Entropy regularization for the router."""
+
+    entropy_l = lambda l: -(l * l.exp()).sum(-1)
+    # softmax over experts
+    # logits: [batch_size * sequence_length, num_experts]
+    logprobs = F.log_softmax(logits, dim=-1)
+    if mean_over_batch:
+        # take mean probability over batch
+        logprobs = log_mean(logprobs, 0)
+
+    return -entropy_l(logprobs).mean()
+
+
+# the two losses below are adapted from
+# https://github.com/google/flaxformer/blob/b725bd2a51d70e866d819c92de166fbf24425e6a/flaxformer/architectures/moe/routing.py
+def load_balancing_loss(logits: torch.Tensor, expert_indices: torch.Tensor) -> float:
+    """Computes the auxiliary load balancing loss as in Switch Transformer.
+
+    See Switch Transformer (https://arxiv.org/abs/2101.03961). This function
+    implements the loss function presented in equations (4) - (6). It aims to
+    penalize those cases where the routing between experts is unbalanced.
+
+    Args:
+      logits: logits assigned to each expert per token. Shape:
+        <float32>[batch_size * sequence_length, num_experts].
+      expert_indices: <int>[batch_size * sequence_length, num_selected_experts]
+        indices identifying the top num_selected_experts for a given token.
+
+    Returns:
+      The auxiliary loss.
+    """
+    # num_token = batch_size * sequence_length
+    num_token, num_experts = logits.shape
+
+    # Shape: [batch_size * sequence_length, num_selected_experts, num_experts].
+    expert_mask = F.one_hot(expert_indices, num_experts)
+    # For a given token, determine if it was routed to a given expert.
+    # Shape: [batch_size * sequence_length, num_experts]
+    expert_mask, _ = torch.max(expert_mask, dim=-2)
+
+    # shape [num_experts]
+    tokens_per_expert = torch.mean(expert_mask, dim=0, dtype=torch.float32)
+
+    # compute router probability per expert in log space for numerical stability
+    logprobs = F.log_softmax(logits, dim=-1)
+    # take mean probability over batch
+    # shape [num_experts]
+    logprobs = log_mean(logprobs, dim=0)
+    router_prob_per_expert = torch.exp(logprobs)
+    return (
+        torch.mean(  # mean over experts
+            tokens_per_expert * router_prob_per_expert,
+            dtype=torch.float32,
+        )
+        * num_experts
+    )
+
+
+def router_z_loss(router_logits: torch.Tensor) -> float:
+    """Compute the router z-loss.
+
+    The router z-loss was introduced in Designing Effective Sparse Expert Models
+    (https://arxiv.org/abs/2202.08906). It encourages router logits to remain
+    small in an effort to improve stability.
+
+    Args:
+      router_logits: <float>[batch_size * sequence_length, num_experts]
+        router logits.
+
+    Returns:
+      Scalar router z-loss.
+    """
+    num_tokens, _ = router_logits.shape
+    log_z = torch.logsumexp(router_logits, dim=-1)
+    z_loss = log_z**2
+    return torch.sum(z_loss, dtype=torch.float32) / num_tokens
+
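
A minimal sketch (not part of the commit) of how these three losses fit together; the shapes follow the docstrings above, and the two weighting factors are the `moe_aux_loss_factor` / `moe_z_loss_factor` keys in `config.json` below:

```python
# Sketch: exercising the three router losses on random logits; sizes are hypothetical.
import torch
from aux_losses import entropy_reg, load_balancing_loss, router_z_loss

num_tokens, num_experts, top_k = 8, 6, 2
logits = torch.randn(num_tokens, num_experts)          # router logits per token
_, selected_experts = torch.topk(logits, top_k)        # top-k expert indices per token

aux = load_balancing_loss(logits, selected_experts)    # Switch-style balance penalty
z = router_z_loss(logits)                              # penalizes large router logits
ent = entropy_reg(logits)                              # negative entropy of the batch-mean routing distribution

# the "load_balancing_z_loss" setting in config.json weights the first two, per layer:
total_aux = 0.01 * aux + 1.0 * z   # moe_aux_loss_factor, moe_z_loss_factor
```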
config.json ADDED
@@ -0,0 +1,89 @@
+{
+  "return_dict": true,
+  "output_hidden_states": false,
+  "output_attentions": false,
+  "torchscript": false,
+  "torch_dtype": null,
+  "use_bfloat16": false,
+  "tf_legacy_loss": false,
+  "pruned_heads": {},
+  "tie_word_embeddings": true,
+  "chunk_size_feed_forward": 0,
+  "is_encoder_decoder": false,
+  "is_decoder": false,
+  "cross_attention_hidden_size": null,
+  "add_cross_attention": false,
+  "tie_encoder_decoder": false,
+  "max_length": 20,
+  "min_length": 0,
+  "do_sample": false,
+  "early_stopping": false,
+  "num_beams": 1,
+  "num_beam_groups": 1,
+  "diversity_penalty": 0.0,
+  "temperature": 1.0,
+  "top_k": 50,
+  "top_p": 1.0,
+  "typical_p": 1.0,
+  "repetition_penalty": 1.0,
+  "length_penalty": 1.0,
+  "no_repeat_ngram_size": 0,
+  "encoder_no_repeat_ngram_size": 0,
+  "bad_words_ids": null,
+  "num_return_sequences": 1,
+  "output_scores": false,
+  "return_dict_in_generate": false,
+  "forced_bos_token_id": null,
+  "forced_eos_token_id": null,
+  "remove_invalid_values": false,
+  "exponential_decay_length_penalty": null,
+  "suppress_tokens": null,
+  "begin_suppress_tokens": null,
+  "architectures": [
+    "MoEGPTForCausalLM"
+  ],
+  "finetuning_task": null,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1"
+  },
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1
+  },
+  "tokenizer_class": null,
+  "prefix": null,
+  "bos_token_id": null,
+  "pad_token_id": null,
+  "eos_token_id": null,
+  "sep_token_id": null,
+  "decoder_start_token_id": null,
+  "task_specific_params": null,
+  "problem_type": null,
+  "_name_or_path": "",
+  "_attn_implementation_autoset": false,
+  "transformers_version": "4.51.0",
+  "batch_size": 16,
+  "vocab_size": 50304,
+  "n_embd": 768,
+  "n_layer": 12,
+  "n_head": 12,
+  "sequence_length": 1024,
+  "moe": true,
+  "moe_routing": "standard_gating",
+  "moe_num_experts": 6,
+  "moe_num_experts_per_tok": 2,
+  "moe_softmax_order": "softmax_topk",
+  "moe_router_loss": "load_balancing_z_loss",
+  "moe_aux_loss_factor": 0.01,
+  "moe_z_loss_factor": 1.0,
+  "mlp_dim_exp_factor": 1.0,
+  "dropout": 0.0,
+  "bias": false,
+  "auto_map": {
+    "AutoConfig": "configuration.MoEGPTConfig",
+    "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
+    "AutoTokenizer": "GPT2TokenizerFast"
+  },
+  "model_type": "moegpt"
+}
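
Because `config.json` carries an `auto_map`, loading this checkpoint goes through the custom classes shipped in this commit. A sketch of the load (the repo id is a placeholder; `trust_remote_code=True` is required so that `configuration.MoEGPTConfig` and `modeling.MoEGPTForCausalLM` are fetched and used, as the evaluation logs further down also do):

```python
# Sketch: loading a checkpoint whose config.json has an "auto_map", as above.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "<user>/<repo>"  # placeholder
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type, config.moe, config.moe_num_experts)  # moegpt True 6

model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)  # auto_map -> GPT2TokenizerFast
```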
configuration.py ADDED
@@ -0,0 +1,51 @@
+from transformers import PretrainedConfig
+
+
+class MoEGPTConfig(PretrainedConfig):
+    model_type = "moegpt"
+
+    def __init__(
+        self,
+        vocab_size=50304,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        sequence_length=1024,
+        moe=False,
+        moe_routing="standard_gating",
+        moe_num_experts=4,
+        moe_num_experts_per_tok=2,
+        moe_softmax_order="softmax_topk",
+        moe_router_loss="load_balancing_z_loss",
+        moe_aux_loss_factor=0.01,
+        moe_z_loss_factor=1.0,
+        mlp_dim_exp_factor=1.0,
+        dropout=0.0,
+        bias=False,
+        architectures=["MoEGPTForCausalLM"],
+        auto_map={
+            "AutoConfig": "configuration.MoEGPTConfig",
+            "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
+            "AutoTokenizer": "GPT2TokenizerFast",
+        },
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.sequence_length = sequence_length
+        self.moe = moe
+        self.moe_routing = moe_routing
+        self.moe_num_experts = moe_num_experts
+        self.moe_num_experts_per_tok = moe_num_experts_per_tok
+        self.moe_softmax_order = moe_softmax_order
+        self.moe_router_loss = moe_router_loss
+        self.moe_aux_loss_factor = moe_aux_loss_factor
+        self.moe_z_loss_factor = moe_z_loss_factor
+        self.mlp_dim_exp_factor = mlp_dim_exp_factor
+        self.dropout = dropout
+        self.bias = bias
+        self.architectures = architectures
+        self.auto_map = auto_map
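
A sketch (not part of the commit) of constructing the config directly; the keyword names come from `__init__` above, and the values mirror `config.json` in this commit:

```python
# Sketch: instantiating MoEGPTConfig with the values stored in config.json above.
from configuration import MoEGPTConfig

config = MoEGPTConfig(
    moe=True,
    moe_num_experts=6,
    moe_num_experts_per_tok=2,
    moe_router_loss="load_balancing_z_loss",
)
assert config.model_type == "moegpt"
config.save_pretrained("out")  # writes an out/config.json like the one above
```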
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling.py ADDED
@@ -0,0 +1,465 @@
+from transformers import PreTrainedModel
+from .configuration import MoEGPTConfig
+# also import MoE, MaskedMoE, TimeDependantMoE, etc.
+import math
+import inspect
+from typing import Optional, Dict, Any
+from dataclasses import dataclass
+import tiktoken
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from huggingface_hub import PyTorchModelHubMixin
+from transformers.utils import ModelOutput
+
+
+from .moe import (
+    # ExpertChoiceMoE,
+    MaskedMoE,
+    TimeDependantMoE,
+    MoE,
+)
+
+from .aux_losses import (
+    entropy_reg,
+    load_balancing_loss,
+    router_z_loss,
+)
+
+
+# class Output(ModelOutput):
+#     def __init__(self, logits, loss=None, aux_losses=None, router_logits=None):
+#         self.logits = logits
+#         self.loss = loss
+#         self.aux_losses = aux_losses
+#         self.router_logits = router_logits
+@dataclass
+class Output(ModelOutput):
+    logits: torch.FloatTensor = None
+    loss: Optional[torch.FloatTensor] = None
+    aux_losses: Optional[Dict[str, torch.FloatTensor]] = None
+    router_logits: Optional[torch.FloatTensor] = None
+
+    def __repr__(self):
+        return f"Output(logits={self.logits}, loss={self.loss}, aux_losses={self.aux_losses}, router_logits={self.router_logits})"
+
+
+class LayerNorm(nn.Module):
+    """LayerNorm, but with an optional bias. PyTorch doesn't support simply bias=False."""
+
+    def __init__(self, ndim, bias):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(ndim))
+        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+    def forward(self, input):
+        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        # regularization
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.resid_dropout = nn.Dropout(config.dropout)
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.dropout = config.dropout
+        # flash attention makes the GPU go brrrrr, but support is only in PyTorch >= 2.0
+        self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
+        if not self.flash:
+            print(
+                "WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0"
+            )
+            # causal mask to ensure that attention is only applied to the left in the input sequence
+            self.register_buffer(
+                "bias",
+                torch.tril(
+                    torch.ones(config.sequence_length, config.sequence_length)
+                ).view(1, 1, config.sequence_length, config.sequence_length),
+            )
+
+    def forward(self, x):
+        # batch size, sequence length, embedding dimensionality (n_embd)
+        B, T, C = x.size()
+
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        # (B, T, nh, hs) -> (B, nh, T, hs)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+
+        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+        if self.flash:
+            # efficient attention using Flash Attention CUDA kernels
+            y = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True
+            )
+        else:
+            # manual implementation of attention
+            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
+            att = F.softmax(att, dim=-1)
+            att = self.attn_dropout(att)
+            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+        y = (
+            y.transpose(1, 2).contiguous().view(B, T, C)
+        )  # re-assemble all head outputs side by side
+
+        # output projection
+        y = self.resid_dropout(self.c_proj(y))
+        return y
+
+
+class MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dim_exp_factor = int(config.mlp_dim_exp_factor * 4)
+
+        self.c_fc = nn.Linear(
+            config.n_embd, self.dim_exp_factor * config.n_embd, bias=config.bias
+        )
+        self.c_proj = nn.Linear(
+            self.dim_exp_factor * config.n_embd, config.n_embd, bias=config.bias
+        )
+        self.dropout = nn.Dropout(config.dropout)
+        self.activation = nn.GELU()
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.activation(x)
+        x = self.c_proj(x)
+        x = self.dropout(x)
+        # need to return the same type as the MoE block, but in this case the dict is empty
+        return x, {}
+
+
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.moe_config = config.moe_routing
+        if config.moe:
+            if config.moe_routing == "standard_gating":
+                self.mlp = MoE(config, MLP)
+            elif config.moe_routing == "masked":
+                self.mlp = TimeDependantMoE(config, MLP)
+            # elif config.moe_routing == "expert_choice":
+            #     self.mlp = ExpertChoiceMoE(config, MLP)
+            else:
+                raise ValueError(f"Unknown routing: {config.moe_routing}")
+        else:
+            self.mlp = MLP(config)
+
+    def forward(self, x, date, *args, **kwargs):
+        x = x + self.attn(self.ln_1(x, *args, **kwargs))
+        if self.moe_config == "masked":
+            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs), date)
+        else:
+            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs))
+        x = x + x_
+        return x, logits_and_experts
+
+
+class MoEGPTForCausalLM(PreTrainedModel):
+    config_class = MoEGPTConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        assert config.vocab_size is not None
+        assert config.sequence_length is not None
+        self.config = config
+        self.tokenizer = tiktoken.get_encoding("gpt2")
+
+        self.transformer = nn.ModuleDict(
+            dict(
+                wte=nn.Embedding(config.vocab_size, config.n_embd),
+                wpe=nn.Embedding(config.sequence_length, config.n_embd),
+                drop=nn.Dropout(config.dropout),
+                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+                ln_f=LayerNorm(config.n_embd, bias=config.bias),
+            )
+        )
+
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # with weight tying, torch.compile() generates some warnings:
+        # "UserWarning: functional_call was passed multiple values for tied weights.
+        # This behavior is deprecated and will be an error in future versions"
+        # not 100% sure what this is; so far it seems to be harmless. TODO investigate
+        self.transformer.wte.weight = (
+            self.lm_head.weight
+        )  # https://paperswithcode.com/method/weight-tying
+
+        # init all weights
+        self.apply(self._init_weights)
+        # apply special scaled init to the residual projections, per the GPT-2 paper
+        for pn, p in self.named_parameters():
+            if pn.endswith("c_proj.weight"):
+                torch.nn.init.normal_(
+                    p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer)
+                )
+            if pn.endswith("router.weight"):
+                # special scaled init for the moe router?
+                with torch.no_grad():
+                    dim = 1 if config.moe_routing == "standard_gating" else 0
+                    std = p.std()
+                    p.div_(p.sum(dim=dim, keepdim=True))
+                    p.mul_(std / p.std())
+
+    def get_router_losses(self, logits, selected_experts, eval=False):
+        # logits: (b * seq_len, n_experts)
+        # selected_experts: (b * seq_len, topk)
+        if eval:  # eval mode, compute all losses
+            return {
+                "moe_entropy_loss": entropy_reg(logits),
+                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
+                "moe_z_loss": router_z_loss(logits),
+            }
+        if self.config.moe_router_loss == "entropy":
+            return {
+                "moe_entropy_loss": entropy_reg(logits),
+            }
+        elif self.config.moe_router_loss == "load_balancing_only":
+            return {
+                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
+            }
+        elif self.config.moe_router_loss == "load_balancing_z_loss":
+            return {
+                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
+                "moe_z_loss": router_z_loss(logits),
+            }
+        return {}
+
+    def get_num_params(self, non_embedding=True):
+        """
+        Return the number of parameters in the model.
+        For the non-embedding count (default), the position embeddings get subtracted.
+        The token embeddings would too, except that due to the parameter sharing these
+        params are actually used as weights in the final layer, so we include them.
+        """
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            n_params -= self.transformer.wpe.weight.numel()
+        return n_params
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+    def forward(self, idx, date=None, targets=None, get_logits=True, moe=False):
+        device = idx.device
+        b, t = idx.size()
+        assert (
+            t <= self.config.sequence_length
+        ), f"Cannot forward sequence of length {t}, block size is only {self.config.sequence_length}"
+        if date is None:
+            # set all the dates to bucket 6
+            date = torch.full((1, b), 6, dtype=torch.long, device=device).squeeze(0)
+        else:
+            date = (date - 2013) // 2 + 1
+            date = torch.full((1, b), date, dtype=torch.long, device=device).squeeze(0)
+        # shape (1, t)
+        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
+
+        # forward the GPT model itself
+        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
+        pos_emb = self.transformer.wpe(
+            pos
+        )  # position embeddings of shape (1, t, n_embd)
+        x = self.transformer.drop(tok_emb + pos_emb)
+
+        # router_logits is a list with each layer's routing logits, each of shape (b * seq_len, n_experts)
+        router_logits = []
+        # experts is a list with each layer's selected experts, shape (b * seq_len, topk)
+        experts = []
+
+        # forward pass through all the transformer blocks
+        for block in self.transformer.h:
+            x, logits_and_experts = block(x, date)
+            if len(logits_and_experts) > 0:
+                router_logits.append(logits_and_experts["router_logits"])
+                experts.append(logits_and_experts["selected_experts"])
+        x = self.transformer.ln_f(x)
+
+        # aux_losses is a dict with keys for the different auxiliary losses
+        aux_losses = {}
+
+        if targets is not None:
+            # if we are given some desired targets, also calculate the loss
+            logits = self.lm_head(x)
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
+            )
+            if moe and (
+                self.config.moe_routing == "standard_gating"
+                or self.config.moe_routing == "masked"
+            ):
+                # calculate the router losses per layer
+                for logit, expert_choice in zip(router_logits, experts):
+                    router_losses = self.get_router_losses(
+                        logit, expert_choice, eval=not self.training
+                    )
+                    for k, v in router_losses.items():
+                        aux_losses[k] = aux_losses.get(k, 0.0) + v
+                        if self.training:
+                            loss += (
+                                v
+                                * getattr(self.config, k + "_factor")
+                                / self.config.n_layer
+                            )
+        else:
+            # an inference-time mini-optimization would be to only forward the lm_head
+            # on the very last position (x[:, [-1], :], using list [-1] to preserve the
+            # time dim); here the full sequence is forwarded instead
+            logits = self.lm_head(
+                # x[:, [-1], :]
+                x
+            )
+            loss = None
+        logits = logits if get_logits else None
+        router_logits = (
+            torch.stack(router_logits, dim=0) if len(router_logits) > 0 else None
+        )
+        # return {
+        #     "logits": logits,
+        #     "loss": loss,
+        #     "aux_losses": aux_losses,
+        #     "router_logits": router_logits,
+        # }
+        return Output(
+            logits=logits, loss=loss, aux_losses=aux_losses, router_logits=router_logits
+        )
+
+    def crop_sequence_length(self, sequence_length):
+        # model surgery to decrease the block size if necessary
+        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
+        # but want to use a smaller block size for some smaller, simpler model
+        assert sequence_length <= self.config.sequence_length
+        self.config.sequence_length = sequence_length
+        self.transformer.wpe.weight = nn.Parameter(
+            self.transformer.wpe.weight[:sequence_length]
+        )
+        for block in self.transformer.h:
+            block.attn.bias = block.attn.bias[:, :, :sequence_length, :sequence_length]
+
+    def get_parameter_group_specs(self):
+        """
+        This long function is unfortunately doing something very simple and is being very defensive:
+        we are separating out all parameters of the model into two buckets: those that will experience
+        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+        We then return the parameter groups for the PyTorch optimizer.
+        """
+
+        # separate out all parameters into those that will and won't experience regularizing weight decay
+        decay = set()
+        no_decay = set()
+        whitelist_weight_modules = (torch.nn.Linear,)
+
+        BLACKLIST_WEIGHT_MODULES = (
+            torch.nn.LayerNorm,
+            LayerNorm,
+            torch.nn.Embedding,
+        )
+
+        for mn, m in self.named_modules():
+            for pn, p in m.named_parameters():
+                fpn = "%s.%s" % (mn, pn) if mn else pn  # full param name
+                # random note: because named_modules and named_parameters are recursive
+                # we will see the same tensors p many many times. but doing it this way
+                # allows us to know which parent module any tensor p belongs to...
+                if pn.endswith("bias"):
+                    # all biases will not be decayed
+                    no_decay.add(fpn)
+                elif pn.endswith("weight") and isinstance(m, whitelist_weight_modules):
+                    # weights of whitelist modules will be weight decayed
+                    decay.add(fpn)
+                elif pn.endswith("weight") and isinstance(m, BLACKLIST_WEIGHT_MODULES):
+                    # weights of blacklist modules will NOT be weight decayed
+                    no_decay.add(fpn)
+
+        # subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they
+        # will appear in the no_decay and decay sets respectively after the above.
+        # In addition, because named_parameters() doesn't return duplicates, it
+        # will only return the first occurrence, keyed by 'transformer.wte.weight', below.
+        # so let's manually remove 'lm_head.weight' from the decay set. This will include
+        # this tensor in the optimization via transformer.wte.weight only, and not decayed.
+        decay.remove("lm_head.weight")
+
+        # validate that we considered every parameter
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        inter_params = decay & no_decay
+        union_params = decay | no_decay
+        assert (
+            len(inter_params) == 0
+        ), "parameters %s made it into both decay/no_decay sets!" % (str(inter_params),)
+        assert (
+            len(param_dict.keys() - union_params) == 0
+        ), "parameters %s were not separated into either decay/no_decay set!" % (
+            str(param_dict.keys() - union_params),
+        )
+
+        # create the parameter groups for the pytorch optimizer
+        return [
+            {"params": sorted(list(decay))},
+            {"params": sorted(list(no_decay)), "weight_decay": 0.0},
+        ]
+
+    @torch.no_grad()
+    def generate(self, input_ids, max_new_tokens, date=None, temperature=1.0, top_k=None):
+        """
+        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
+        the sequence max_new_tokens times, feeding the predictions back into the model each time.
+        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
+        """
+        idx = input_ids
+        for _ in range(max_new_tokens):
+            # if the sequence context is growing too long, we must crop it at sequence_length
+            idx_cond = (
+                idx
+                if idx.size(1) <= self.config.sequence_length
+                else idx[:, -self.config.sequence_length :]
+            )
+            # forward the model to get the logits for the index in the sequence
+            logits = self(idx_cond, date, get_logits=True).logits
+            # pluck the logits at the final step and scale by the desired temperature
+            logits = logits[:, -1, :] / temperature
+            # optionally crop the logits to only the top k options
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float("Inf")
+            # apply softmax to convert logits to (normalized) probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution
+            idx_next = torch.multinomial(probs, num_samples=1)
+            # append the sampled index to the running sequence and continue
+            idx = torch.cat((idx, idx_next), dim=1)
+            # check if we hit the end of the sequence
+            if idx_next.item() == self.tokenizer.eot_token:
+                break
+
+        return idx
+
+    @torch.no_grad()
+    def generate_from_string(self, in_str, max_new_tokens, date=None, temperature=1.0, top_k=None):
+        idx = (
+            torch.tensor(
+                self.tokenizer.encode(in_str, allowed_special={"<|endoftext|>"})
+            )
+            .view(1, -1)
+            .to(self.lm_head.weight.device)
+        )
+        out_idx = (
+            self.generate(idx, max_new_tokens, date, temperature, top_k)
+            .view(-1)
+            .to("cpu")
+            .numpy()
+        )
+        return self.tokenizer.decode(out_idx).split(in_str)[-1]
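
A sketch (not part of the commit) of end-to-end generation with the custom class, assuming the repo directory is importable as a package (here hypothetically named `moegpt`, matching `__init__.py`). `forward()` buckets a calendar year via `(date - 2013) // 2 + 1`; `date=None` uses bucket 6 for every token. The weights below are freshly initialized, so the output text will be random:

```python
# Sketch: generation with MoEGPTForCausalLM; "moegpt" is a hypothetical package name.
from moegpt import MoEGPTConfig, MoEGPTForCausalLM

config = MoEGPTConfig(moe=True, moe_num_experts=6, moe_num_experts_per_tok=2)
model = MoEGPTForCausalLM(config).eval()

completion = model.generate_from_string(
    "The meaning of life is", max_new_tokens=20, date=2021, temperature=0.8, top_k=50
)
print(completion)
```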
moe.py ADDED
@@ -0,0 +1,134 @@
+"""
+Simple MoE routing implementations that replace the MLP block in a standard transformer.
+References:
+1) Mistral Source for Mixtral MoEs:
+https://github.com/mistralai/mistral-src
+2) ST-MoE:
+https://arxiv.org/abs/2202.08906
+3) Our notepad of MoE resources:
+https://docs.google.com/document/d/1NuQ5jr7V-Jv1ui7p4KrxO_JTz-7bpYcYMmh49EeJ-QA/edit?usp=sharing
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import bisect
+
+
+class MoE(nn.Module):
+    """
+    Simplest MoE implementation with a linear router and softmax over experts.
+
+    Note that in this implementation, we simply loop over the experts and
+    aggregate the results. This is not the most efficient way to do it, but
+    it also avoids the large memory overhead _and_ has no token dropping
+    (because we do not need the capacity factor).
+    """
+
+    def __init__(self, config, mlp):
+        super().__init__()
+        assert config.moe_num_experts > 0
+        self.experts = nn.ModuleList(
+            [mlp(config=config) for _ in range(config.moe_num_experts)]
+        )
+        self.router = nn.Linear(config.n_embd, config.moe_num_experts, bias=False)
+        self.top_k = config.moe_num_experts_per_tok
+        self.softmax_order = config.moe_softmax_order
+
+    def forward(self, inputs: torch.Tensor):
+        # [batch_size * sequence_length, n_embd]
+        inputs_squashed = inputs.view(-1, inputs.shape[-1])
+        # [batch_size * sequence_length, num_experts]
+        router_logits = self.router(inputs_squashed)
+
+        # note that the selected experts will be the same for both orders:
+        # softmax doesn't change the top-k, but the weights are different
+        if self.softmax_order == "softmax_topk":
+            all_probs = F.softmax(router_logits, dim=1)
+            weights, selected_experts = torch.topk(all_probs, self.top_k)
+        elif self.softmax_order == "topk_softmax":
+            weights, selected_experts = torch.topk(router_logits, self.top_k)
+            weights = F.softmax(weights, dim=-1)
+        else:
+            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")
+
+        results = torch.zeros_like(inputs_squashed)
+        # naive looping over experts
+        for i, expert in enumerate(self.experts):
+            batch_idx, nth_expert = torch.where(selected_experts == i)
+            output, _ = expert(inputs_squashed[batch_idx])
+            results[batch_idx] += weights[batch_idx, nth_expert, None] * output
+
+        # return the results and router logits (for the aux loss calculation later)
+        return results.view_as(inputs), {
+            "router_logits": router_logits,
+            "selected_experts": selected_experts,
+        }
+
+
+class DummyExpert(nn.Module):
+    def __init__(self, output_size: int):
+        super().__init__()
+        self._output_size = output_size
+
+    def forward(self, inputs: torch.Tensor):
+        # a zero output that broadcasts against the per-token routing weights
+        out = torch.zeros((self._output_size,), device=inputs.device)
+        return out, {}
+
+
+class MaskedMoE(MoE):
+    def __init__(self, config, mlp):
+        super().__init__(config, mlp)
+        self._sequence_length = config.sequence_length
+        self.experts.append(DummyExpert(config.n_embd))
+        self.router = nn.Linear(config.n_embd, config.moe_num_experts + 1, bias=False)
+
+    def forward(self, inputs: torch.Tensor, mask: torch.Tensor):
+        seq_len = inputs.shape[1]
+        inputs_squashed = inputs.view(-1, inputs.shape[-1])
+        router_logits = self.router(inputs_squashed)
+        # the dummy expert is always available, so append a column of ones to the mask
+        mask = torch.cat(
+            (mask, torch.ones((mask.shape[0], 1), device=mask.device)),
+            dim=1,
+        )
+        mask = mask.repeat_interleave(seq_len, dim=0)
+        router_logits = router_logits * mask
+
+        # note that the selected experts will be the same for both orders:
+        # softmax doesn't change the top-k, but the weights are different
+        if self.softmax_order == "softmax_topk":
+            all_probs = F.softmax(router_logits, dim=1)
+            weights, selected_experts = torch.topk(all_probs, self.top_k)
+        elif self.softmax_order == "topk_softmax":
+            weights, selected_experts = torch.topk(router_logits, self.top_k)
+            weights = F.softmax(weights, dim=-1)
+        else:
+            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")
+
+        results = torch.zeros_like(inputs_squashed)
+        # naive looping over experts
+        for i, expert in enumerate(self.experts):
+            batch_idx, nth_expert = torch.where(selected_experts == i)
+            output, _ = expert(inputs_squashed[batch_idx])
+            results[batch_idx] += weights[batch_idx, nth_expert, None] * output
+
+        # return the results and router logits (for the aux loss calculation later)
+        return results.view_as(inputs), {
+            "router_logits": router_logits,
+            "selected_experts": selected_experts,
+        }
+
+
+class TimeDependantMoE(nn.Module):
+    def __init__(self, config, mlp):
+        super().__init__()
+        self._num_experts = config.moe_num_experts
+        self._mask_moe = MaskedMoE(config, mlp)
+
+    def forward(self, x, date):
+        # expert i is available only when i < date bucket, i.e. later dates unlock more experts
+        range_tensor = torch.arange(self._num_experts).unsqueeze(0).to(x.device)
+        mask_date = (range_tensor < date.unsqueeze(1)).float()
+        return self._mask_moe(x, mask_date)
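
A sketch (not part of the commit) exercising `MoE` standalone. `SimpleNamespace` stands in for `MoEGPTConfig`, and `TinyExpert` is a hypothetical stand-in that mimics the `(output, dict)` return contract of the `MLP` class in `modeling.py`:

```python
# Sketch: routing shapes of the MoE layer with a tiny hypothetical expert.
from types import SimpleNamespace
import torch
import torch.nn as nn
from moe import MoE


class TinyExpert(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc = nn.Linear(config.n_embd, config.n_embd, bias=False)

    def forward(self, x):
        return self.fc(x), {}  # same (output, dict) contract as MLP in modeling.py


config = SimpleNamespace(
    n_embd=16, moe_num_experts=4, moe_num_experts_per_tok=2,
    moe_softmax_order="softmax_topk",
)
layer = MoE(config, TinyExpert)
x = torch.randn(2, 8, 16)                 # (batch, seq_len, n_embd)
y, routing = layer(x)
print(y.shape)                            # torch.Size([2, 8, 16])
print(routing["router_logits"].shape)     # (b * seq_len, num_experts) = (16, 4)
print(routing["selected_experts"].shape)  # (b * seq_len, top_k) = (16, 2)
```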
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}
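
The tokenizer files in this commit (`vocab.json`, `merges.txt`, `tokenizer.json`, `tokenizer_config.json`, `special_tokens_map.json`) form a standard GPT-2 BPE tokenizer; note that `modeling.py` separately uses tiktoken's `"gpt2"` encoding for its `generate_from_string` helper. A sketch, assuming a local checkout of the repo:

```python
# Sketch: loading the shipped GPT-2 tokenizer from a local checkout.
from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained(".")  # path to the checked-out repo
ids = tok("<|endoftext|>Hello world").input_ids
print(ids[:3])
```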
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+{"time":"2025-04-10T08:09:40.545238228Z","level":"INFO","msg":"using version","core version":"0.19.1"}
+{"time":"2025-04-10T08:09:40.545267808Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-core.log"}
+{"time":"2025-04-10T08:09:40.655286034Z","level":"INFO","msg":"created new stream","id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655321987Z","level":"INFO","msg":"stream: started","id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655358183Z","level":"INFO","msg":"sender: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655363065Z","level":"INFO","msg":"writer: Do: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655386983Z","level":"INFO","msg":"handler: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.92678924Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log ADDED
@@ -0,0 +1,22 @@
+2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
+2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Configure stats pid to 18854
+2025-04-10 08:09:40,529 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug.log
+2025-04-10 08:09:40,531 INFO MainThread:18854 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():644] calling init triggers
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
+config: {}
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():680] starting backend
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():684] sending inform_init request
+2025-04-10 08:09:40,538 INFO MainThread:18854 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-10 08:09:40,539 INFO MainThread:18854 [wandb_init.py:init():697] backend started and connected
+2025-04-10 08:09:40,540 INFO MainThread:18854 [wandb_init.py:init():790] updated telemetry
+2025-04-10 08:09:40,553 INFO MainThread:18854 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
+2025-04-10 08:09:40,912 INFO MainThread:18854 [wandb_init.py:init():874] starting run threads in backend
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_console_start():2374] atexit reg
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2224] redirect: wrap_raw
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2289] Wrapping output streams.
+2025-04-10 08:09:41,235 INFO MainThread:18854 [wandb_run.py:_redirect():2314] Redirects installed.
+2025-04-10 08:09:41,238 INFO MainThread:18854 [wandb_init.py:init():916] run started, returning control to user process
wandb/run-20250410_080613-kly9kjv7/files/config.yaml ADDED
@@ -0,0 +1,41 @@
+_wandb:
+  value:
+    cli_version: 0.19.1
+    m: []
+    python_version: 3.10.16
+    t:
+      "1":
+        - 1
+        - 5
+        - 11
+        - 41
+        - 49
+        - 51
+        - 53
+        - 55
+        - 71
+        - 98
+        - 100
+      "2":
+        - 1
+        - 5
+        - 11
+        - 41
+        - 49
+        - 51
+        - 53
+        - 55
+        - 71
+        - 98
+        - 100
+      "3":
+        - 13
+        - 23
+        - 55
+      "4": 3.10.16
+      "5": 0.19.1
+      "6": 4.51.0
+      "8":
+        - 5
+      "12": 0.19.1
+      "13": linux-x86_64
wandb/run-20250410_080613-kly9kjv7/files/output.log ADDED
@@ -0,0 +1,76 @@
+2025-04-10:08:06:24 INFO [__main__:422] Selected Tasks: ['commonsense_qa', 'hellaswag', 'lambada', 'openbookqa', 'sciq']
+2025-04-10:08:06:24 INFO [evaluator:180] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+2025-04-10:08:06:24 INFO [evaluator:218] Initializing hf model, with arguments: {'pretrained': 'robinfaro/GPT2-1B-base', 'trust_remote_code': True, 'force_download': True}
+2025-04-10:08:06:24 INFO [models.huggingface:136] Using device 'cuda:0'
+config.json: 100%|████████████| 735/735 [00:00<00:00, 2.95MB/s]
+configuration.py: 100%|████████████| 1.65k/1.65k [00:00<00:00, 5.80MB/s]
+2025-04-10:08:06:25 INFO [models.huggingface:504] Model type cannot be determined. Using default model type 'causal'
+2025-04-10:08:06:26 INFO [models.huggingface:377] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
+config.json: 100%|████████████| 735/735 [00:00<00:00, 1.60MB/s]
+config.json: 100%|████████████| 735/735 [00:00<00:00, 2.83MB/s]
+configuration.py: 100%|████████████| 1.65k/1.65k [00:00<00:00, 3.04MB/s]
+config.json: 100%|████████████| 735/735 [00:00<00:00, 2.81MB/s]
+modeling.py: 100%|████████████| 19.6k/19.6k [00:00<00:00, 31.7MB/s]
+aux_losses.py: 100%|████████████| 3.15k/3.15k [00:00<00:00, 6.84MB/s]
+moe.py: 100%|████████████| 5.39k/5.39k [00:00<00:00, 12.2MB/s]
+model.safetensors: 100%|████████████| 6.23G/6.23G [00:19<00:00, 316MB/s]
+Some weights of MoEGPTForCausalLM were not initialized from the model checkpoint at robinfaro/GPT2-1B-base and are newly initialized: ['transformer.wte.weight']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+generation_config.json: 100%|████████████| 69.0/69.0 [00:00<00:00, 157kB/s]
+Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/commonsense_qa.py'
+2025-04-10:08:07:11 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/commonsense_qa.py'
+Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/.huggingface.yaml'
+2025-04-10:08:07:12 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/.huggingface.yaml'
+Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/dataset_infos.json'
+2025-04-10:08:07:13 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/dataset_infos.json'
+Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/openbookqa.py'
+2025-04-10:08:07:29 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/openbookqa.py'
+Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/.huggingface.yaml'
+2025-04-10:08:07:29 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/.huggingface.yaml'
+Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/dataset_infos.json'
+2025-04-10:08:07:32 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/dataset_infos.json'
+2025-04-10:08:07:38 INFO [api.task:426] Building contexts for sciq on rank 0...
+100%|████████████| 1000/1000 [00:01<00:00, 846.01it/s]
+2025-04-10:08:07:39 INFO [api.task:426] Building contexts for openbookqa on rank 0...
+100%|████████████| 500/500 [00:00<00:00, 2241.62it/s]
+2025-04-10:08:07:39 INFO [api.task:426] Building contexts for lambada_openai on rank 0...
+100%|████████████| 5153/5153 [00:08<00:00, 614.90it/s]
+2025-04-10:08:07:48 INFO [api.task:426] Building contexts for lambada_standard on rank 0...
+100%|████████████| 5153/5153 [00:08<00:00, 612.52it/s]
+2025-04-10:08:07:56 INFO [api.task:426] Building contexts for hellaswag on rank 0...
+100%|████████████| 10042/10042 [00:03<00:00, 2514.18it/s]
+2025-04-10:08:08:01 INFO [api.task:426] Building contexts for commonsense_qa on rank 0...
+100%|████████████| 1221/1221 [00:02<00:00, 574.82it/s]
+2025-04-10:08:08:04 INFO [evaluator:542] Running loglikelihood requests
+Traceback (most recent call last):
+  File "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval", line 8, in <module>
+    sys.exit(cli_evaluate())
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/__main__.py", line 432, in cli_evaluate
+    results = evaluator.simple_evaluate(
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/utils.py", line 439, in _wrapper
+    return fn(*args, **kwargs)
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/evaluator.py", line 333, in simple_evaluate
+    results = evaluate(
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/utils.py", line 439, in _wrapper
+    return fn(*args, **kwargs)
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/evaluator.py", line 553, in evaluate
+    resps = getattr(lm, reqtype)(cloned_reqs)
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/api/model.py", line 378, in loglikelihood
+    context_enc, continuation_enc = self._encode_pair(context, continuation)
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/api/model.py", line 359, in _encode_pair
+    context_enc = self.tok_encode(context)
+  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/models/huggingface.py", line 811, in tok_encode
+    encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
+  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2654, in encode
+    encoded_inputs = self.encode_plus(
+  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3073, in encode_plus
+    return self._encode_plus(
+  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py", line 126, in _encode_plus
+    return super()._encode_plus(*args, **kwargs)
+  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 613, in _encode_plus
+    batched_output = self._batch_encode_plus(
+  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py", line 116, in _batch_encode_plus
+    return super()._batch_encode_plus(*args, **kwargs)
+  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 539, in _batch_encode_plus
+    encodings = self._tokenizer.encode_batch(
+KeyboardInterrupt
wandb/run-20250410_080613-kly9kjv7/files/requirements.txt ADDED
@@ -0,0 +1,208 @@
+wcwidth==0.2.13
+pure_eval==0.2.3
+ptyprocess==0.7.0
+traitlets==5.14.3
+tornado==6.4.1
+pyzmq==26.2.0
+Pygments==2.18.0
+psutil==6.0.0
+prompt_toolkit==3.0.47
+platformdirs==4.3.6
+pexpect==4.9.0
+parso==0.8.4
+nest-asyncio==1.6.0
+executing==2.1.0
+exceptiongroup==1.2.2
+decorator==5.1.1
+debugpy==1.8.5
+matplotlib-inline==0.1.7
+jupyter_core==5.7.2
+jedi==0.19.1
+comm==0.2.2
+asttokens==2.4.1
+stack-data==0.6.3
+jupyter_client==8.6.3
+ipython==8.27.0
+ipykernel==6.29.5
+mpmath==1.3.0
+MarkupSafe==2.1.5
+Jinja2==3.1.4
+wheel==0.45.1
+asttokens==3.0.0
+debugpy==1.8.13
+decorator==5.2.1
+exceptiongroup==1.2.2
+executing==2.1.0
+nest_asyncio==1.6.0
+packaging==24.2
+parso==0.8.4
+pickleshare==0.7.5
+platformdirs==4.3.6
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.1
+setuptools==75.8.2
+six==1.17.0
+tornado==6.4.2
+traitlets==5.14.3
+typing_extensions==4.12.2
+wcwidth==0.2.13
+zipp==3.21.0
+comm==0.2.2
+importlib_metadata==8.6.1
+jedi==0.19.2
+jupyter_core==5.7.2
+matplotlib-inline==0.1.7
+pexpect==4.9.0
+pip==25.0.1
+prompt_toolkit==3.0.50
+python-dateutil==2.9.0.post0
+pyzmq==26.2.1
+stack_data==0.6.3
+ipython==8.33.0
+jupyter_client==8.6.3
+ipykernel==6.29.5
+pytz==2025.1
+lit==18.1.8
+xxhash==3.5.0
+urllib3==2.3.0
+tzdata==2025.1
+tqdm==4.67.1
+smmap==5.0.2
+setproctitle==1.3.5
+regex==2024.11.6
+PyYAML==6.0.2
+pydantic_core==2.27.2
+pyarrow==19.0.1
+protobuf==5.29.3
+propcache==0.3.0
+nvidia-nvtx-cu11==11.7.91
+nvidia-nccl-cu11==2.14.3
+nvidia-curand-cu11==10.2.10.91
+nvidia-cufft-cu11==10.9.0.58
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cublas-cu11==11.10.3.66
+numpy==1.26.4
+networkx==3.4.2
+multidict==6.1.0
+idna==3.10
+fsspec==2024.9.0
+frozenlist==1.5.0
+filelock==3.17.0
+docker-pycreds==0.4.0
+dill==0.3.8
+cmake==3.31.6
+click==8.1.8
+charset-normalizer==3.4.1
+certifi==2025.1.31
+attrs==25.1.0
+async-timeout==5.0.1
+annotated-types==0.7.0
+aiohappyeyeballs==2.4.8
+yarl==1.18.3
+sentry-sdk==2.22.0
+requests==2.32.3
+pydantic==2.10.6
+pandas==2.2.3
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cudnn-cu11==8.5.0.96
+multiprocess==0.70.16
+gitdb==4.0.12
+aiosignal==1.3.2
+tiktoken==0.8.0
+GitPython==3.1.44
+aiohttp==3.11.13
+wandb==0.19.1
+datasets==3.1.0
+nvidia-cusparse-cu11==11.7.5.86
+triton==3.2.0
+nvidia-cusparselt-cu12==0.6.2
+sympy==1.13.1
+nvidia-nvtx-cu12==12.4.127
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nccl-cu12==2.21.5
+nvidia-curand-cu12==10.3.5.147
+nvidia-cufft-cu12==11.2.1.3
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cusolver-cu12==11.6.1.9
+torch==2.6.0
+jmespath==1.0.1
+botocore==1.37.8
+s3transfer==0.11.4
+boto3==1.37.8
+asciitree==0.3.3
+numcodecs==0.13.1
+fasteners==0.19
+zarr==2.18.3
+widgetsnbextension==4.0.13
+jupyterlab_widgets==3.0.13
+ipywidgets==8.1.5
+pyparsing==3.2.2
+pillow==11.1.0
+kiwisolver==1.4.8
+fonttools==4.56.0
+cycler==0.12.1
+contourpy==1.3.1
+matplotlib==3.10.1
+safetensors==0.5.3
+torchvision==0.21.0
+timm==1.0.15
+word2number==1.1
+sqlitedict==2.1.0
+zstandard==0.23.0
+threadpoolctl==3.6.0
+tcolorpy==0.1.7
+tabulate==0.9.0
+scipy==1.15.2
+pybind11==2.13.6
+portalocker==3.1.1
+pathvalidate==3.2.3
+numexpr==2.10.2
+more-itertools==10.6.0
+lxml==5.3.2
+jsonlines==4.0.0
+joblib==1.4.2
+colorama==0.4.6
+chardet==5.2.0
+absl-py==2.2.2
+tqdm-multiprocess==0.0.11
+scikit-learn==1.6.1
+sacrebleu==2.5.1
+nltk==3.9.1
+mbstrdecoder==1.1.4
+huggingface-hub==0.30.1
+typepy==1.3.4
+tokenizers==0.21.1
+rouge_score==0.1.2
+transformers==4.51.0
+accelerate==1.6.0
+peft==0.15.1
+DataProperty==1.1.0
+tabledata==1.3.4
+evaluate==0.4.3
+pytablewriter==1.2.1
+lm_eval==0.4.8
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.2
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
wandb/run-20250410_080613-kly9kjv7/files/wandb-metadata.json ADDED
@@ -0,0 +1,57 @@
+{
+  "os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.16",
+  "startedAt": "2025-04-10T08:06:13.632140Z",
+  "args": [
+    "--model",
+    "hf",
+    "--model_args",
+    "pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
+    "--tasks",
+    "commonsense_qa,openbookqa,hellaswag,lambada,sciq",
+    "--device",
+    "cuda:0",
+    "--batch_size",
+    "32",
+    "--output_path",
+    "outputs/evaluation/base_GPT",
+    "--wandb_args",
+    "project=lm-evaluation,name=base_GPT_intial_weights",
+    "--log_samples"
+  ],
+  "program": "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval",
+  "git": {
+    "remote": "https://github.com/robinfaro/time-moe.git",
+    "commit": "209a56c7746e576430987b33efaad3213c829355"
+  },
+  "email": "robin.faro@epfl.ch",
+  "root": "/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling",
+  "host": "interact-0-0",
+  "executable": "/mloscratch/homes/faro/conda/envs/thesis/bin/python3.10",
+  "cpu_count": 36,
+  "cpu_count_logical": 72,
+  "gpu": "Tesla V100-SXM2-32GB",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "6399114346496",
+      "used": "4521100476416"
+    }
+  },
+  "memory": {
+    "total": "404270809088"
+  },
+  "cpu": {
+    "count": 36,
+    "countLogical": 72
+  },
+  "gpu_nvidia": [
+    {
+      "name": "Tesla V100-SXM2-32GB",
+      "memoryTotal": "34359738368",
+      "cudaCores": 5120,
+      "architecture": "Volta"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
wandb/run-20250410_080613-kly9kjv7/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":128}}
wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
+ {"time":"2025-04-10T08:06:13.116700026Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp0n5clybj/port-17759.txt","pid":17759,"debug":false,"disable-analytics":false}
+ {"time":"2025-04-10T08:06:13.116732345Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+ {"time":"2025-04-10T08:06:13.117250294Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":17759}
+ {"time":"2025-04-10T08:06:13.117259873Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44099,"Zone":""}}
+ {"time":"2025-04-10T08:06:13.301381011Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:06:13.632552814Z","level":"INFO","msg":"handleInformInit: received","streamId":"kly9kjv7","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:06:13.753061117Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"kly9kjv7","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:08:22.003469384Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:08:22.003813875Z","level":"INFO","msg":"server is shutting down"}
+ {"time":"2025-04-10T08:08:22.003802722Z","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:08:22.003987056Z","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:08:22.938316834Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:08:22.938355013Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:41864"}
+ {"time":"2025-04-10T08:08:22.938384806Z","level":"INFO","msg":"server is closed"}
wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
+ {"time":"2025-04-10T08:06:13.634947325Z","level":"INFO","msg":"using version","core version":"0.19.1"}
+ {"time":"2025-04-10T08:06:13.634964986Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log"}
+ {"time":"2025-04-10T08:06:13.752969451Z","level":"INFO","msg":"created new stream","id":"kly9kjv7"}
+ {"time":"2025-04-10T08:06:13.753045008Z","level":"INFO","msg":"stream: started","id":"kly9kjv7"}
+ {"time":"2025-04-10T08:06:13.753098809Z","level":"INFO","msg":"handler: started","stream_id":"kly9kjv7"}
+ {"time":"2025-04-10T08:06:13.753089038Z","level":"INFO","msg":"writer: Do: started","stream_id":"kly9kjv7"}
+ {"time":"2025-04-10T08:06:13.75312827Z","level":"INFO","msg":"sender: started","stream_id":"kly9kjv7"}
+ {"time":"2025-04-10T08:06:14.101026755Z","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2025-04-10T08:08:22.003825637Z","level":"INFO","msg":"stream: closing","id":"kly9kjv7"}
+ {"time":"2025-04-10T08:08:22.003921768Z","level":"INFO","msg":"Stopping system monitor"}
+ {"time":"2025-04-10T08:08:22.004950039Z","level":"INFO","msg":"Stopped system monitor"}
+ {"time":"2025-04-10T08:08:22.715736266Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2025-04-10T08:08:22.936723989Z","level":"INFO","msg":"handler: closed","stream_id":"kly9kjv7"}
+ {"time":"2025-04-10T08:08:22.93681582Z","level":"INFO","msg":"sender: closed","stream_id":"kly9kjv7"}
+ {"time":"2025-04-10T08:08:22.936820808Z","level":"INFO","msg":"writer: Close: closed","stream_id":"kly9kjv7"}
+ {"time":"2025-04-10T08:08:22.938157108Z","level":"INFO","msg":"stream: closed","id":"kly9kjv7"}
wandb/run-20250410_080613-kly9kjv7/logs/debug.log ADDED
@@ -0,0 +1,23 @@
+ 2025-04-10 08:06:13,618 INFO MainThread:17759 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
+ 2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Configure stats pid to 17759
+ 2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
+ 2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
+ 2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from environment variables
+ 2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug.log
+ 2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log
+ 2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:init():644] calling init triggers
+ 2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
+ config: {}
+ 2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():680] starting backend
+ 2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():684] sending inform_init request
+ 2025-04-10 08:06:13,630 INFO MainThread:17759 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2025-04-10 08:06:13,631 INFO MainThread:17759 [wandb_init.py:init():697] backend started and connected
+ 2025-04-10 08:06:13,634 INFO MainThread:17759 [wandb_init.py:init():790] updated telemetry
+ 2025-04-10 08:06:13,651 INFO MainThread:17759 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
+ 2025-04-10 08:06:14,086 INFO MainThread:17759 [wandb_init.py:init():874] starting run threads in backend
+ 2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_console_start():2374] atexit reg
+ 2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_redirect():2224] redirect: wrap_raw
+ 2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_redirect():2289] Wrapping output streams.
+ 2025-04-10 08:06:14,447 INFO MainThread:17759 [wandb_run.py:_redirect():2314] Redirects installed.
+ 2025-04-10 08:06:14,450 INFO MainThread:17759 [wandb_init.py:init():916] run started, returning control to user process
+ 2025-04-10 08:08:22,004 WARNING MsgRouterThr:17759 [router.py:message_loop():75] message_loop has been closed
wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3470404bd4c37d163a54c1a86cb7beac6b443ebd05b979578d2951589ecbc317
+ size 185481
wandb/run-20250410_080940-pqshro55/files/output.log ADDED
@@ -0,0 +1,17 @@
+ 2025-04-10:08:09:50 INFO [__main__:422] Selected Tasks: ['commonsense_qa', 'hellaswag', 'lambada', 'openbookqa', 'sciq']
+ 2025-04-10:08:09:50 INFO [evaluator:180] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+ 2025-04-10:08:09:50 INFO [evaluator:218] Initializing hf model, with arguments: {'pretrained': 'robinfaro/GPT2-1B-base', 'trust_remote_code': True, 'force_download': True}
+ 2025-04-10:08:09:50 INFO [models.huggingface:136] Using device 'cuda:0'
+ 2025-04-10:08:09:51 INFO [models.huggingface:504] Model type cannot be determined. Using default model type 'causal'
+ 2025-04-10:08:09:52 INFO [models.huggingface:377] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
+ config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 2.01MB/s]
+ config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 1.73MB/s]
+ configuration.py: 100%|██████████████████████████████████████████████████████████████████████████████| 1.65k/1.65k [00:00<00:00, 3.86MB/s]
+ config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 1.80MB/s]
+ modeling.py: 100%|███████████████████████████████████████████████████████████████████████████████████| 19.6k/19.6k [00:00<00:00, 32.6MB/s]
+ moe.py: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5.39k/5.39k [00:00<00:00, 12.5MB/s]
+ aux_losses.py: 100%|█████████████████████████████████████████████████████████████████████████████████| 3.15k/3.15k [00:00<00:00, 7.68MB/s]
+ model.safetensors.index.json: 100%|██████████████████████████████████████████████████████████████████| 22.2k/22.2k [00:00<00:00, 15.7MB/s]
+ model-00002-of-00002.safetensors: 100%|██████████████████████████████████████████████████████████████| 1.55G/1.55G [00:30<00:00, 50.3MB/s]
+ model-00001-of-00002.safetensors: 29%|██████████████████▏ | 1.47G/5.00G [00:30<01:39, 35.5MB/s]
+ model-00001-of-00002.safetensors: 83%|███████████████████████████████████████████████████▊ | 4.17G/5.00G [02:14<00:44, 18.5MB/s]
wandb/run-20250410_080940-pqshro55/files/requirements.txt ADDED
@@ -0,0 +1,208 @@
+ wcwidth==0.2.13
+ pure_eval==0.2.3
+ ptyprocess==0.7.0
+ traitlets==5.14.3
+ tornado==6.4.1
+ pyzmq==26.2.0
+ Pygments==2.18.0
+ psutil==6.0.0
+ prompt_toolkit==3.0.47
+ platformdirs==4.3.6
+ pexpect==4.9.0
+ parso==0.8.4
+ nest-asyncio==1.6.0
+ executing==2.1.0
+ exceptiongroup==1.2.2
+ decorator==5.1.1
+ debugpy==1.8.5
+ matplotlib-inline==0.1.7
+ jupyter_core==5.7.2
+ jedi==0.19.1
+ comm==0.2.2
+ asttokens==2.4.1
+ stack-data==0.6.3
+ jupyter_client==8.6.3
+ ipython==8.27.0
+ ipykernel==6.29.5
+ mpmath==1.3.0
+ MarkupSafe==2.1.5
+ Jinja2==3.1.4
+ wheel==0.45.1
+ asttokens==3.0.0
+ debugpy==1.8.13
+ decorator==5.2.1
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ nest_asyncio==1.6.0
+ packaging==24.2
+ parso==0.8.4
+ pickleshare==0.7.5
+ platformdirs==4.3.6
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ Pygments==2.19.1
+ setuptools==75.8.2
+ six==1.17.0
+ tornado==6.4.2
+ traitlets==5.14.3
+ typing_extensions==4.12.2
+ wcwidth==0.2.13
+ zipp==3.21.0
+ comm==0.2.2
+ importlib_metadata==8.6.1
+ jedi==0.19.2
+ jupyter_core==5.7.2
+ matplotlib-inline==0.1.7
+ pexpect==4.9.0
+ pip==25.0.1
+ prompt_toolkit==3.0.50
+ python-dateutil==2.9.0.post0
+ pyzmq==26.2.1
+ stack_data==0.6.3
+ ipython==8.33.0
+ jupyter_client==8.6.3
+ ipykernel==6.29.5
+ pytz==2025.1
+ lit==18.1.8
+ xxhash==3.5.0
+ urllib3==2.3.0
+ tzdata==2025.1
+ tqdm==4.67.1
+ smmap==5.0.2
+ setproctitle==1.3.5
+ regex==2024.11.6
+ PyYAML==6.0.2
+ pydantic_core==2.27.2
+ pyarrow==19.0.1
+ protobuf==5.29.3
+ propcache==0.3.0
+ nvidia-nvtx-cu11==11.7.91
+ nvidia-nccl-cu11==2.14.3
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cufft-cu11==10.9.0.58
+ nvidia-cuda-runtime-cu11==11.7.99
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cublas-cu11==11.10.3.66
+ numpy==1.26.4
+ networkx==3.4.2
+ multidict==6.1.0
+ idna==3.10
+ fsspec==2024.9.0
+ frozenlist==1.5.0
+ filelock==3.17.0
+ docker-pycreds==0.4.0
+ dill==0.3.8
+ cmake==3.31.6
+ click==8.1.8
+ charset-normalizer==3.4.1
+ certifi==2025.1.31
+ attrs==25.1.0
+ async-timeout==5.0.1
+ annotated-types==0.7.0
+ aiohappyeyeballs==2.4.8
+ yarl==1.18.3
+ sentry-sdk==2.22.0
+ requests==2.32.3
+ pydantic==2.10.6
+ pandas==2.2.3
+ nvidia-cusolver-cu11==11.4.0.1
+ nvidia-cudnn-cu11==8.5.0.96
+ multiprocess==0.70.16
+ gitdb==4.0.12
+ aiosignal==1.3.2
+ tiktoken==0.8.0
+ GitPython==3.1.44
+ aiohttp==3.11.13
+ wandb==0.19.1
+ datasets==3.1.0
+ nvidia-cusparse-cu11==11.7.5.86
+ triton==3.2.0
+ nvidia-cusparselt-cu12==0.6.2
+ sympy==1.13.1
+ nvidia-nvtx-cu12==12.4.127
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nccl-cu12==2.21.5
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cusolver-cu12==11.6.1.9
+ torch==2.6.0
+ jmespath==1.0.1
+ botocore==1.37.8
+ s3transfer==0.11.4
+ boto3==1.37.8
+ asciitree==0.3.3
+ numcodecs==0.13.1
+ fasteners==0.19
+ zarr==2.18.3
+ widgetsnbextension==4.0.13
+ jupyterlab_widgets==3.0.13
+ ipywidgets==8.1.5
+ pyparsing==3.2.2
+ pillow==11.1.0
+ kiwisolver==1.4.8
+ fonttools==4.56.0
+ cycler==0.12.1
+ contourpy==1.3.1
+ matplotlib==3.10.1
+ safetensors==0.5.3
+ torchvision==0.21.0
+ timm==1.0.15
+ word2number==1.1
+ sqlitedict==2.1.0
+ zstandard==0.23.0
+ threadpoolctl==3.6.0
+ tcolorpy==0.1.7
+ tabulate==0.9.0
+ scipy==1.15.2
+ pybind11==2.13.6
+ portalocker==3.1.1
+ pathvalidate==3.2.3
+ numexpr==2.10.2
+ more-itertools==10.6.0
+ lxml==5.3.2
+ jsonlines==4.0.0
+ joblib==1.4.2
+ colorama==0.4.6
+ chardet==5.2.0
+ absl-py==2.2.2
+ tqdm-multiprocess==0.0.11
+ scikit-learn==1.6.1
+ sacrebleu==2.5.1
+ nltk==3.9.1
+ mbstrdecoder==1.1.4
+ huggingface-hub==0.30.1
+ typepy==1.3.4
+ tokenizers==0.21.1
+ rouge_score==0.1.2
+ transformers==4.51.0
+ accelerate==1.6.0
+ peft==0.15.1
+ DataProperty==1.1.0
+ tabledata==1.3.4
+ evaluate==0.4.3
+ pytablewriter==1.2.1
+ lm_eval==0.4.8
+ autocommand==2.2.2
+ backports.tarfile==1.2.0
+ importlib_metadata==8.0.0
+ inflect==7.3.1
+ jaraco.collections==5.1.0
+ jaraco.context==5.3.0
+ jaraco.functools==4.0.1
+ jaraco.text==3.12.1
+ more-itertools==10.3.0
+ packaging==24.2
+ platformdirs==4.2.2
+ tomli==2.0.1
+ typeguard==4.3.0
+ typing_extensions==4.12.2
+ wheel==0.43.0
+ zipp==3.19.2
wandb/run-20250410_080940-pqshro55/files/wandb-metadata.json ADDED
@@ -0,0 +1,57 @@
+ {
+ "os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
+ "python": "CPython 3.10.16",
+ "startedAt": "2025-04-10T08:09:40.539738Z",
+ "args": [
+ "--model",
+ "hf",
+ "--model_args",
+ "pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
+ "--tasks",
+ "commonsense_qa,openbookqa,hellaswag,lambada,sciq",
+ "--device",
+ "cuda:0",
+ "--batch_size",
+ "32",
+ "--output_path",
+ "outputs/evaluation/base_GPT",
+ "--wandb_args",
+ "project=lm-evaluation,name=base_GPT_intial_weights",
+ "--log_samples"
+ ],
+ "program": "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval",
+ "git": {
+ "remote": "https://github.com/robinfaro/time-moe.git",
+ "commit": "209a56c7746e576430987b33efaad3213c829355"
+ },
+ "email": "robin.faro@epfl.ch",
+ "root": "/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling",
+ "host": "interact-0-0",
+ "executable": "/mloscratch/homes/faro/conda/envs/thesis/bin/python3.10",
+ "cpu_count": 36,
+ "cpu_count_logical": 72,
+ "gpu": "Tesla V100-SXM2-32GB",
+ "gpu_count": 1,
+ "disk": {
+ "/": {
+ "total": "6399114346496",
+ "used": "4521100247040"
+ }
+ },
+ "memory": {
+ "total": "404270809088"
+ },
+ "cpu": {
+ "count": 36,
+ "countLogical": 72
+ },
+ "gpu_nvidia": [
+ {
+ "name": "Tesla V100-SXM2-32GB",
+ "memoryTotal": "34359738368",
+ "cudaCores": 5120,
+ "architecture": "Volta"
+ }
+ ],
+ "cudaVersion": "12.4"
+ }
wandb/run-20250410_080940-pqshro55/logs/debug-core.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2025-04-10T08:09:40.025636927Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp6h_8vy1g/port-18854.txt","pid":18854,"debug":false,"disable-analytics":false}
+ {"time":"2025-04-10T08:09:40.025665392Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+ {"time":"2025-04-10T08:09:40.026203437Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":18854}
+ {"time":"2025-04-10T08:09:40.026201522Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36677,"Zone":""}}
+ {"time":"2025-04-10T08:09:40.209295737Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:40920"}
+ {"time":"2025-04-10T08:09:40.542511639Z","level":"INFO","msg":"handleInformInit: received","streamId":"pqshro55","id":"127.0.0.1:40920"}
+ {"time":"2025-04-10T08:09:40.655328707Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"pqshro55","id":"127.0.0.1:40920"}
+ {"time":"2025-04-10T08:12:11.07420118Z","level":"INFO","msg":"Parent process exited, terminating service process."}
wandb/run-20250410_080940-pqshro55/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2025-04-10T08:09:40.545238228Z","level":"INFO","msg":"using version","core version":"0.19.1"}
+ {"time":"2025-04-10T08:09:40.545267808Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-core.log"}
+ {"time":"2025-04-10T08:09:40.655286034Z","level":"INFO","msg":"created new stream","id":"pqshro55"}
+ {"time":"2025-04-10T08:09:40.655321987Z","level":"INFO","msg":"stream: started","id":"pqshro55"}
+ {"time":"2025-04-10T08:09:40.655358183Z","level":"INFO","msg":"sender: started","stream_id":"pqshro55"}
+ {"time":"2025-04-10T08:09:40.655363065Z","level":"INFO","msg":"writer: Do: started","stream_id":"pqshro55"}
+ {"time":"2025-04-10T08:09:40.655386983Z","level":"INFO","msg":"handler: started","stream_id":"pqshro55"}
+ {"time":"2025-04-10T08:09:40.92678924Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250410_080940-pqshro55/logs/debug.log ADDED
@@ -0,0 +1,22 @@
+ 2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
+ 2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Configure stats pid to 18854
+ 2025-04-10 08:09:40,529 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
+ 2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
+ 2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from environment variables
+ 2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug.log
+ 2025-04-10 08:09:40,531 INFO MainThread:18854 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
+ 2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():644] calling init triggers
+ 2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
+ config: {}
+ 2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():680] starting backend
+ 2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():684] sending inform_init request
+ 2025-04-10 08:09:40,538 INFO MainThread:18854 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2025-04-10 08:09:40,539 INFO MainThread:18854 [wandb_init.py:init():697] backend started and connected
+ 2025-04-10 08:09:40,540 INFO MainThread:18854 [wandb_init.py:init():790] updated telemetry
+ 2025-04-10 08:09:40,553 INFO MainThread:18854 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
+ 2025-04-10 08:09:40,912 INFO MainThread:18854 [wandb_init.py:init():874] starting run threads in backend
+ 2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_console_start():2374] atexit reg
+ 2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2224] redirect: wrap_raw
+ 2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2289] Wrapping output streams.
+ 2025-04-10 08:09:41,235 INFO MainThread:18854 [wandb_run.py:_redirect():2314] Redirects installed.
+ 2025-04-10 08:09:41,238 INFO MainThread:18854 [wandb_init.py:init():916] run started, returning control to user process
wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f21ede142e38539e967b0af5849784cab3e5a00323d4038d8d9a0921dd277b3e
+ size 262144