Upload custom config and model files
- .gitattributes +2 -0
- README.md +10 -0
- __init__.py +2 -0
- aux_losses.py +88 -0
- config.json +89 -0
- configuration.py +51 -0
- merges.txt +0 -0
- modeling.py +465 -0
- moe.py +134 -0
- special_tokens_map.json +5 -0
- tokenizer.json +0 -0
- tokenizer_config.json +20 -0
- vocab.json +0 -0
- wandb/debug-internal.log +8 -0
- wandb/debug.log +22 -0
- wandb/run-20250410_080613-kly9kjv7/files/config.yaml +41 -0
- wandb/run-20250410_080613-kly9kjv7/files/output.log +76 -0
- wandb/run-20250410_080613-kly9kjv7/files/requirements.txt +208 -0
- wandb/run-20250410_080613-kly9kjv7/files/wandb-metadata.json +57 -0
- wandb/run-20250410_080613-kly9kjv7/files/wandb-summary.json +1 -0
- wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log +14 -0
- wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log +16 -0
- wandb/run-20250410_080613-kly9kjv7/logs/debug.log +23 -0
- wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb +3 -0
- wandb/run-20250410_080940-pqshro55/files/output.log +17 -0
- wandb/run-20250410_080940-pqshro55/files/requirements.txt +208 -0
- wandb/run-20250410_080940-pqshro55/files/wandb-metadata.json +57 -0
- wandb/run-20250410_080940-pqshro55/logs/debug-core.log +8 -0
- wandb/run-20250410_080940-pqshro55/logs/debug-internal.log +8 -0
- wandb/run-20250410_080940-pqshro55/logs/debug.log +22 -0
- wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,10 @@
---
tags:
- model_hub_mixin
- pytorch_model_hub_mixin
---

This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
- Code: [More Information Needed]
- Paper: [More Information Needed]
- Docs: [More Information Needed]
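Since config.json in this commit maps the transformers Auto classes to the custom configuration.py and modeling.py via "auto_map", loading a checkpoint from this repo needs trust_remote_code=True. A minimal loading sketch (the repo id is a placeholder, not part of this commit):

from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "<user>/<repo>"  # hypothetical repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)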
__init__.py
ADDED
@@ -0,0 +1,2 @@
from .configuration import MoEGPTConfig
from .modeling import MoEGPTForCausalLM
aux_losses.py
ADDED
@@ -0,0 +1,88 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


def log_mean(x, dim):
    return torch.logsumexp(x, dim=dim) - torch.log(
        torch.tensor(x.shape[dim], dtype=torch.float32)
    )


def entropy_reg(logits: torch.Tensor, mean_over_batch: bool = True):
    """Entropy regularization for the router."""

    entropy_l = lambda l: -(l * l.exp()).sum(-1)
    # softmax over experts
    # logits: [batch_size * sequence_length, num_experts]
    logprobs = F.log_softmax(logits, dim=-1)
    if mean_over_batch:
        # take mean probability over batch
        logprobs = log_mean(logprobs, 0)

    return -entropy_l(logprobs).mean()


# two losses below are adapted from
# https://github.com/google/flaxformer/blob/b725bd2a51d70e866d819c92de166fbf24425e6a/flaxformer/architectures/moe/routing.py
def load_balancing_loss(logits: torch.Tensor, expert_indices: torch.Tensor) -> float:
    """Computes auxiliary load balancing loss as in Switch Transformer.

    See Switch Transformer (https://arxiv.org/abs/2101.03961). This function
    implements the loss function presented in equations (4) - (6). It aims to
    penalize those cases where the routing between experts is unbalanced.

    Args:
      logits: logits assigned to each expert per token. Shape:
        <float32>[batch_size * sequence_length, num_experts].
      expert_indices: <int>[batch_size * sequence_length, num_selected_experts]
        indices identifying the top num_selected_experts for a given token.

    Returns:
      The auxiliary loss.
    """
    # num_token = batch_size * sequence_length
    num_token, num_experts = logits.shape

    # Shape: [batch_size * sequence_length, num_selected_experts, num_experts].
    expert_mask = F.one_hot(expert_indices, num_experts)
    # For a given token, determine if it was routed to a given expert.
    # Shape: [batch_size * sequence_length, num_experts]
    expert_mask, _ = torch.max(expert_mask, dim=-2)

    # shape [num_experts]
    tokens_per_expert = torch.mean(expert_mask, dim=0, dtype=torch.float32)

    # compute router probability per expert in log space for numerical stability
    logprobs = F.log_softmax(logits, dim=-1)
    # take mean probability over batch
    # shape [num_experts]
    logprobs = log_mean(logprobs, dim=0)
    router_prob_per_expert = torch.exp(logprobs)
    return (
        torch.mean(  # mean over experts
            tokens_per_expert * router_prob_per_expert,
            dtype=torch.float32,
        )
        * num_experts
    )


def router_z_loss(router_logits: torch.Tensor) -> float:
    """Compute router z-loss.

    The router z-loss was introduced in Designing Effective Sparse Expert Models
    (https://arxiv.org/abs/2202.08906). It encourages router logits to remain
    small in an effort to improve stability.

    Args:
      router_logits: <float>[batch_size * sequence_length, num_experts]
        router logits

    Returns:
      Scalar router z-loss.
    """
    num_tokens, _ = router_logits.shape
    log_z = torch.logsumexp(router_logits, dim=-1)
    z_loss = log_z**2
    return torch.sum(z_loss, dtype=torch.float32) / (num_tokens)
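A quick sanity check of the three losses on random router outputs; a sketch, assuming aux_losses.py is importable as a top-level module (shapes follow the docstrings above):

import torch
from aux_losses import entropy_reg, load_balancing_loss, router_z_loss

num_tokens, num_experts, top_k = 8, 4, 2
logits = torch.randn(num_tokens, num_experts)
# top-k expert indices per token, as a router would select them
_, selected_experts = torch.topk(logits, top_k, dim=-1)

print(entropy_reg(logits))                            # entropy regularizer (scalar)
print(load_balancing_loss(logits, selected_experts))  # near 1.0 when routing is balanced
print(router_z_loss(logits))                          # grows with logit magnitude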
config.json
ADDED
@@ -0,0 +1,89 @@
{
  "return_dict": true,
  "output_hidden_states": false,
  "output_attentions": false,
  "torchscript": false,
  "torch_dtype": null,
  "use_bfloat16": false,
  "tf_legacy_loss": false,
  "pruned_heads": {},
  "tie_word_embeddings": true,
  "chunk_size_feed_forward": 0,
  "is_encoder_decoder": false,
  "is_decoder": false,
  "cross_attention_hidden_size": null,
  "add_cross_attention": false,
  "tie_encoder_decoder": false,
  "max_length": 20,
  "min_length": 0,
  "do_sample": false,
  "early_stopping": false,
  "num_beams": 1,
  "num_beam_groups": 1,
  "diversity_penalty": 0.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "typical_p": 1.0,
  "repetition_penalty": 1.0,
  "length_penalty": 1.0,
  "no_repeat_ngram_size": 0,
  "encoder_no_repeat_ngram_size": 0,
  "bad_words_ids": null,
  "num_return_sequences": 1,
  "output_scores": false,
  "return_dict_in_generate": false,
  "forced_bos_token_id": null,
  "forced_eos_token_id": null,
  "remove_invalid_values": false,
  "exponential_decay_length_penalty": null,
  "suppress_tokens": null,
  "begin_suppress_tokens": null,
  "architectures": [
    "MoEGPTForCausalLM"
  ],
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "tokenizer_class": null,
  "prefix": null,
  "bos_token_id": null,
  "pad_token_id": null,
  "eos_token_id": null,
  "sep_token_id": null,
  "decoder_start_token_id": null,
  "task_specific_params": null,
  "problem_type": null,
  "_name_or_path": "",
  "_attn_implementation_autoset": false,
  "transformers_version": "4.51.0",
  "batch_size": 16,
  "vocab_size": 50304,
  "n_embd": 768,
  "n_layer": 12,
  "n_head": 12,
  "sequence_length": 1024,
  "moe": true,
  "moe_routing": "standard_gating",
  "moe_num_experts": 6,
  "moe_num_experts_per_tok": 2,
  "moe_softmax_order": "softmax_topk",
  "moe_router_loss": "load_balancing_z_loss",
  "moe_aux_loss_factor": 0.01,
  "moe_z_loss_factor": 1.0,
  "mlp_dim_exp_factor": 1.0,
  "dropout": 0.0,
  "bias": false,
  "auto_map": {
    "AutoConfig": "configuration.MoEGPTConfig",
    "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
    "AutoTokenizer": "GPT2TokenizerFast"
  },
  "model_type": "moegpt"
}
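For context, the MoE keys above say each token is routed to 2 of 6 experts with 4x-wide expert MLPs. A back-of-the-envelope count of the expert weights this implies (a sketch; it ignores attention, embeddings, and the router):

n_embd, exp_factor = 768, 4                       # "n_embd"; "mlp_dim_exp_factor" of 1.0 gives the usual 4x MLP
per_expert = 2 * n_embd * (exp_factor * n_embd)   # c_fc + c_proj weight matrices
print(per_expert)        # 4718592 parameters per expert
print(6 * per_expert)    # stored across "moe_num_experts": 6
print(2 * per_expert)    # active per token, "moe_num_experts_per_tok": 2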
configuration.py
ADDED
@@ -0,0 +1,51 @@
from transformers import PretrainedConfig

class MoEGPTConfig(PretrainedConfig):
    model_type = "moegpt"

    def __init__(
        self,
        vocab_size=50304,
        n_embd=768,
        n_layer=12,
        n_head=12,
        sequence_length=1024,
        moe=False,
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,
        moe_z_loss_factor=1.0,
        mlp_dim_exp_factor=1.0,
        dropout=0.0,
        bias=False,
        architectures=["MoEGPTForCausalLM"],
        auto_map={
            "AutoConfig": "configuration.MoEGPTConfig",
            "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
            "AutoTokenizer": "GPT2TokenizerFast"
        },
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
        self.architectures = architectures
        self.auto_map = auto_map
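A config matching the values in config.json above could then be built directly; a minimal sketch:

config = MoEGPTConfig(
    moe=True,
    moe_num_experts=6,        # config.json overrides the default of 4
    moe_num_experts_per_tok=2,
)
print(config.model_type)      # "moegpt"
print(config.moe_num_experts, config.moe_num_experts_per_tok)  # 6 2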
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
modeling.py
ADDED
@@ -0,0 +1,465 @@
from transformers import PreTrainedModel
from .configuration import MoEGPTConfig
# also imports MoE, MaskedMoE, TimeDependantMoE, etc.
import math
import inspect
from typing import Optional, Dict, Any
from dataclasses import dataclass
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from huggingface_hub import PyTorchModelHubMixin
from transformers.utils import ModelOutput


from .moe import (
    #ExpertChoiceMoE,
    MaskedMoE,
    TimeDependantMoE,
    MoE,
)

from .aux_losses import (
    entropy_reg,
    load_balancing_loss,
    router_z_loss,
)

# class Output(ModelOutput):
#     def __init__(self, logits, loss=None, aux_losses=None, router_logits=None):
#         self.logits = logits
#         self.loss = loss
#         self.aux_losses = aux_losses
#         self.router_logits = router_logits
@dataclass
class Output(ModelOutput):
    logits: torch.FloatTensor = None
    loss: Optional[torch.FloatTensor] = None
    aux_losses: Optional[Dict[str, torch.FloatTensor]] = None
    router_logits: Optional[torch.FloatTensor] = None

    def __repr__(self):
        return f"Output(logits={self.logits}, loss={self.loss}, aux_losses={self.aux_losses}, router_logits={self.router_logits})"


class LayerNorm(nn.Module):
    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)


class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
        if not self.flash:
            print(
                "WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0"
            )
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer(
                "bias",
                torch.tril(
                    torch.ones(config.sequence_length, config.sequence_length)
                ).view(1, 1, config.sequence_length, config.sequence_length),
            )

    def forward(self, x):
        # batch size, sequence length, embedding dimensionality (n_embd)
        (
            B,
            T,
            C,
        ) = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, nh, hs)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True
            )
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = (
            y.transpose(1, 2).contiguous().view(B, T, C)
        )  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dim_exp_factor = int(config.mlp_dim_exp_factor * 4)

        self.c_fc = nn.Linear(
            config.n_embd, self.dim_exp_factor * config.n_embd, bias=config.bias
        )
        self.c_proj = nn.Linear(
            self.dim_exp_factor * config.n_embd, config.n_embd, bias=config.bias
        )
        self.dropout = nn.Dropout(config.dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        x = self.c_fc(x)
        x = self.activation(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        # need to return same type as the MoE block, but in this case it's empty
        return x, {}


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.moe_config = config.moe_routing
        if config.moe:
            if config.moe_routing == "standard_gating":
                self.mlp = MoE(config, MLP)
            elif config.moe_routing == "masked":
                self.mlp = TimeDependantMoE(config, MLP)
            #elif config.moe_routing == "expert_choice":
            #    self.mlp = ExpertChoiceMoE(config, MLP)
            else:
                raise ValueError(f"Unknown routing: {config.moe_routing}")
        else:
            self.mlp = MLP(config)

    def forward(self, x, date, *args, **kwargs):
        x = x + self.attn(self.ln_1(x, *args, **kwargs))
        if self.moe_config == "masked":
            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs), date)
        else:
            x_, logits_and_experts = self.mlp(self.ln_2(x, *args, **kwargs))
        x = x + x_
        return x, logits_and_experts


class MoEGPTForCausalLM(PreTrainedModel):
    config_class = MoEGPTConfig

    def __init__(self, config):
        super().__init__(config)
        assert config.vocab_size is not None
        assert config.sequence_length is not None
        self.config = config
        self.tokenizer = tiktoken.get_encoding("gpt2")

        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.vocab_size, config.n_embd),
                wpe=nn.Embedding(config.sequence_length, config.n_embd),
                drop=nn.Dropout(config.dropout),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=LayerNorm(config.n_embd, bias=config.bias),
            )
        )

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = (
            self.lm_head.weight
        )  # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith("c_proj.weight"):
                torch.nn.init.normal_(
                    p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer)
                )
            if pn.endswith("router.weight"):
                # special scaled init to moe router?
                with torch.no_grad():
                    dim = 1 if config.moe_routing == "standard_gating" else 0
                    std = p.std()
                    p.div_(p.sum(dim=dim, keepdim=True))
                    p.mul_(std / p.std())

    def get_router_losses(self, logits, selected_experts, eval=False):
        # logits: (b * seq_len, n_experts)
        # selected_experts: (b * seq_len, topk)
        if eval:  # eval mode, compute all losses
            return {
                "moe_entropy_loss": entropy_reg(logits),
                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
                "moe_z_loss": router_z_loss(logits),
            }
        if self.config.moe_router_loss == "entropy":
            return {
                "moe_entropy_loss": entropy_reg(logits),
            }
        elif self.config.moe_router_loss == "load_balancing_only":
            return {
                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
            }
        elif self.config.moe_router_loss == "load_balancing_z_loss":
            return {
                "moe_aux_loss": load_balancing_loss(logits, selected_experts),
                "moe_z_loss": router_z_loss(logits),
            }
        return {}

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, date=None, targets=None, get_logits=True, moe=False):
        device = idx.device
        b, t = idx.size()
        assert (
            t <= self.config.sequence_length
        ), f"Cannot forward sequence of length {t}, block size is only {self.config.sequence_length}"
        # shape (1, t)
        if date is None:
            # set all the date to 6
            date = torch.full((1, b), 6, dtype=torch.long, device=device).squeeze(0)
        else:
            date = (date - 2013) // 2 + 1
            date = torch.full((1, b), date, dtype=torch.long, device=device).squeeze(0)
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(
            pos
        )  # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)

        # router logits is a list for each layer's routing, each of shape (b * seq_len, n_experts)
        router_logits = []
        # experts is a list for each layer's selected experts, shape (b * seq_len, topk)
        experts = []

        # forward pass through all the transformer blocks
        for block in self.transformer.h:
            x, logits_and_experts = block(x, date)
            if len(logits_and_experts) > 0:
                router_logits.append(logits_and_experts["router_logits"])
                experts.append(logits_and_experts["selected_experts"])
        x = self.transformer.ln_f(x)

        # aux_losses is a dict with keys for different auxiliary losses
        aux_losses = {}

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
            )
            if moe and (self.config.moe_routing == "standard_gating" or self.config.moe_routing == "masked"):
                # calculate the router losses per layer
                for logit, expert_choice in zip(router_logits, experts):
                    router_losses = self.get_router_losses(
                        logit, expert_choice, eval=not self.training
                    )
                    for k, v in router_losses.items():
                        aux_losses[k] = aux_losses.get(k, 0.0) + v
                        if self.training:
                            loss += (
                                v
                                * getattr(self.config, k + "_factor")
                                / self.config.n_layer
                            )
        else:
            # inference-time path: the last-position-only mini-optimization
            # (x[:, [-1], :], which would preserve the time dim) is disabled,
            # so the lm_head is applied to the full sequence
            logits = self.lm_head(
                #x[:, [-1], :]
                x
            )
            loss = None
        logits = logits if get_logits else None
        router_logits = (
            torch.stack(router_logits, dim=0) if len(router_logits) > 0 else None
        )
        # return {
        #     "logits": logits,
        #     "loss": loss,
        #     "aux_losses": aux_losses,
        #     "router_logits": router_logits,
        # }
        return Output(logits=logits, loss=loss, aux_losses=aux_losses, router_logits=router_logits)

    def crop_sequence_length(self, sequence_length):
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert sequence_length <= self.config.sequence_length
        self.config.sequence_length = sequence_length
        self.transformer.wpe.weight = nn.Parameter(
            self.transformer.wpe.weight[:sequence_length]
        )
        for block in self.transformer.h:
            block.attn.bias = block.attn.bias[:, :, :sequence_length, :sequence_length]

    def get_parameter_group_specs(self):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear,)

        BLACKLIST_WEIGHT_MODULES = (
            torch.nn.LayerNorm,
            LayerNorm,
            torch.nn.Embedding,
        )

        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = "%s.%s" % (mn, pn) if mn else pn  # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith("bias"):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith("weight") and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith("weight") and isinstance(m, BLACKLIST_WEIGHT_MODULES):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they
        # will appear in the no_decay and decay sets respectively after the above.
        # In addition, because named_parameters() doesn't return duplicates, it
        # will only return the first occurrence, key'd by 'transformer.wte.weight', below.
        # so let's manually remove 'lm_head.weight' from decay set. This will include
        # this tensor into optimization via transformer.wte.weight only, and not decayed.
        decay.remove("lm_head.weight")

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert (
            len(inter_params) == 0
        ), "parameters %s made it into both decay/no_decay sets!" % (str(inter_params),)
        assert (
            len(param_dict.keys() - union_params) == 0
        ), "parameters %s were not separated into either decay/no_decay set!" % (
            str(param_dict.keys() - union_params),
        )

        # create the pytorch optimizer object
        return [
            {"params": sorted(list(decay))},
            {"params": sorted(list(no_decay)), "weight_decay": 0.0},
        ]

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens, date=None, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        idx = input_ids
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at sequence_length
            idx_cond = (
                idx
                if idx.size(1) <= self.config.sequence_length
                else idx[:, -self.config.sequence_length :]
            )
            # forward the model to get the logits for the index in the sequence
            logits = self(idx_cond, date, get_logits=True).logits
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("Inf")
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
            # check if we hit the end of the sequence
            if idx_next.item() == self.tokenizer.eot_token:
                break

        return idx

    @torch.no_grad()
    def generate_from_string(self, in_str, max_new_tokens, date=None, temperature=1.0, top_k=None):
        idx = (
            torch.tensor(
                self.tokenizer.encode(in_str, allowed_special={"<|endoftext|>"})
            )
            .view(1, -1)
            .to(self.lm_head.weight.device)
        )
        out_idx = (
            self.generate(idx, max_new_tokens, date, temperature, top_k)
            .view(-1)
            .to("cpu")
            .numpy()
        )
        return self.tokenizer.decode(out_idx).split(in_str)[-1]
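Once instantiated, the model can be sampled through the generate helper above; a minimal sketch with a CPU-sized config and an untrained model, so the output tokens are meaningless:

import torch
config = MoEGPTConfig(n_layer=2, n_head=2, n_embd=64, sequence_length=128, moe=True)
model = MoEGPTForCausalLM(config).eval()

idx = torch.randint(0, config.vocab_size, (1, 8))       # dummy prompt tokens
out = model.generate(idx, max_new_tokens=5, date=2023)  # date is remapped internally
print(out.shape)  # (1, <=13): prompt plus up to 5 sampled tokens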
moe.py
ADDED
@@ -0,0 +1,134 @@
"""
Simple MoE routing implementations that replace the MLP block in a standard transformer.
References:
1) Mistral Source for Mixtral MoEs:
https://github.com/mistralai/mistral-src
2) ST-MoE:
https://arxiv.org/abs/2202.08906
3) Our notepad of MoE resources:
https://docs.google.com/document/d/1NuQ5jr7V-Jv1ui7p4KrxO_JTz-7bpYcYMmh49EeJ-QA/edit?usp=sharing
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import bisect


class MoE(nn.Module):
    """
    Simplest MoE implementation with a linear router and softmax over experts.

    Note that in this implementation, we simply loop over the experts and
    aggregate the results. This is not the most efficient way to do it, but
    it also avoids the large memory overhead _and_ has no token dropping
    (because we do not need the capacity factor).
    """

    def __init__(self, config, mlp):
        super().__init__()
        assert config.moe_num_experts > 0
        self.experts = nn.ModuleList(
            [mlp(config=config) for _ in range(config.moe_num_experts)]
        )
        self.router = nn.Linear(config.n_embd, config.moe_num_experts, bias=False)
        self.top_k = config.moe_num_experts_per_tok
        self.softmax_order = config.moe_softmax_order

    def forward(self, inputs: torch.Tensor):
        # [batch_size * sequence_length, n_embd]
        inputs_squashed = inputs.view(-1, inputs.shape[-1])
        # [batch_size * sequence_length, num_experts]
        router_logits = self.router(inputs_squashed)

        # note that selected experts will be the same for all orders:
        # softmax doesn't change top-k, but the weights are different
        if self.softmax_order == "softmax_topk":
            all_probs = F.softmax(router_logits, dim=1)
            weights, selected_experts = torch.topk(all_probs, self.top_k)
        elif self.softmax_order == "topk_softmax":
            weights, selected_experts = torch.topk(router_logits, self.top_k)
            weights = F.softmax(weights, dim=-1)
        else:
            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")

        results = torch.zeros_like(inputs_squashed)
        # naive looping over experts
        for i, expert in enumerate(self.experts):
            batch_idx, nth_expert = torch.where(selected_experts == i)
            output, _ = expert(inputs_squashed[batch_idx])
            results[batch_idx] += weights[batch_idx, nth_expert, None] * output

        # return results and router logits (for aux loss calculation later)
        return results.view_as(inputs), {
            "router_logits": router_logits,
            "selected_experts": selected_experts,
        }


class DummyExpert(nn.Module):
    def __init__(self, output_size: int):
        super().__init__()
        self._output_size = output_size

    def forward(self, inputs: torch.Tensor):
        out = torch.zeros((self._output_size,), device=inputs.device)
        return out, {}


class MaskedMoE(MoE):
    def __init__(self, config, mlp):
        super().__init__(config, mlp)
        self._sequence_length = config.sequence_length
        self.experts.append(DummyExpert(config.n_embd))
        self.router = nn.Linear(config.n_embd, config.moe_num_experts + 1, bias=False)

    def forward(self, inputs: torch.Tensor, mask: torch.Tensor):
        seq_len = inputs.shape[1]
        inputs_squashed = inputs.view(-1, inputs.shape[-1])
        router_logits = self.router(inputs_squashed)
        mask = torch.cat(
            (mask, torch.ones((mask.shape[0], 1), device=mask.device)),
            dim=1
        )
        mask = mask.repeat_interleave(seq_len, dim=0)
        router_logits = router_logits * mask

        # note that selected experts will be the same for all orders:
        # softmax doesn't change top-k, but the weights are different
        if self.softmax_order == "softmax_topk":
            all_probs = F.softmax(router_logits, dim=1)
            weights, selected_experts = torch.topk(all_probs, self.top_k)
        elif self.softmax_order == "topk_softmax":
            weights, selected_experts = torch.topk(router_logits, self.top_k)
            weights = F.softmax(weights, dim=-1)
        else:
            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")

        results = torch.zeros_like(inputs_squashed)
        # naive looping over experts
        for i, expert in enumerate(self.experts):
            batch_idx, nth_expert = torch.where(selected_experts == i)
            output, _ = expert(inputs_squashed[batch_idx])
            results[batch_idx] += weights[batch_idx, nth_expert, None] * output

        # return results and router logits (for aux loss calculation later)
        return results.view_as(inputs), {
            "router_logits": router_logits,
            "selected_experts": selected_experts,
        }


class TimeDependantMoE(nn.Module):
    def __init__(self, config, mlp):
        super().__init__()
        self._num_experts = config.moe_num_experts
        self._mask_moe = MaskedMoE(config, mlp)

    def forward(self, x, date):
        mask_date = torch.zeros(x.shape[0], self._num_experts).to(x.device)
        range_tensor = torch.arange(self._num_experts).unsqueeze(0).to(x.device)
        mask_date = (range_tensor < date.unsqueeze(1)).float()
        return self._mask_moe(x, mask_date)
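The (output, router-info) contract of MoE can be exercised with a stub expert; a sketch (TinyExpert and the SimpleNamespace config are stand-ins for illustration, not part of this repo):

import torch
from types import SimpleNamespace

class TinyExpert(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc = torch.nn.Linear(config.n_embd, config.n_embd)
    def forward(self, x):
        return self.fc(x), {}  # same (output, dict) contract as MLP

cfg = SimpleNamespace(n_embd=16, moe_num_experts=4,
                      moe_num_experts_per_tok=2,
                      moe_softmax_order="softmax_topk")
moe = MoE(cfg, TinyExpert)
x = torch.randn(2, 8, cfg.n_embd)        # (batch, seq_len, n_embd)
out, aux = moe(x)
print(out.shape)                         # torch.Size([2, 8, 16])
print(aux["router_logits"].shape)        # torch.Size([16, 4]): one row per token
print(aux["selected_experts"].shape)     # torch.Size([16, 2]): top-2 experts per token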
special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,20 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "50256": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": {},
  "model_max_length": 1024,
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
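These tokenizer files are the standard GPT-2 set (vocab.json, merges.txt, tokenizer.json), so the tokenizer loads through the usual API; a sketch with a placeholder repo id:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<user>/<repo>")  # hypothetical repo id
print(tokenizer.eos_token, tokenizer.model_max_length)      # <|endoftext|> 1024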
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.
wandb/debug-internal.log
ADDED
@@ -0,0 +1,8 @@
{"time":"2025-04-10T08:09:40.545238228Z","level":"INFO","msg":"using version","core version":"0.19.1"}
{"time":"2025-04-10T08:09:40.545267808Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-core.log"}
{"time":"2025-04-10T08:09:40.655286034Z","level":"INFO","msg":"created new stream","id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655321987Z","level":"INFO","msg":"stream: started","id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655358183Z","level":"INFO","msg":"sender: started","stream_id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655363065Z","level":"INFO","msg":"writer: Do: started","stream_id":"pqshro55"}
{"time":"2025-04-10T08:09:40.655386983Z","level":"INFO","msg":"handler: started","stream_id":"pqshro55"}
{"time":"2025-04-10T08:09:40.92678924Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log
ADDED
@@ -0,0 +1,22 @@
2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Configure stats pid to 18854
2025-04-10 08:09:40,529 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from environment variables
2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug.log
2025-04-10 08:09:40,531 INFO MainThread:18854 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():644] calling init triggers
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
config: {}
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():680] starting backend
2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():684] sending inform_init request
2025-04-10 08:09:40,538 INFO MainThread:18854 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2025-04-10 08:09:40,539 INFO MainThread:18854 [wandb_init.py:init():697] backend started and connected
2025-04-10 08:09:40,540 INFO MainThread:18854 [wandb_init.py:init():790] updated telemetry
2025-04-10 08:09:40,553 INFO MainThread:18854 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
2025-04-10 08:09:40,912 INFO MainThread:18854 [wandb_init.py:init():874] starting run threads in backend
2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_console_start():2374] atexit reg
2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2224] redirect: wrap_raw
2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2289] Wrapping output streams.
2025-04-10 08:09:41,235 INFO MainThread:18854 [wandb_run.py:_redirect():2314] Redirects installed.
2025-04-10 08:09:41,238 INFO MainThread:18854 [wandb_init.py:init():916] run started, returning control to user process
wandb/run-20250410_080613-kly9kjv7/files/config.yaml
ADDED
@@ -0,0 +1,41 @@
_wandb:
  value:
    cli_version: 0.19.1
    m: []
    python_version: 3.10.16
    t:
      "1":
      - 1
      - 5
      - 11
      - 41
      - 49
      - 51
      - 53
      - 55
      - 71
      - 98
      - 100
      "2":
      - 1
      - 5
      - 11
      - 41
      - 49
      - 51
      - 53
      - 55
      - 71
      - 98
      - 100
      "3":
      - 13
      - 23
      - 55
      "4": 3.10.16
      "5": 0.19.1
      "6": 4.51.0
      "8":
      - 5
      "12": 0.19.1
      "13": linux-x86_64
wandb/run-20250410_080613-kly9kjv7/files/output.log
ADDED
@@ -0,0 +1,76 @@
2025-04-10:08:06:24 INFO [__main__:422] Selected Tasks: ['commonsense_qa', 'hellaswag', 'lambada', 'openbookqa', 'sciq']
2025-04-10:08:06:24 INFO [evaluator:180] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-10:08:06:24 INFO [evaluator:218] Initializing hf model, with arguments: {'pretrained': 'robinfaro/GPT2-1B-base', 'trust_remote_code': True, 'force_download': True}
2025-04-10:08:06:24 INFO [models.huggingface:136] Using device 'cuda:0'
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 2.95MB/s]
configuration.py: 100%|███████████████████| 1.65k/1.65k [00:00<00:00, 5.80MB/s]
2025-04-10:08:06:25 INFO [models.huggingface:504] Model type cannot be determined. Using default model type 'causal'
2025-04-10:08:06:26 INFO [models.huggingface:377] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 1.60MB/s]
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 2.83MB/s]
configuration.py: 100%|███████████████████| 1.65k/1.65k [00:00<00:00, 3.04MB/s]
config.json: 100%|███████████████████| 735/735 [00:00<00:00, 2.81MB/s]
modeling.py: 100%|███████████████████| 19.6k/19.6k [00:00<00:00, 31.7MB/s]
aux_losses.py: 100%|███████████████████| 3.15k/3.15k [00:00<00:00, 6.84MB/s]
moe.py: 100%|███████████████████| 5.39k/5.39k [00:00<00:00, 12.2MB/s]
model.safetensors: 100%|███████████████████| 6.23G/6.23G [00:19<00:00, 316MB/s]
Some weights of MoEGPTForCausalLM were not initialized from the model checkpoint at robinfaro/GPT2-1B-base and are newly initialized: ['transformer.wte.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
generation_config.json: 100%|███████████████████| 69.0/69.0 [00:00<00:00, 157kB/s]
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/commonsense_qa.py'
2025-04-10:08:07:11 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/commonsense_qa.py'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/.huggingface.yaml'
2025-04-10:08:07:12 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/.huggingface.yaml'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/dataset_infos.json'
2025-04-10:08:07:13 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--tau--commonsense_qa/.no_exist/94630fe30dad47192a8546eb75f094926d47e155/dataset_infos.json'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/openbookqa.py'
2025-04-10:08:07:29 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/openbookqa.py'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/.huggingface.yaml'
2025-04-10:08:07:29 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/.huggingface.yaml'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/dataset_infos.json'
2025-04-10:08:07:32 ERROR [huggingface_hub.file_download:1497] Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/mloscratch/hf_cache/hub/datasets--openbookqa/.no_exist/388097ea7776314e93a529163e0fea805b8a6454/dataset_infos.json'
2025-04-10:08:07:38 INFO [api.task:426] Building contexts for sciq on rank 0...
100%|███████████████████| 1000/1000 [00:01<00:00, 846.01it/s]
2025-04-10:08:07:39 INFO [api.task:426] Building contexts for openbookqa on rank 0...
100%|███████████████████| 500/500 [00:00<00:00, 2241.62it/s]
2025-04-10:08:07:39 INFO [api.task:426] Building contexts for lambada_openai on rank 0...
100%|███████████████████| 5153/5153 [00:08<00:00, 614.90it/s]
2025-04-10:08:07:48 INFO [api.task:426] Building contexts for lambada_standard on rank 0...
100%|███████████████████| 5153/5153 [00:08<00:00, 612.52it/s]
2025-04-10:08:07:56 INFO [api.task:426] Building contexts for hellaswag on rank 0...
100%|███████████████████| 10042/10042 [00:03<00:00, 2514.18it/s]
2025-04-10:08:08:01 INFO [api.task:426] Building contexts for commonsense_qa on rank 0...
100%|███████████████████| 1221/1221 [00:02<00:00, 574.82it/s]
2025-04-10:08:08:04 INFO [evaluator:542] Running loglikelihood requests
Traceback (most recent call last):
  File "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval", line 8, in <module>
    sys.exit(cli_evaluate())
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/__main__.py", line 432, in cli_evaluate
    results = evaluator.simple_evaluate(
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/utils.py", line 439, in _wrapper
    return fn(*args, **kwargs)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/evaluator.py", line 333, in simple_evaluate
    results = evaluate(
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/utils.py", line 439, in _wrapper
    return fn(*args, **kwargs)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/evaluator.py", line 553, in evaluate
    resps = getattr(lm, reqtype)(cloned_reqs)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/api/model.py", line 378, in loglikelihood
    context_enc, continuation_enc = self._encode_pair(context, continuation)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/api/model.py", line 359, in _encode_pair
    context_enc = self.tok_encode(context)
  File "/mloscratch/homes/faro/thesis/lm-evaluation-harness/lm_eval/models/huggingface.py", line 811, in tok_encode
    encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2654, in encode
    encoded_inputs = self.encode_plus(
  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3073, in encode_plus
    return self._encode_plus(
  File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py", line 126, in _encode_plus
|
| 69 |
+
return super()._encode_plus(*args, **kwargs)
|
| 70 |
+
File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 613, in _encode_plus
|
| 71 |
+
batched_output = self._batch_encode_plus(
|
| 72 |
+
File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py", line 116, in _batch_encode_plus
|
| 73 |
+
return super()._batch_encode_plus(*args, **kwargs)
|
| 74 |
+
File "/mloscratch/homes/faro/conda/envs/thesis/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 539, in _batch_encode_plus
|
| 75 |
+
encodings = self._tokenizer.encode_batch(
|
| 76 |
+
KeyboardInterrupt
|
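Note: the repeated "[Errno 13] Permission denied" messages above are non-fatal; huggingface_hub only failed to write its ".no_exist" cache markers into the shared /mloscratch/hf_cache directory and continued. A minimal sketch of the usual workaround, redirecting the cache to a writable location before any Hugging Face library is imported (the /tmp path below is a placeholder, not taken from this repo):

import os

# Placeholder path: any directory the evaluation job can write to.
os.environ["HF_HOME"] = "/tmp/hf_cache"

# Import only after HF_HOME is set so the new cache location takes effect.
from datasets import load_dataset

dataset = load_dataset("tau/commonsense_qa", split="validation")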
wandb/run-20250410_080613-kly9kjv7/files/requirements.txt
ADDED
@@ -0,0 +1,208 @@
+wcwidth==0.2.13
+pure_eval==0.2.3
+ptyprocess==0.7.0
+traitlets==5.14.3
+tornado==6.4.1
+pyzmq==26.2.0
+Pygments==2.18.0
+psutil==6.0.0
+prompt_toolkit==3.0.47
+platformdirs==4.3.6
+pexpect==4.9.0
+parso==0.8.4
+nest-asyncio==1.6.0
+executing==2.1.0
+exceptiongroup==1.2.2
+decorator==5.1.1
+debugpy==1.8.5
+matplotlib-inline==0.1.7
+jupyter_core==5.7.2
+jedi==0.19.1
+comm==0.2.2
+asttokens==2.4.1
+stack-data==0.6.3
+jupyter_client==8.6.3
+ipython==8.27.0
+ipykernel==6.29.5
+mpmath==1.3.0
+MarkupSafe==2.1.5
+Jinja2==3.1.4
+wheel==0.45.1
+asttokens==3.0.0
+debugpy==1.8.13
+decorator==5.2.1
+exceptiongroup==1.2.2
+executing==2.1.0
+nest_asyncio==1.6.0
+packaging==24.2
+parso==0.8.4
+pickleshare==0.7.5
+platformdirs==4.3.6
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.1
+setuptools==75.8.2
+six==1.17.0
+tornado==6.4.2
+traitlets==5.14.3
+typing_extensions==4.12.2
+wcwidth==0.2.13
+zipp==3.21.0
+comm==0.2.2
+importlib_metadata==8.6.1
+jedi==0.19.2
+jupyter_core==5.7.2
+matplotlib-inline==0.1.7
+pexpect==4.9.0
+pip==25.0.1
+prompt_toolkit==3.0.50
+python-dateutil==2.9.0.post0
+pyzmq==26.2.1
+stack_data==0.6.3
+ipython==8.33.0
+jupyter_client==8.6.3
+ipykernel==6.29.5
+pytz==2025.1
+lit==18.1.8
+xxhash==3.5.0
+urllib3==2.3.0
+tzdata==2025.1
+tqdm==4.67.1
+smmap==5.0.2
+setproctitle==1.3.5
+regex==2024.11.6
+PyYAML==6.0.2
+pydantic_core==2.27.2
+pyarrow==19.0.1
+protobuf==5.29.3
+propcache==0.3.0
+nvidia-nvtx-cu11==11.7.91
+nvidia-nccl-cu11==2.14.3
+nvidia-curand-cu11==10.2.10.91
+nvidia-cufft-cu11==10.9.0.58
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cublas-cu11==11.10.3.66
+numpy==1.26.4
+networkx==3.4.2
+multidict==6.1.0
+idna==3.10
+fsspec==2024.9.0
+frozenlist==1.5.0
+filelock==3.17.0
+docker-pycreds==0.4.0
+dill==0.3.8
+cmake==3.31.6
+click==8.1.8
+charset-normalizer==3.4.1
+certifi==2025.1.31
+attrs==25.1.0
+async-timeout==5.0.1
+annotated-types==0.7.0
+aiohappyeyeballs==2.4.8
+yarl==1.18.3
+sentry-sdk==2.22.0
+requests==2.32.3
+pydantic==2.10.6
+pandas==2.2.3
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cudnn-cu11==8.5.0.96
+multiprocess==0.70.16
+gitdb==4.0.12
+aiosignal==1.3.2
+tiktoken==0.8.0
+GitPython==3.1.44
+aiohttp==3.11.13
+wandb==0.19.1
+datasets==3.1.0
+nvidia-cusparse-cu11==11.7.5.86
+triton==3.2.0
+nvidia-cusparselt-cu12==0.6.2
+sympy==1.13.1
+nvidia-nvtx-cu12==12.4.127
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nccl-cu12==2.21.5
+nvidia-curand-cu12==10.3.5.147
+nvidia-cufft-cu12==11.2.1.3
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cusolver-cu12==11.6.1.9
+torch==2.6.0
+jmespath==1.0.1
+botocore==1.37.8
+s3transfer==0.11.4
+boto3==1.37.8
+asciitree==0.3.3
+numcodecs==0.13.1
+fasteners==0.19
+zarr==2.18.3
+widgetsnbextension==4.0.13
+jupyterlab_widgets==3.0.13
+ipywidgets==8.1.5
+pyparsing==3.2.2
+pillow==11.1.0
+kiwisolver==1.4.8
+fonttools==4.56.0
+cycler==0.12.1
+contourpy==1.3.1
+matplotlib==3.10.1
+safetensors==0.5.3
+torchvision==0.21.0
+timm==1.0.15
+word2number==1.1
+sqlitedict==2.1.0
+zstandard==0.23.0
+threadpoolctl==3.6.0
+tcolorpy==0.1.7
+tabulate==0.9.0
+scipy==1.15.2
+pybind11==2.13.6
+portalocker==3.1.1
+pathvalidate==3.2.3
+numexpr==2.10.2
+more-itertools==10.6.0
+lxml==5.3.2
+jsonlines==4.0.0
+joblib==1.4.2
+colorama==0.4.6
+chardet==5.2.0
+absl-py==2.2.2
+tqdm-multiprocess==0.0.11
+scikit-learn==1.6.1
+sacrebleu==2.5.1
+nltk==3.9.1
+mbstrdecoder==1.1.4
+huggingface-hub==0.30.1
+typepy==1.3.4
+tokenizers==0.21.1
+rouge_score==0.1.2
+transformers==4.51.0
+accelerate==1.6.0
+peft==0.15.1
+DataProperty==1.1.0
+tabledata==1.3.4
+evaluate==0.4.3
+pytablewriter==1.2.1
+lm_eval==0.4.8
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.2
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
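The list above is the pip environment wandb captured automatically for the run; several packages appear more than once with different pins (asttokens, debugpy, more-itertools), which is typical of a layered conda/pip environment dump rather than a hand-curated requirements file. A small sketch, assuming the list is saved locally as requirements.txt, for checking an environment against these pins:

from importlib.metadata import PackageNotFoundError, version

# Compare installed package versions against the pinned list above.
with open("requirements.txt") as fh:
    for line in fh:
        name, _, pinned = line.strip().partition("==")
        if not name:
            continue  # skip blank lines
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed (pinned {pinned})")
        else:
            if installed != pinned:
                print(f"{name}: installed {installed}, pinned {pinned}")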
wandb/run-20250410_080613-kly9kjv7/files/wandb-metadata.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.16",
+  "startedAt": "2025-04-10T08:06:13.632140Z",
+  "args": [
+    "--model",
+    "hf",
+    "--model_args",
+    "pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
+    "--tasks",
+    "commonsense_qa,openbookqa,hellaswag,lambada,sciq",
+    "--device",
+    "cuda:0",
+    "--batch_size",
+    "32",
+    "--output_path",
+    "outputs/evaluation/base_GPT",
+    "--wandb_args",
+    "project=lm-evaluation,name=base_GPT_intial_weights",
+    "--log_samples"
+  ],
+  "program": "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval",
+  "git": {
+    "remote": "https://github.com/robinfaro/time-moe.git",
+    "commit": "209a56c7746e576430987b33efaad3213c829355"
+  },
+  "email": "robin.faro@epfl.ch",
+  "root": "/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling",
+  "host": "interact-0-0",
+  "executable": "/mloscratch/homes/faro/conda/envs/thesis/bin/python3.10",
+  "cpu_count": 36,
+  "cpu_count_logical": 72,
+  "gpu": "Tesla V100-SXM2-32GB",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "6399114346496",
+      "used": "4521100476416"
+    }
+  },
+  "memory": {
+    "total": "404270809088"
+  },
+  "cpu": {
+    "count": 36,
+    "countLogical": 72
+  },
+  "gpu_nvidia": [
+    {
+      "name": "Tesla V100-SXM2-32GB",
+      "memoryTotal": "34359738368",
+      "cudaCores": 5120,
+      "architecture": "Volta"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
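The "args" array above records the exact lm-evaluation-harness invocation behind this run. As a sketch, the same evaluation expressed through the harness's Python API (the traceback in output.log shows the CLI delegating to simple_evaluate; argument names assume the lm_eval 0.4.8 pinned in requirements.txt):

import lm_eval

# Sketch of the recorded CLI arguments as a simple_evaluate call.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
    tasks=["commonsense_qa", "openbookqa", "hellaswag", "lambada", "sciq"],
    device="cuda:0",
    batch_size=32,
    log_samples=True,
)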
wandb/run-20250410_080613-kly9kjv7/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"_wandb":{"runtime":128}}
wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log
ADDED
@@ -0,0 +1,14 @@
+{"time":"2025-04-10T08:06:13.116700026Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp0n5clybj/port-17759.txt","pid":17759,"debug":false,"disable-analytics":false}
+{"time":"2025-04-10T08:06:13.116732345Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2025-04-10T08:06:13.117250294Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":17759}
+{"time":"2025-04-10T08:06:13.117259873Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44099,"Zone":""}}
+{"time":"2025-04-10T08:06:13.301381011Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:06:13.632552814Z","level":"INFO","msg":"handleInformInit: received","streamId":"kly9kjv7","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:06:13.753061117Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"kly9kjv7","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.003469384Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.003813875Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-04-10T08:08:22.003802722Z","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.003987056Z","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.938316834Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.938355013Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:41864"}
+{"time":"2025-04-10T08:08:22.938384806Z","level":"INFO","msg":"server is closed"}
wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log
ADDED
@@ -0,0 +1,16 @@
+{"time":"2025-04-10T08:06:13.634947325Z","level":"INFO","msg":"using version","core version":"0.19.1"}
+{"time":"2025-04-10T08:06:13.634964986Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug-core.log"}
+{"time":"2025-04-10T08:06:13.752969451Z","level":"INFO","msg":"created new stream","id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.753045008Z","level":"INFO","msg":"stream: started","id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.753098809Z","level":"INFO","msg":"handler: started","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.753089038Z","level":"INFO","msg":"writer: Do: started","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:13.75312827Z","level":"INFO","msg":"sender: started","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:06:14.101026755Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-04-10T08:08:22.003825637Z","level":"INFO","msg":"stream: closing","id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.003921768Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-04-10T08:08:22.004950039Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-04-10T08:08:22.715736266Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-04-10T08:08:22.936723989Z","level":"INFO","msg":"handler: closed","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.93681582Z","level":"INFO","msg":"sender: closed","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.936820808Z","level":"INFO","msg":"writer: Close: closed","stream_id":"kly9kjv7"}
+{"time":"2025-04-10T08:08:22.938157108Z","level":"INFO","msg":"stream: closed","id":"kly9kjv7"}
wandb/run-20250410_080613-kly9kjv7/logs/debug.log
ADDED
@@ -0,0 +1,23 @@
+2025-04-10 08:06:13,618 INFO MainThread:17759 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Configure stats pid to 17759
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
+2025-04-10 08:06:13,619 INFO MainThread:17759 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug.log
+2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080613-kly9kjv7/logs/debug-internal.log
+2025-04-10 08:06:13,620 INFO MainThread:17759 [wandb_init.py:init():644] calling init triggers
+2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
+config: {}
+2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():680] starting backend
+2025-04-10 08:06:13,621 INFO MainThread:17759 [wandb_init.py:init():684] sending inform_init request
+2025-04-10 08:06:13,630 INFO MainThread:17759 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-10 08:06:13,631 INFO MainThread:17759 [wandb_init.py:init():697] backend started and connected
+2025-04-10 08:06:13,634 INFO MainThread:17759 [wandb_init.py:init():790] updated telemetry
+2025-04-10 08:06:13,651 INFO MainThread:17759 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
+2025-04-10 08:06:14,086 INFO MainThread:17759 [wandb_init.py:init():874] starting run threads in backend
+2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_console_start():2374] atexit reg
+2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_redirect():2224] redirect: wrap_raw
+2025-04-10 08:06:14,446 INFO MainThread:17759 [wandb_run.py:_redirect():2289] Wrapping output streams.
+2025-04-10 08:06:14,447 INFO MainThread:17759 [wandb_run.py:_redirect():2314] Redirects installed.
+2025-04-10 08:06:14,450 INFO MainThread:17759 [wandb_init.py:init():916] run started, returning control to user process
+2025-04-10 08:08:22,004 WARNING MsgRouterThr:17759 [router.py:message_loop():75] message_loop has been closed
wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3470404bd4c37d163a54c1a86cb7beac6b443ebd05b979578d2951589ecbc317
+size 185481
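Per the .gitattributes rules in this commit, the binary .wandb run file is stored through Git LFS, so the diff shows only its three-line pointer: the pointer-spec version, the SHA-256 oid of the real object, and its size in bytes (185481, about 185 kB). After cloning, git lfs pull resolves the pointer; a sketch of fetching the file directly with huggingface_hub instead, where "user/repo" is a placeholder since this page does not name the repository:

from huggingface_hub import hf_hub_download

# "user/repo" is a placeholder for this model repository's id.
local_path = hf_hub_download(
    repo_id="user/repo",
    filename="wandb/run-20250410_080613-kly9kjv7/run-kly9kjv7.wandb",
)
print(local_path)  # local path to the resolved binary, not the LFS pointer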
wandb/run-20250410_080940-pqshro55/files/output.log
ADDED
@@ -0,0 +1,17 @@
+2025-04-10:08:09:50 INFO [__main__:422] Selected Tasks: ['commonsense_qa', 'hellaswag', 'lambada', 'openbookqa', 'sciq']
+2025-04-10:08:09:50 INFO [evaluator:180] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+2025-04-10:08:09:50 INFO [evaluator:218] Initializing hf model, with arguments: {'pretrained': 'robinfaro/GPT2-1B-base', 'trust_remote_code': True, 'force_download': True}
+2025-04-10:08:09:50 INFO [models.huggingface:136] Using device 'cuda:0'
+2025-04-10:08:09:51 INFO [models.huggingface:504] Model type cannot be determined. Using default model type 'causal'
+2025-04-10:08:09:52 INFO [models.huggingface:377] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
+config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 2.01MB/s]
+config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 1.73MB/s]
+configuration.py: 100%|██████████████████████████████████████████████████████████████████████████████| 1.65k/1.65k [00:00<00:00, 3.86MB/s]
+config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:00<00:00, 1.80MB/s]
+modeling.py: 100%|███████████████████████████████████████████████████████████████████████████████████| 19.6k/19.6k [00:00<00:00, 32.6MB/s]
+moe.py: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5.39k/5.39k [00:00<00:00, 12.5MB/s]
+aux_losses.py: 100%|█████████████████████████████████████████████████████████████████████████████████| 3.15k/3.15k [00:00<00:00, 7.68MB/s]
+model.safetensors.index.json: 100%|██████████████████████████████████████████████████████████████████| 22.2k/22.2k [00:00<00:00, 15.7MB/s]
+model-00002-of-00002.safetensors: 100%|██████████████████████████████████████████████████████████████| 1.55G/1.55G [00:30<00:00, 50.3MB/s]
+model-00001-of-00002.safetensors: 29%|██████████████████▏ | 1.47G/5.00G [00:30<01:39, 35.5MB/s]
+model-00001-of-00002.safetensors: 83%|███████████████████████████████████████████████████▊ | 4.17G/5.00G [02:14<00:44, 18.5MB/s]
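This second run re-downloads the custom code files (configuration.py, modeling.py, moe.py, aux_losses.py) and both safetensors shards because the recorded model args include force_download=True; the log breaks off mid-download of the 5.00 GB first shard. A sketch of the same load outside the harness, via the standard transformers API the harness wraps:

from transformers import AutoModelForCausalLM, AutoTokenizer

# trust_remote_code=True is needed because the checkpoint ships its own
# configuration.py / modeling.py; force_download=True re-fetches every file,
# which is what produces the download bars in the log above.
model = AutoModelForCausalLM.from_pretrained(
    "robinfaro/GPT2-1B-base",
    trust_remote_code=True,
    force_download=True,
)
tokenizer = AutoTokenizer.from_pretrained("robinfaro/GPT2-1B-base")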
wandb/run-20250410_080940-pqshro55/files/requirements.txt
ADDED
@@ -0,0 +1,208 @@
+wcwidth==0.2.13
+pure_eval==0.2.3
+ptyprocess==0.7.0
+traitlets==5.14.3
+tornado==6.4.1
+pyzmq==26.2.0
+Pygments==2.18.0
+psutil==6.0.0
+prompt_toolkit==3.0.47
+platformdirs==4.3.6
+pexpect==4.9.0
+parso==0.8.4
+nest-asyncio==1.6.0
+executing==2.1.0
+exceptiongroup==1.2.2
+decorator==5.1.1
+debugpy==1.8.5
+matplotlib-inline==0.1.7
+jupyter_core==5.7.2
+jedi==0.19.1
+comm==0.2.2
+asttokens==2.4.1
+stack-data==0.6.3
+jupyter_client==8.6.3
+ipython==8.27.0
+ipykernel==6.29.5
+mpmath==1.3.0
+MarkupSafe==2.1.5
+Jinja2==3.1.4
+wheel==0.45.1
+asttokens==3.0.0
+debugpy==1.8.13
+decorator==5.2.1
+exceptiongroup==1.2.2
+executing==2.1.0
+nest_asyncio==1.6.0
+packaging==24.2
+parso==0.8.4
+pickleshare==0.7.5
+platformdirs==4.3.6
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.1
+setuptools==75.8.2
+six==1.17.0
+tornado==6.4.2
+traitlets==5.14.3
+typing_extensions==4.12.2
+wcwidth==0.2.13
+zipp==3.21.0
+comm==0.2.2
+importlib_metadata==8.6.1
+jedi==0.19.2
+jupyter_core==5.7.2
+matplotlib-inline==0.1.7
+pexpect==4.9.0
+pip==25.0.1
+prompt_toolkit==3.0.50
+python-dateutil==2.9.0.post0
+pyzmq==26.2.1
+stack_data==0.6.3
+ipython==8.33.0
+jupyter_client==8.6.3
+ipykernel==6.29.5
+pytz==2025.1
+lit==18.1.8
+xxhash==3.5.0
+urllib3==2.3.0
+tzdata==2025.1
+tqdm==4.67.1
+smmap==5.0.2
+setproctitle==1.3.5
+regex==2024.11.6
+PyYAML==6.0.2
+pydantic_core==2.27.2
+pyarrow==19.0.1
+protobuf==5.29.3
+propcache==0.3.0
+nvidia-nvtx-cu11==11.7.91
+nvidia-nccl-cu11==2.14.3
+nvidia-curand-cu11==10.2.10.91
+nvidia-cufft-cu11==10.9.0.58
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cublas-cu11==11.10.3.66
+numpy==1.26.4
+networkx==3.4.2
+multidict==6.1.0
+idna==3.10
+fsspec==2024.9.0
+frozenlist==1.5.0
+filelock==3.17.0
+docker-pycreds==0.4.0
+dill==0.3.8
+cmake==3.31.6
+click==8.1.8
+charset-normalizer==3.4.1
+certifi==2025.1.31
+attrs==25.1.0
+async-timeout==5.0.1
+annotated-types==0.7.0
+aiohappyeyeballs==2.4.8
+yarl==1.18.3
+sentry-sdk==2.22.0
+requests==2.32.3
+pydantic==2.10.6
+pandas==2.2.3
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cudnn-cu11==8.5.0.96
+multiprocess==0.70.16
+gitdb==4.0.12
+aiosignal==1.3.2
+tiktoken==0.8.0
+GitPython==3.1.44
+aiohttp==3.11.13
+wandb==0.19.1
+datasets==3.1.0
+nvidia-cusparse-cu11==11.7.5.86
+triton==3.2.0
+nvidia-cusparselt-cu12==0.6.2
+sympy==1.13.1
+nvidia-nvtx-cu12==12.4.127
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nccl-cu12==2.21.5
+nvidia-curand-cu12==10.3.5.147
+nvidia-cufft-cu12==11.2.1.3
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cusolver-cu12==11.6.1.9
+torch==2.6.0
+jmespath==1.0.1
+botocore==1.37.8
+s3transfer==0.11.4
+boto3==1.37.8
+asciitree==0.3.3
+numcodecs==0.13.1
+fasteners==0.19
+zarr==2.18.3
+widgetsnbextension==4.0.13
+jupyterlab_widgets==3.0.13
+ipywidgets==8.1.5
+pyparsing==3.2.2
+pillow==11.1.0
+kiwisolver==1.4.8
+fonttools==4.56.0
+cycler==0.12.1
+contourpy==1.3.1
+matplotlib==3.10.1
+safetensors==0.5.3
+torchvision==0.21.0
+timm==1.0.15
+word2number==1.1
+sqlitedict==2.1.0
+zstandard==0.23.0
+threadpoolctl==3.6.0
+tcolorpy==0.1.7
+tabulate==0.9.0
+scipy==1.15.2
+pybind11==2.13.6
+portalocker==3.1.1
+pathvalidate==3.2.3
+numexpr==2.10.2
+more-itertools==10.6.0
+lxml==5.3.2
+jsonlines==4.0.0
+joblib==1.4.2
+colorama==0.4.6
+chardet==5.2.0
+absl-py==2.2.2
+tqdm-multiprocess==0.0.11
+scikit-learn==1.6.1
+sacrebleu==2.5.1
+nltk==3.9.1
+mbstrdecoder==1.1.4
+huggingface-hub==0.30.1
+typepy==1.3.4
+tokenizers==0.21.1
+rouge_score==0.1.2
+transformers==4.51.0
+accelerate==1.6.0
+peft==0.15.1
+DataProperty==1.1.0
+tabledata==1.3.4
+evaluate==0.4.3
+pytablewriter==1.2.1
+lm_eval==0.4.8
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.2
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
wandb/run-20250410_080940-pqshro55/files/wandb-metadata.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "os": "Linux-6.5.0-45-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.16",
+  "startedAt": "2025-04-10T08:09:40.539738Z",
+  "args": [
+    "--model",
+    "hf",
+    "--model_args",
+    "pretrained=robinfaro/GPT2-1B-base,trust_remote_code=True,force_download=True",
+    "--tasks",
+    "commonsense_qa,openbookqa,hellaswag,lambada,sciq",
+    "--device",
+    "cuda:0",
+    "--batch_size",
+    "32",
+    "--output_path",
+    "outputs/evaluation/base_GPT",
+    "--wandb_args",
+    "project=lm-evaluation,name=base_GPT_intial_weights",
+    "--log_samples"
+  ],
+  "program": "/mloscratch/homes/faro/conda/envs/thesis/bin/lm_eval",
+  "git": {
+    "remote": "https://github.com/robinfaro/time-moe.git",
+    "commit": "209a56c7746e576430987b33efaad3213c829355"
+  },
+  "email": "robin.faro@epfl.ch",
+  "root": "/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling",
+  "host": "interact-0-0",
+  "executable": "/mloscratch/homes/faro/conda/envs/thesis/bin/python3.10",
+  "cpu_count": 36,
+  "cpu_count_logical": 72,
+  "gpu": "Tesla V100-SXM2-32GB",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "6399114346496",
+      "used": "4521100247040"
+    }
+  },
+  "memory": {
+    "total": "404270809088"
+  },
+  "cpu": {
+    "count": 36,
+    "countLogical": 72
+  },
+  "gpu_nvidia": [
+    {
+      "name": "Tesla V100-SXM2-32GB",
+      "memoryTotal": "34359738368",
+      "cudaCores": 5120,
+      "architecture": "Volta"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
wandb/run-20250410_080940-pqshro55/logs/debug-core.log
ADDED
@@ -0,0 +1,8 @@
+{"time":"2025-04-10T08:09:40.025636927Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp6h_8vy1g/port-18854.txt","pid":18854,"debug":false,"disable-analytics":false}
+{"time":"2025-04-10T08:09:40.025665392Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2025-04-10T08:09:40.026203437Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":18854}
+{"time":"2025-04-10T08:09:40.026201522Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36677,"Zone":""}}
+{"time":"2025-04-10T08:09:40.209295737Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:40920"}
+{"time":"2025-04-10T08:09:40.542511639Z","level":"INFO","msg":"handleInformInit: received","streamId":"pqshro55","id":"127.0.0.1:40920"}
+{"time":"2025-04-10T08:09:40.655328707Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"pqshro55","id":"127.0.0.1:40920"}
+{"time":"2025-04-10T08:12:11.07420118Z","level":"INFO","msg":"Parent process exited, terminating service process."}
wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
ADDED
@@ -0,0 +1,8 @@
+{"time":"2025-04-10T08:09:40.545238228Z","level":"INFO","msg":"using version","core version":"0.19.1"}
+{"time":"2025-04-10T08:09:40.545267808Z","level":"INFO","msg":"created symlink","path":"/mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-core.log"}
+{"time":"2025-04-10T08:09:40.655286034Z","level":"INFO","msg":"created new stream","id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655321987Z","level":"INFO","msg":"stream: started","id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655358183Z","level":"INFO","msg":"sender: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655363065Z","level":"INFO","msg":"writer: Do: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.655386983Z","level":"INFO","msg":"handler: started","stream_id":"pqshro55"}
+{"time":"2025-04-10T08:09:40.92678924Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250410_080940-pqshro55/logs/debug.log
ADDED
@@ -0,0 +1,22 @@
+2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
+2025-04-10 08:09:40,528 INFO MainThread:18854 [wandb_setup.py:_flush():68] Configure stats pid to 18854
+2025-04-10 08:09:40,529 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/.config/wandb/settings
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/settings
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-04-10 08:09:40,530 INFO MainThread:18854 [wandb_init.py:_log_setup():528] Logging user logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug.log
+2025-04-10 08:09:40,531 INFO MainThread:18854 [wandb_init.py:_log_setup():529] Logging internal logs to /mloscratch/homes/faro/thesis/time-moe/huggingface_modeling/wandb/run-20250410_080940-pqshro55/logs/debug-internal.log
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():644] calling init triggers
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
+config: {}
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():680] starting backend
+2025-04-10 08:09:40,532 INFO MainThread:18854 [wandb_init.py:init():684] sending inform_init request
+2025-04-10 08:09:40,538 INFO MainThread:18854 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-10 08:09:40,539 INFO MainThread:18854 [wandb_init.py:init():697] backend started and connected
+2025-04-10 08:09:40,540 INFO MainThread:18854 [wandb_init.py:init():790] updated telemetry
+2025-04-10 08:09:40,553 INFO MainThread:18854 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
+2025-04-10 08:09:40,912 INFO MainThread:18854 [wandb_init.py:init():874] starting run threads in backend
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_console_start():2374] atexit reg
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2224] redirect: wrap_raw
+2025-04-10 08:09:41,234 INFO MainThread:18854 [wandb_run.py:_redirect():2289] Wrapping output streams.
+2025-04-10 08:09:41,235 INFO MainThread:18854 [wandb_run.py:_redirect():2314] Redirects installed.
+2025-04-10 08:09:41,238 INFO MainThread:18854 [wandb_init.py:init():916] run started, returning control to user process
wandb/run-20250410_080940-pqshro55/run-pqshro55.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f21ede142e38539e967b0af5849784cab3e5a00323d4038d8d9a0921dd277b3e
+size 262144