overhaul

Browse files

Files changed (3) hide show

myolmoe/config.json +4 -1
myolmoe/modeling_myolmoe.py +145 -14
scripts/train.py +60 -148

myolmoe/config.json CHANGED Viewed

@@ -30,5 +30,8 @@
   "torch_dtype": "float32",
   "transformers_version": "4.52.4",
   "use_cache": true,
-  "vocab_size": 50304
 }

   "torch_dtype": "float32",
   "transformers_version": "4.52.4",
   "use_cache": true,
+  "vocab_size": 50304,
+  "small_expert_intermediate_ratio": 0.5,
+  "small_expert_frequency": 4,
+  "small_expert_load_balancing_coef": 0.1
 }

myolmoe/modeling_myolmoe.py CHANGED Viewed

@@ -14,7 +14,103 @@ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_u
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.utils import logging
-from transformers.models.olmoe.configuration_olmoe import OlmoeConfig
 logger = logging.get_logger(__name__)
@@ -143,21 +239,25 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
 class OlmoeMLP(nn.Module):
-    def __init__(self, config):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
         down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         return down_proj
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
     if n_rep == 1:
@@ -446,6 +546,7 @@ OLMOE_ATTENTION_CLASSES = {
 }
 class OlmoeSparseMoeBlock(nn.Module):
     def __init__(self, config, layer_idx: int):
         super().__init__()
@@ -453,10 +554,21 @@ class OlmoeSparseMoeBlock(nn.Module):
         self.num_experts = config.num_experts
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
-        self.routing_type = getattr(config, "routing_type", "topk")  # default to topk
-        self.n_step = getattr(config, "nth_step", 2)  # used in nth-descending
         self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
-        self.experts = nn.ModuleList([OlmoeMLP(config) for _ in range(self.num_experts)])
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
@@ -464,7 +576,6 @@ class OlmoeSparseMoeBlock(nn.Module):
         router_logits = self.gate(hidden_states)
         routing_probs = F.softmax(router_logits, dim=1, dtype=torch.float)
-        # === Routing  ===
         routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
         if self.norm_topk_prob:
@@ -479,6 +590,18 @@ class OlmoeSparseMoeBlock(nn.Module):
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
@@ -489,8 +612,7 @@ class OlmoeSparseMoeBlock(nn.Module):
             final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
         final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
-        return final_hidden_states, router_logits
 class OlmoeDecoderLayer(nn.Module):
     def __init__(self, config: OlmoeConfig, layer_idx: int):
@@ -536,9 +658,9 @@ class OlmoeDecoderLayer(nn.Module):
         hidden_states = residual + hidden_states
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states, router_logits = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-        outputs = (hidden_states,)
         if output_attentions:
             outputs += (self_attn_weights,)
         if use_cache:
@@ -942,6 +1064,15 @@ class MyOlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
             if output_router_logits:
                 output = (aux_loss,) + output
             return (loss,) + output if loss is not None else output
         return MoeCausalLMOutputWithPast(
             loss=loss,
             aux_loss=aux_loss,
@@ -952,4 +1083,4 @@ class MyOlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
             router_logits=outputs.router_logits,
         )
-__all__ = ["MyOlmoeForCausalLM", "OlmoeModel", "OlmoePreTrainedModel"]

 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.utils import logging
+# from transformers.models.olmoe.configuration_olmoe import OlmoeConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+class OlmoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`OlmoeModel`].
+
+    Every named constructor argument other than `pad_token_id`, `bos_token_id`, `eos_token_id` and
+    `tie_word_embeddings` is stored as an attribute of the same name (see the signature for defaults). Those
+    four, together with any extra keyword arguments, are forwarded to [`PretrainedConfig`].
+
+    Args:
+        num_key_value_heads (`int`, *optional*):
+            When `None`, falls back to `num_attention_heads`.
+        rope_scaling (`dict`, *optional*):
+            If it contains a `"type"` key, that value is copied to `"rope_type"` before the config is passed to
+            `rope_config_validation`.
+        small_expert_intermediate_ratio (`float`, *optional*, defaults to 0.5):
+            Ratio of intermediate size for small experts compared to regular experts.
+        small_expert_frequency (`int`, *optional*, defaults to 4):
+            Frequency of small experts - every Nth expert will be small.
+        small_expert_load_balancing_coef (`float`, *optional*, defaults to 0.1):
+            Coefficient for small expert load balancing loss.
+        kwargs:
+            Forwarded unchanged to [`PretrainedConfig`].
+    """
+    model_type = "olmoe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=50304,
+        hidden_size=2048,
+        intermediate_size=2048,
+        num_hidden_layers=16,
+        num_attention_heads=16,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        clip_qkv=None,
+        num_experts_per_tok=8,
+        num_experts=64,
+        output_router_logits=False,
+        router_aux_loss_coef=0.01,
+        norm_topk_prob=False,
+        small_expert_intermediate_ratio=0.5,
+        small_expert_frequency=4,
+        small_expert_load_balancing_coef=0.1,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility: an unset KV-head count means one KV head per attention head
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.clip_qkv = clip_qkv
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.norm_topk_prob = norm_topk_prob
+        # Small expert parameters
+        self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
+        self.small_expert_frequency = small_expert_frequency
+        self.small_expert_load_balancing_coef = small_expert_load_balancing_coef
+        # Validate the correctness of rotary position embeddings parameters.
+        # A "type" key is mirrored into "rope_type" first; note this mutates the caller's dict in place.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        # Token ids, weight tying and any remaining kwargs are handled by the base class.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
 logger = logging.get_logger(__name__)
 class OlmoeMLP(nn.Module):
+    """Gated feed-forward expert computing ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.
+
+    Args:
+        config: Supplies ``hidden_size``, ``intermediate_size``, ``hidden_act`` and, when
+            ``is_small`` is true, ``small_expert_intermediate_ratio``.
+        is_small: When true, the intermediate width is
+            ``int(config.intermediate_size * config.small_expert_intermediate_ratio)``
+            instead of ``config.intermediate_size``.
+    """
+    def __init__(self, config, is_small=False):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
+        # int() truncates, so a ratio that does not divide evenly rounds the width down.
+        if is_small:
+            self.intermediate_size = int(config.intermediate_size * config.small_expert_intermediate_ratio)
+        else:
+            self.intermediate_size = config.intermediate_size
         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
+        # Kept on the module so callers can tell small experts from regular ones.
+        self.is_small = is_small
     def forward(self, x):
         down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         return down_proj
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
     if n_rep == 1:
 }
 class OlmoeSparseMoeBlock(nn.Module):
     def __init__(self, config, layer_idx: int):
         super().__init__()
         self.num_experts = config.num_experts
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
+        self.routing_type = getattr(config, "routing_type", "topk")
+        self.n_step = getattr(config, "nth_step", 2)
+        # Track which experts are small
+        self.small_expert_indices = []
+        self.experts = nn.ModuleList()
+        for i in range(self.num_experts):
+            is_small = (i % config.small_expert_frequency == 0)
+            if is_small:
+                self.small_expert_indices.append(i)
+            self.experts.append(OlmoeMLP(config, is_small=is_small))
         self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
+        self.small_expert_load_balancing_coef = config.small_expert_load_balancing_coef
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         router_logits = self.gate(hidden_states)
         routing_probs = F.softmax(router_logits, dim=1, dtype=torch.float)
         routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
         if self.norm_topk_prob:
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+        # Calculate small expert load balancing loss
+        small_expert_mask = torch.zeros_like(expert_mask)
+        for idx in self.small_expert_indices:
+            small_expert_mask[idx] = expert_mask[idx]
+        small_expert_loss = load_balancing_loss_func(
+            router_logits,
+            self.num_experts,
+            self.top_k,
+            None
+        ) * self.small_expert_load_balancing_coef
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
             final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
         final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits, small_expert_loss
 class OlmoeDecoderLayer(nn.Module):
     def __init__(self, config: OlmoeConfig, layer_idx: int):
         hidden_states = residual + hidden_states
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states, router_logits, small_expert_loss = self.mlp(hidden_states) #
+        hidden_states = residual + hidden_states #
+        outputs = (hidden_states, small_expert_loss)  #
         if output_attentions:
             outputs += (self_attn_weights,)
         if use_cache:
             if output_router_logits:
                 output = (aux_loss,) + output
             return (loss,) + output if loss is not None else output
+        #
+        total_small_expert_loss = 0
+        for layer_output in outputs:
+            if len(layer_output) > 1 and isinstance(layer_output[1], torch.Tensor):
+                total_small_expert_loss += layer_output[1]
+        if labels is not None:
+            loss += total_small_expert_loss.to(loss.device)
+        #
         return MoeCausalLMOutputWithPast(
             loss=loss,
             aux_loss=aux_loss,
             router_logits=outputs.router_logits,
         )
+__all__ = ["MyOlmoeForCausalLM", "OlmoeModel", "OlmoePreTrainedModel", "OlmoeConfig"]

scripts/train.py CHANGED Viewed

@@ -1,170 +1,82 @@
-# scripts/train_small_experts.py
 import torch
-from transformers import TrainingArguments, Trainer, AutoTokenizer
 from datasets import load_dataset
-from myolmoe.modeling_myolmoe import MyOlmoeForCausalLM, OlmoeConfig
-from torch.utils.data import Dataset
 import os
-from tqdm import tqdm
-class CustomDataset(Dataset):
-    def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
-        print(f"# DEBUG: Loading dataset '{dataset_name}' with max length {max_length}")
-        self.dataset = load_dataset(dataset_name, split="train")  # Use train split
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-    def __len__(self):
-        return len(self.dataset)
-    def __getitem__(self, idx):
-        item = self.dataset[idx]
-        text = item["text"]  # Adjust based on your dataset structure
-        encoding = self.tokenizer(
-            text,
-            max_length=self.max_length,
-            padding="max_length",
-            truncation=True,
-            return_tensors="pt"
-        )
-        # DEBUG: Print the first few token IDs for inspection
-        if idx == 0:
-            print(f"# DEBUG: Sample input text: {text[:100]}")
-            print(f"# DEBUG: Tokenized input_ids[:10]: {encoding['input_ids'][0][:10]}")
-        return {
-            "input_ids": encoding["input_ids"].squeeze(),
-            "attention_mask": encoding["attention_mask"].squeeze(),
-            "labels": encoding["input_ids"].squeeze()
-        }
-def expand_model_with_small_experts(base_model):
-    print("# DEBUG: Expanding model with small experts...")
-    config = base_model.config
-    config.num_small_experts = 64  # Add 64 small experts
-    config.small_expert_intermediate_size = config.intermediate_size // 32
-    expanded_model = MyOlmoeForCausalLM(config)
-    base_state_dict = base_model.state_dict()
-    expanded_state_dict = expanded_model.state_dict()
-    print("# DEBUG: Copying non-expert parameters...")
-    for name, param in base_state_dict.items():
-        if "experts" not in name and "gate" not in name:
-            if name in expanded_state_dict:
-                expanded_state_dict[name].copy_(param)
-            else:
-                print(f"# DEBUG: Skipped non-expert param {name} (not found in expanded model)")
-    print("# DEBUG: Copying expert weights...")
-    for i in range(config.num_experts):
-        for proj in ['gate_proj', 'up_proj', 'down_proj']:
-            key = f'model.layers.{i}.mlp.experts.{i}.{proj}.weight'
-            if key in base_state_dict:
-                orig_weight = base_state_dict[key]
-                target_weight = expanded_state_dict[key]
-                if proj == 'down_proj':
-                    # For down_proj, we copy the first part of the input dimension
-                    target_weight.copy_(orig_weight[:, :config.small_expert_intermediate_size])
-                else:
-                    # For gate_proj and up_proj, we copy the first part of the output dimension
-                    target_weight.copy_(orig_weight[:config.small_expert_intermediate_size, :])
-                print(f"# DEBUG: Copied {proj} weights for expert {i} "
-                      f"(original shape: {orig_weight.shape}, new shape: {target_weight.shape})")
-            else:
-                print(f"# DEBUG: Missing {key} in base model")
-    print("# DEBUG: Expanding and initializing gate weights...")
-    for i in range(config.num_hidden_layers):
-        gate_key = f'model.layers.{i}.mlp.gate.weight'
-        if gate_key in base_state_dict:
-            original_gate = base_state_dict[gate_key]
-            new_gate = expanded_state_dict[gate_key]
-            # Copy original gate weights
-            new_gate[:, :config.num_experts].copy_(original_gate)
-            # Initialize small experts gate weights
-            torch.nn.init.normal_(
-                new_gate[:, config.num_experts:],
-                mean=0.0,
-                std=config.initializer_range * 0.1
-            )
-            print(f"# DEBUG: Initialized gate for layer {i} "
-                  f"(original shape: {original_gate.shape}, new shape: {new_gate.shape})")
-        else:
-            print(f"# DEBUG: Missing gate weight {gate_key}")
-    print("# DEBUG: Loading expanded state dict into model...")
-    expanded_model.load_state_dict(expanded_state_dict, strict=False)
-    return expanded_model
-def main():
-    model_path = "myolmoe"
-    print("# DEBUG: Loading base model...")
-    base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
-    print(f"# DEBUG: Base model has {base_model.config.num_experts} experts")
-    print("# DEBUG: Calling expand_model_with_small_experts()...")
-    model = expand_model_with_small_experts(base_model)
-    print(f"# DEBUG: Expanded model has {model.config.num_experts} regular experts and {model.config.num_small_experts} small experts")
-    print("# DEBUG: Loading tokenizer and preparing dataset...")
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-    dataset = CustomDataset(tokenizer)
-    print("# DEBUG: Setting up training arguments...")
     training_args = TrainingArguments(
         output_dir="./output",
-        per_device_train_batch_size=4,
         gradient_accumulation_steps=8,
-        learning_rate=1e-4,
-        num_train_epochs=3,
         logging_dir="./logs",
-        save_strategy="steps",
         save_steps=1000,
-        evaluation_strategy="steps",
-        eval_steps=500,
-        fp16=True,
         gradient_checkpointing=True,
-        report_to="tensorboard"
     )
-    class MoETrainer(Trainer):
-        def __init__(self, *args, **kwargs):
-            self.freeze_existing = kwargs.pop('freeze_existing_experts', False)
-            super().__init__(*args, **kwargs)
-            if self.freeze_existing:
-                print("# DEBUG: Freezing original expert parameters...")
-                frozen_count = 0
-                for name, param in self.model.named_parameters():
-                    if "mlp.experts" in name and "small_experts" not in name:
-                        param.requires_grad = False
-                        frozen_count += 1
-                print(f"# DEBUG: Total frozen expert parameters: {frozen_count}")
-    print("# DEBUG: Initializing trainer...")
-    trainer = MoETrainer(
         model=model,
         args=training_args,
-        train_dataset=dataset,
-        eval_dataset=dataset,
-        freeze_existing_experts=True
     )
-    print("# DEBUG: Starting training...")
     trainer.train()
-    output_dir = "./final_model"
-    os.makedirs(output_dir, exist_ok=True)
-    print(f"# DEBUG: Saving final model to {output_dir}...")
-    model.save_pretrained(output_dir)
-    tokenizer.save_pretrained(output_dir)
-    print("# DEBUG: Training complete!")
 if __name__ == "__main__":
-    main()

+#!/usr/bin/env python3
 import torch
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoTokenizer,
+    TrainingArguments,
+    Trainer,
+    default_data_collator,
+)
 from datasets import load_dataset
+from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
 import os
+def main():
+    """Fine-tune the checkpoint in ``myolmoe`` on ``allenai/tulu-v2-sft-mixture``.
+
+    Loads the config, model and tokenizer from the local ``myolmoe`` directory,
+    tokenizes the train split (attaching causal-LM labels), trains with
+    ``Trainer`` and saves the result to ``./final_model``.
+    """
+    # Load config and model
+    config = OlmoeConfig.from_pretrained("myolmoe/config.json")
+    model = MyOlmoeForCausalLM.from_pretrained(
+        "myolmoe",
+        config=config,
+        torch_dtype=torch.bfloat16,
+        device_map="auto"
+    )
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained("myolmoe")
+    # Pad with the EOS token; padded positions are excluded from the loss below.
+    tokenizer.pad_token = tokenizer.eos_token
+    # Load dataset
+    dataset = load_dataset("allenai/tulu-v2-sft-mixture", split="train")
+    # Resolved once here so the mapped function only closes over a plain bool.
+    has_text_column = "text" in dataset.column_names
+    def tokenize_function(examples):
+        """Tokenize one batch of examples and attach ``labels`` for the LM loss."""
+        if has_text_column:
+            texts = examples["text"]
+        else:
+            # NOTE(review): the dataset card lists a `messages` column (a list of
+            # {"role", "content"} dicts) rather than `text`. Each conversation is
+            # flattened into "<|role|>\ncontent" turns -- confirm the desired template.
+            texts = [
+                "\n".join(f"<|{turn['role']}|>\n{turn['content']}" for turn in conversation)
+                for conversation in examples["messages"]
+            ]
+        encoded = tokenizer(
+            texts,
+            truncation=True,
+            max_length=4096,
+            padding="max_length"
+        )
+        # default_data_collator only batches the features it is given, so labels
+        # must already be in the dataset for the model to return a loss. Padding is
+        # masked via the attention mask (not the token id) because pad == eos here.
+        encoded["labels"] = [
+            [token if keep else -100 for token, keep in zip(ids, mask)]
+            for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
+        ]
+        return encoded
+    tokenized_dataset = dataset.map(
+        tokenize_function,
+        batched=True,
+        remove_columns=dataset.column_names,
+        num_proc=4
+    )
+    # Training arguments
     training_args = TrainingArguments(
         output_dir="./output",
+        per_device_train_batch_size=2,
         gradient_accumulation_steps=8,
+        learning_rate=1e-5,
+        num_train_epochs=1,
         logging_dir="./logs",
+        logging_steps=10,
         save_steps=1000,
+        save_total_limit=2,
+        bf16=True,
         gradient_checkpointing=True,
+        report_to="tensorboard",
+        optim="adamw_torch",
+        lr_scheduler_type="cosine",
+        warmup_ratio=0.1,
+        max_grad_norm=1.0,
     )
+    # Trainer
+    trainer = Trainer(
         model=model,
         args=training_args,
+        train_dataset=tokenized_dataset,
+        tokenizer=tokenizer,
+        data_collator=default_data_collator,
     )
+    # Train
     trainer.train()
+    # Save
+    trainer.save_model("./final_model")
 if __name__ == "__main__":
+    main()