initial stuff

Files changed:
- myolmoe/config.json  +4 -2
- myolmoe/modeling_myolmoe.py  +51 -5
- scripts/train.py  +85 -0
myolmoe/config.json  (CHANGED)

@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "
+    "MyOlmoeForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
@@ -15,6 +15,8 @@
   "norm_topk_prob": false,
   "num_attention_heads": 16,
   "num_experts": 64,
+  "num_small_experts": 64,
+  "small_expert_intermediate_size": 512,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 16,
   "num_key_value_heads": 16,
@@ -29,4 +31,4 @@
   "transformers_version": "4.52.4",
   "use_cache": true,
   "vocab_size": 50304
-}
+}
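Both new fields are optional: the modeling change below reads them with getattr and falls back to defaults, so a config.json that omits them still loads. A minimal sketch of that lookup in plain json (it assumes intermediate_size is present in the full config, as in the base OLMoE config; values in the comments refer to this config.json):

import json

# Sketch: read the new optional fields the way the modeling code does,
# falling back to defaults when an older config.json omits them.
with open("myolmoe/config.json") as f:
    cfg = json.load(f)

num_small_experts = cfg.get("num_small_experts", 0)        # 64 with this config
small_intermediate_size = cfg.get(
    "small_expert_intermediate_size",
    cfg["intermediate_size"] // 2,                          # default: half the regular expert size
)                                                           # 512 with this config
print(num_small_experts, small_intermediate_size)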
myolmoe/modeling_myolmoe.py  (CHANGED)

@@ -156,6 +156,21 @@ class OlmoeMLP(nn.Module):
     def forward(self, x):
         down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         return down_proj
+
+class SmallOlmoeMLP(nn.Module):
+    def __init__(self, config, small_expert_intermediate_size):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = small_expert_intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
 
 
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
@@ -446,15 +461,34 @@ OLMOE_ATTENTION_CLASSES = {
 }
 
 
+
 class OlmoeSparseMoeBlock(nn.Module):
     def __init__(self, config, layer_idx: int):
         super().__init__()
         self.layer_idx = layer_idx
         self.num_experts = config.num_experts
+        self.num_small_experts = getattr(config, "num_small_experts", 0)  # Default to 0 if not specified
+        self.total_experts = self.num_experts + self.num_small_experts
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
-        self.
+        self.routing_type = getattr(config, "routing_type", "topk")
+        self.n_step = getattr(config, "nth_step", 2)
+
+        # Gate now needs to handle both regular and small experts
+        self.gate = nn.Linear(config.hidden_size, self.total_experts, bias=False)
+
+        # Regular experts
         self.experts = nn.ModuleList([OlmoeMLP(config) for _ in range(self.num_experts)])
+
+        # Small experts (if any)
+        self.small_experts = nn.ModuleList()
+        if self.num_small_experts > 0:
+            small_expert_intermediate_size = getattr(config, "small_expert_intermediate_size",
+                                                     config.intermediate_size // 2)  # Default to half size
+            self.small_experts = nn.ModuleList([
+                SmallOlmoeMLP(config, small_expert_intermediate_size)
+                for _ in range(self.num_small_experts)
+            ])
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
@@ -462,7 +496,6 @@ class OlmoeSparseMoeBlock(nn.Module):
         router_logits = self.gate(hidden_states)
         routing_probs = F.softmax(router_logits, dim=1, dtype=torch.float)
 
-        # === Routing ===
         routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
 
         if self.norm_topk_prob:
@@ -475,8 +508,9 @@ class OlmoeSparseMoeBlock(nn.Module):
             device=hidden_states.device,
         )
 
-        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.total_experts).permute(2, 1, 0)
 
+        # Process regular experts
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])
@@ -486,10 +520,22 @@ class OlmoeSparseMoeBlock(nn.Module):
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
 
+        # Process small experts
+        for small_expert_idx in range(self.num_small_experts):
+            expert_layer = self.small_experts[small_expert_idx]
+            # Offset by num_experts since small experts come after regular ones
+            global_expert_idx = self.num_experts + small_expert_idx
+            idx, top_x = torch.where(expert_mask[global_expert_idx])
+            if top_x.numel() == 0:
+                continue
+            current_state = hidden_states[top_x]
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+
        final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits
-
-
+
+
 class OlmoeDecoderLayer(nn.Module):
     def __init__(self, config: OlmoeConfig, layer_idx: int):
         super().__init__()
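The widened gate now emits total_experts = num_experts + num_small_experts logits per token, and the small experts occupy indices num_experts .. total_experts - 1 of the one-hot expert mask, which is why the small-expert loop offsets by num_experts. A self-contained sketch of that dispatch convention with toy sizes (plain tensors, not the actual module):

import torch
import torch.nn.functional as F

# Toy dispatch check mirroring OlmoeSparseMoeBlock.forward:
# 4 regular + 4 small experts, top-2 routing over 6 token vectors.
num_experts, num_small_experts, top_k, tokens, hidden = 4, 4, 2, 6, 8
total_experts = num_experts + num_small_experts

hidden_states = torch.randn(tokens, hidden)
gate = torch.nn.Linear(hidden, total_experts, bias=False)

routing_probs = F.softmax(gate(hidden_states), dim=-1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_probs, top_k, dim=-1)

# one_hot -> (total_experts, top_k, tokens), same permute as in the diff above
expert_mask = F.one_hot(selected_experts, num_classes=total_experts).permute(2, 1, 0)

for global_idx in range(total_experts):
    idx, top_x = torch.where(expert_mask[global_idx])
    kind = "regular" if global_idx < num_experts else "small"
    print(f"expert {global_idx} ({kind}) handles tokens {top_x.tolist()}")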
scripts/train.py  (ADDED)

@@ -0,0 +1,85 @@
+# scripts/train_small_experts.py
+import torch
+from transformers import AutoTokenizer, TrainingArguments, Trainer
+from datasets import load_dataset
+from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
+from torch.utils.data import Dataset
+
+class CustomDataset(Dataset):
+    def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
+        self.dataset = load_dataset(dataset_name, split="train")
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, idx):
+        item = self.dataset[idx]
+        text = item["text"]  # Adjust based on your dataset structure
+        encoding = self.tokenizer(
+            text,
+            max_length=self.max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt"
+        )
+        return {
+            "input_ids": encoding["input_ids"].squeeze(),
+            "attention_mask": encoding["attention_mask"].squeeze(),
+            "labels": encoding["input_ids"].squeeze()
+        }
+
+def main():
+    # Load base model
+    model_path = "myolmoe"
+    base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
+
+    # Create new config with small experts
+    config = base_model.config
+    config.num_small_experts = 64  # Add 64 small experts
+    config.small_expert_intermediate_size = 512  # Half the size of regular experts
+
+    # Initialize new model with same weights but expanded architecture
+    model = MyOlmoeForCausalLM(config)
+
+    # Copy existing weights
+    model.load_state_dict(base_model.state_dict(), strict=False)
+
+    # Initialize small experts (they'll start with random weights)
+    # You might want to initialize them differently, perhaps with smaller variance
+
+    # Prepare dataset
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    dataset = CustomDataset(tokenizer)
+
+    # Training arguments
+    training_args = TrainingArguments(
+        output_dir="./output",
+        per_device_train_batch_size=4,
+        gradient_accumulation_steps=8,
+        learning_rate=1e-4,
+        num_train_epochs=3,
+        logging_dir="./logs",
+        save_strategy="steps",
+        save_steps=1000,
+        evaluation_strategy="steps",
+        eval_steps=500,
+        fp16=True,
+        gradient_checkpointing=True,
+        report_to="tensorboard"
+    )
+
+    # Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset,
+        eval_dataset=dataset,  # In practice, use a separate validation set
+    )
+
+    # Train
+    trainer.train()
+
+if __name__ == "__main__":
+    main()
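One caveat on the weight copy above: strict=False only tolerates missing keys (the new small experts) and unexpected keys; a tensor whose shape changed, such as the router gate that grew from num_experts to total_experts output rows, still raises a size-mismatch error in PyTorch. A minimal sketch of a shape-aware copy that could stand in for the load_state_dict call (the helper name is illustrative, not part of this commit):

def copy_compatible_weights(src_model, dst_model):
    # Illustrative helper: copy only tensors that exist in both models with
    # identical shapes; the widened gate and the new small experts keep their
    # fresh initialization and are trained on top of the copied base weights.
    src_state = src_model.state_dict()
    dst_state = dst_model.state_dict()
    compatible = {k: v for k, v in src_state.items()
                  if k in dst_state and dst_state[k].shape == v.shape}
    missing, unexpected = dst_model.load_state_dict(compatible, strict=False)
    return missing, unexpected

# e.g. in main(): copy_compatible_weights(base_model, model)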