Attempt a new distribution of small experts across layers
Browse files- myolmoe/modeling_myolmoe.py +48 -15
- scripts/train.py +6 -2
myolmoe/modeling_myolmoe.py
CHANGED
|
@@ -65,6 +65,8 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
small_expert_count=64,
|
| 67 |
small_expert_sparsity_coef=0.1,
|
|
|
|
|
|
|
| 68 |
**kwargs,
|
| 69 |
):
|
| 70 |
self.vocab_size = vocab_size
|
|
@@ -98,6 +100,8 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 98 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 99 |
self.small_expert_count = small_expert_count
|
| 100 |
self.small_expert_sparsity_coef = small_expert_sparsity_coef
|
|
|
|
|
|
|
| 101 |
|
| 102 |
# Validate the correctness of rotary position embeddings parameters
|
| 103 |
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
|
@@ -550,28 +554,57 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 550 |
def __init__(self, config, layer_idx: int):
|
| 551 |
super().__init__()
|
| 552 |
self.layer_idx = layer_idx
|
|
|
|
| 553 |
self.num_experts = config.num_experts
|
| 554 |
-
self.num_small_experts = config.small_expert_count
|
| 555 |
self.top_k = config.num_experts_per_tok
|
| 556 |
self.norm_topk_prob = config.norm_topk_prob
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
self.experts = nn.ModuleList([OlmoeMLP(config) for _ in range(self.num_experts)])
|
| 558 |
-
self.small_experts = nn.ModuleList([
|
| 559 |
-
|
| 560 |
-
|
|
|
|
| 561 |
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
|
| 562 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
self.small_expert_sparsity_coef = config.small_expert_sparsity_coef
|
| 564 |
|
| 565 |
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 566 |
batch_size, sequence_length, hidden_dim = hidden_states.shape
|
| 567 |
hidden_states = hidden_states.view(-1, hidden_dim)
|
| 568 |
-
|
| 569 |
-
# Get logits for both expert types
|
| 570 |
router_logits = self.gate(hidden_states)
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
|
|
|
|
|
|
|
|
|
| 575 |
routing_probs = F.softmax(combined_logits, dim=1, dtype=torch.float)
|
| 576 |
routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
|
| 577 |
|
|
@@ -580,27 +613,27 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 580 |
|
| 581 |
final_hidden_states = torch.zeros_like(hidden_states)
|
| 582 |
expert_mask = torch.nn.functional.one_hot(
|
| 583 |
-
selected_experts,
|
| 584 |
num_classes=self.num_experts + self.num_small_experts
|
| 585 |
).permute(2, 1, 0)
|
| 586 |
|
| 587 |
-
# Process all experts (regular + small)
|
| 588 |
for expert_idx in range(self.num_experts + self.num_small_experts):
|
| 589 |
idx, top_x = torch.where(expert_mask[expert_idx])
|
| 590 |
if top_x.shape[0] == 0:
|
| 591 |
continue
|
| 592 |
-
|
| 593 |
if expert_idx < self.num_experts:
|
| 594 |
expert = self.experts[expert_idx]
|
| 595 |
else:
|
| 596 |
expert = self.small_experts[expert_idx - self.num_experts]
|
| 597 |
-
|
| 598 |
current_states = hidden_states[top_x]
|
| 599 |
current_output = expert(current_states) * routing_weights[top_x, idx, None]
|
| 600 |
final_hidden_states.index_add_(0, top_x, current_output.to(hidden_states.dtype))
|
| 601 |
|
| 602 |
return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
|
| 603 |
|
|
|
|
| 604 |
class OlmoeDecoderLayer(nn.Module):
|
| 605 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|
| 606 |
super().__init__()
|
|
|
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
small_expert_count=64,
|
| 67 |
small_expert_sparsity_coef=0.1,
|
| 68 |
+
small_expert_strategy="constant", # NEW
|
| 69 |
+
max_small_expert_count=64, # NEW: total possible small experts
|
| 70 |
**kwargs,
|
| 71 |
):
|
| 72 |
self.vocab_size = vocab_size
|
|
|
|
| 100 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 101 |
self.small_expert_count = small_expert_count
|
| 102 |
self.small_expert_sparsity_coef = small_expert_sparsity_coef
|
| 103 |
+
self.small_expert_strategy = small_expert_strategy
|
| 104 |
+
self.max_small_expert_count = max_small_expert_count
|
| 105 |
|
| 106 |
# Validate the correctness of rotary position embeddings parameters
|
| 107 |
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
|
|
|
| 554 |
def __init__(self, config, layer_idx: int):
    """Mixture-of-experts block with optional per-layer "small" experts.

    Regular experts exist in every layer; small experts are only
    instantiated in the second half of the network, with a per-layer
    count decided by ``config.small_expert_strategy``:

    * ``"constant"``  — ``max_small_expert_count`` is split evenly
      across the second-half layers (integer division may drop a
      remainder).
    * ``"increment"`` — ``max_small_expert_count`` is split across the
      second-half layers proportionally to depth: later layers get more,
      and the total over all layers is approximately
      ``max_small_expert_count`` (again up to integer division, which
      can give the earliest second-half layers zero small experts).

    Args:
        config: model config; must provide ``num_hidden_layers``,
            ``num_experts``, ``num_experts_per_tok``, ``norm_topk_prob``,
            ``hidden_size``, ``small_expert_strategy``,
            ``max_small_expert_count`` and ``small_expert_sparsity_coef``.
        layer_idx: zero-based index of this decoder layer.

    Raises:
        ValueError: if ``config.small_expert_strategy`` is not
            ``"constant"`` or ``"increment"``.
    """
    super().__init__()
    self.layer_idx = layer_idx
    self.total_layers = config.num_hidden_layers
    self.num_experts = config.num_experts
    self.top_k = config.num_experts_per_tok
    self.norm_topk_prob = config.norm_topk_prob

    # Small experts live only in the second half of the layer stack.
    first_second_half_layer = self.total_layers // 2
    in_second_half = layer_idx >= first_second_half_layer

    # Determine the small-expert count for this layer.
    if in_second_half:
        second_half_idx = layer_idx - first_second_half_layer
        num_second_half_blocks = self.total_layers - first_second_half_layer

        if config.small_expert_strategy == "constant":
            # Even split of the budget over the second-half layers.
            self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
        elif config.small_expert_strategy == "increment":
            # Depth-proportional split: the i-th second-half layer
            # (1-based) receives i / (1 + 2 + ... + num_blocks) of the
            # total budget.  NOTE(review): the previous comment claimed
            # this scales each layer "from 1 to max_small_expert_count";
            # it actually distributes a *total* of ~max_small_expert_count
            # across the second half.
            triangular = (num_second_half_blocks * (num_second_half_blocks + 1)) // 2
            self.num_small_experts = (
                (second_half_idx + 1) * config.max_small_expert_count // triangular
            )
        else:
            raise ValueError(f"Unknown strategy: {config.small_expert_strategy}")
    else:
        self.num_small_experts = 0

    self.experts = nn.ModuleList([OlmoeMLP(config) for _ in range(self.num_experts)])
    # None (rather than an empty ModuleList) when this layer has no
    # small experts, so downstream code can cheaply test for presence.
    self.small_experts = nn.ModuleList([
        OlmoeMLP(config, is_small=True) for _ in range(self.num_small_experts)
    ]) if self.num_small_experts > 0 else None

    self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)

    # Separate router for the small experts; its logits are concatenated
    # with the main gate's in forward().
    if self.num_small_experts > 0:
        self.small_gate = nn.Linear(config.hidden_size, self.num_small_experts, bias=False)
    else:
        self.small_gate = None

    self.small_expert_sparsity_coef = config.small_expert_sparsity_coef
|
| 595 |
|
| 596 |
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Route each token to its top-k experts and mix their outputs.

    Regular and (optional) small experts share a single softmax over the
    concatenated router logits, so both expert kinds compete for the
    same top-k slots per token.

    Args:
        hidden_states: tensor of shape (batch, seq_len, hidden_dim).

    Returns:
        A tuple of:
        * mixed hidden states, shape (batch, seq_len, hidden_dim);
        * combined router logits, shape
          (batch * seq_len, num_experts + num_small_experts), for the
          auxiliary load-balancing loss.
    """
    batch_size, sequence_length, hidden_dim = hidden_states.shape
    # Flatten to (num_tokens, hidden_dim) so routing is per-token.
    hidden_states = hidden_states.view(-1, hidden_dim)

    # Get logits for both expert types.
    router_logits = self.gate(hidden_states)

    if self.num_small_experts > 0:
        small_router_logits = self.small_gate(hidden_states)
        combined_logits = torch.cat([router_logits, small_router_logits], dim=-1)
    else:
        combined_logits = router_logits

    # Softmax in float32 for numerical stability regardless of model dtype.
    routing_probs = F.softmax(combined_logits, dim=-1, dtype=torch.float)
    routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
    # BUGFIX: honor config.norm_topk_prob — it was stored in __init__ but
    # never applied.  When set, renormalize the selected top-k
    # probabilities to sum to 1 per token (as in the reference OLMoE /
    # Mixtral MoE blocks).
    if self.norm_topk_prob:
        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

    final_hidden_states = torch.zeros_like(hidden_states)
    # one_hot -> permute gives shape (num_classes, top_k, num_tokens), so
    # expert_mask[e] tells which (slot, token) pairs selected expert e.
    expert_mask = torch.nn.functional.one_hot(
        selected_experts,
        num_classes=self.num_experts + self.num_small_experts
    ).permute(2, 1, 0)

    # Process all experts (regular + small).
    for expert_idx in range(self.num_experts + self.num_small_experts):
        idx, top_x = torch.where(expert_mask[expert_idx])
        if top_x.shape[0] == 0:
            continue

        # Indices >= num_experts address the small-expert list.
        if expert_idx < self.num_experts:
            expert = self.experts[expert_idx]
        else:
            expert = self.small_experts[expert_idx - self.num_experts]

        current_states = hidden_states[top_x]
        current_output = expert(current_states) * routing_weights[top_x, idx, None]
        final_hidden_states.index_add_(0, top_x, current_output.to(hidden_states.dtype))

    return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
|
| 635 |
|
| 636 |
+
|
| 637 |
class OlmoeDecoderLayer(nn.Module):
|
| 638 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|
| 639 |
super().__init__()
|
scripts/train.py
CHANGED
|
@@ -73,7 +73,7 @@ def main():
|
|
| 73 |
per_device_train_batch_size=2,
|
| 74 |
gradient_accumulation_steps=8,
|
| 75 |
learning_rate=1e-4,
|
| 76 |
-
num_train_epochs=
|
| 77 |
logging_dir="./logs",
|
| 78 |
logging_steps=10,
|
| 79 |
save_steps=1000,
|
|
@@ -94,10 +94,14 @@ def main():
|
|
| 94 |
# Unfreeze only the small experts and their gating networks
|
| 95 |
trainable_params = []
|
| 96 |
for name, param in model.named_parameters():
|
| 97 |
-
if
|
|
|
|
|
|
|
|
|
|
| 98 |
param.requires_grad = True
|
| 99 |
trainable_params.append(name)
|
| 100 |
print(f"Unfreezing parameter: {name}")
|
|
|
|
| 101 |
|
| 102 |
print(f"Total trainable parameters: {len(trainable_params)}")
|
| 103 |
|
|
|
|
| 73 |
per_device_train_batch_size=2,
|
| 74 |
gradient_accumulation_steps=8,
|
| 75 |
learning_rate=1e-4,
|
| 76 |
+
num_train_epochs=0.001,
|
| 77 |
logging_dir="./logs",
|
| 78 |
logging_steps=10,
|
| 79 |
save_steps=1000,
|
|
|
|
| 94 |
# Unfreeze only the small experts and their gating networks
|
| 95 |
trainable_params = []
|
| 96 |
for name, param in model.named_parameters():
|
| 97 |
+
if (
|
| 98 |
+
"small_experts" in name or
|
| 99 |
+
"small_gate" in name
|
| 100 |
+
):
|
| 101 |
param.requires_grad = True
|
| 102 |
trainable_params.append(name)
|
| 103 |
print(f"Unfreezing parameter: {name}")
|
| 104 |
+
|
| 105 |
|
| 106 |
print(f"Total trainable parameters: {len(trainable_params)}")
|
| 107 |
|