Charlie81
/

LoRE

TensorBoard

Safetensors

Model card Files Files and versions

xet

Metrics Training metrics Community

Charlie81 committed on Jul 6, 2025

Commit

36acce3

1 Parent(s): a82f934

reset modeling file

Browse files

Files changed (2) hide show

myolmoe/modeling_myolmoe.py +7 -52
scripts/train.py +1 -2

myolmoe/modeling_myolmoe.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# modeling_myolmoe.py
 import math
 from typing import List, Optional, Tuple, Union
 import torch
@@ -157,21 +156,6 @@ class OlmoeMLP(nn.Module):
     def forward(self, x):
         down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         return down_proj
-class SmallOlmoeMLP(nn.Module):
-    def __init__(self, config, small_expert_intermediate_size):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = small_expert_intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-        self.act_fn = ACT2FN[config.hidden_act]
-    def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
@@ -462,34 +446,17 @@ OLMOE_ATTENTION_CLASSES = {
 }
 class OlmoeSparseMoeBlock(nn.Module):
     def __init__(self, config, layer_idx: int):
         super().__init__()
         self.layer_idx = layer_idx
         self.num_experts = config.num_experts
-        self.num_small_experts = getattr(config, "num_small_experts", 0)  # Default to 0 if not specified
-        self.total_experts = self.num_experts + self.num_small_experts
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
-        self.routing_type = getattr(config, "routing_type", "topk")
-        self.n_step = getattr(config, "nth_step", 2)
-        # Gate now needs to handle both regular and small experts
-        self.gate = nn.Linear(config.hidden_size, self.total_experts, bias=False)
-        # Regular experts
         self.experts = nn.ModuleList([OlmoeMLP(config) for _ in range(self.num_experts)])
-        # Small experts (if any)
-        self.small_experts = nn.ModuleList()
-        if self.num_small_experts > 0:
-            small_expert_intermediate_size = getattr(config, "small_expert_intermediate_size",
-                                                    config.intermediate_size // 2)  # Default to half size
-            self.small_experts = nn.ModuleList([
-                SmallOlmoeMLP(config, small_expert_intermediate_size)
-                for _ in range(self.num_small_experts)
-            ])
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
@@ -497,6 +464,7 @@ class OlmoeSparseMoeBlock(nn.Module):
         router_logits = self.gate(hidden_states)
         routing_probs = F.softmax(router_logits, dim=1, dtype=torch.float)
         routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
         if self.norm_topk_prob:
@@ -509,9 +477,8 @@ class OlmoeSparseMoeBlock(nn.Module):
             device=hidden_states.device,
         )
-        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.total_experts).permute(2, 1, 0)
-        # Process regular experts
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
@@ -521,21 +488,9 @@ class OlmoeSparseMoeBlock(nn.Module):
             current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
             final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
-        # Process small experts
-        for small_expert_idx in range(self.num_small_experts):
-            expert_layer = self.small_experts[small_expert_idx]
-            # Offset by num_experts since small experts come after regular ones
-            global_expert_idx = self.num_experts + small_expert_idx
-            idx, top_x = torch.where(expert_mask[global_expert_idx])
-            if top_x.numel() == 0:
-                continue
-            current_state = hidden_states[top_x]
-            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
-            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
         final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
         return final_hidden_states, router_logits
 class OlmoeDecoderLayer(nn.Module):
     def __init__(self, config: OlmoeConfig, layer_idx: int):
@@ -997,4 +952,4 @@ class MyOlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
             router_logits=outputs.router_logits,
         )
-__all__ = ["MyOlmoeForCausalLM", "OlmoeModel", "OlmoePreTrainedModel"]

 import math
 from typing import List, Optional, Tuple, Union
 import torch
     def forward(self, x):
         down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
         return down_proj
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 }
 class OlmoeSparseMoeBlock(nn.Module):
     def __init__(self, config, layer_idx: int):
         super().__init__()
         self.layer_idx = layer_idx
         self.num_experts = config.num_experts
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
+        self.routing_type = getattr(config, "routing_type", "topk")  # default to topk
+        self.n_step = getattr(config, "nth_step", 2)  # used in nth-descending
+        self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
         self.experts = nn.ModuleList([OlmoeMLP(config) for _ in range(self.num_experts)])
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         router_logits = self.gate(hidden_states)
         routing_probs = F.softmax(router_logits, dim=1, dtype=torch.float)
+        # === Routing  ===
         routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
         if self.norm_topk_prob:
             device=hidden_states.device,
         )
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
             current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
             final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
         final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
         return final_hidden_states, router_logits
 class OlmoeDecoderLayer(nn.Module):
     def __init__(self, config: OlmoeConfig, layer_idx: int):
             router_logits=outputs.router_logits,
         )
+__all__ = ["MyOlmoeForCausalLM", "OlmoeModel", "OlmoePreTrainedModel"]

scripts/train.py CHANGED Viewed

@@ -41,8 +41,7 @@ def expand_model_with_small_experts(base_model):
     print("# DEBUG: Expanding model with small experts...")
     config = base_model.config
     config.num_small_experts = 64  # Add 64 small experts
-    # Changed from //16 to //2 for more reasonable size
-    config.small_expert_intermediate_size = config.intermediate_size // 2
     expanded_model = MyOlmoeForCausalLM(config)
     base_state_dict = base_model.state_dict()

     print("# DEBUG: Expanding model with small experts...")
     config = base_model.config
     config.num_small_experts = 64  # Add 64 small experts
+    config.small_expert_intermediate_size = config.intermediate_size // 32
     expanded_model = MyOlmoeForCausalLM(config)
     base_state_dict = base_model.state_dict()