Fix small-expert MoE routing (separate small-expert pool and gate) and add `small_expert_sparsity_coef` config option
Browse files- myolmoe/config.json +1 -1
- myolmoe/modeling_myolmoe.py +65 -74
myolmoe/config.json
CHANGED
|
@@ -33,5 +33,5 @@
|
|
| 33 |
"vocab_size": 50304,
|
| 34 |
"small_expert_intermediate_ratio": 16,
|
| 35 |
"small_expert_count": 64,
|
| 36 |
-
"
|
| 37 |
}
|
|
|
|
| 33 |
"vocab_size": 50304,
|
| 34 |
"small_expert_intermediate_ratio": 16,
|
| 35 |
"small_expert_count": 64,
|
| 36 |
+
"small_expert_sparsity_coef": 0.1
|
| 37 |
}
|
myolmoe/modeling_myolmoe.py
CHANGED
|
@@ -29,7 +29,7 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 29 |
Ratio of intermediate size for small experts compared to regular experts.
|
| 30 |
small_expert_count (`int`, *optional*, defaults to 64):
|
| 31 |
Frequency of small experts - every Nth expert will be small.
|
| 32 |
-
|
| 33 |
Coefficient for small expert load balancing loss.
|
| 34 |
"""
|
| 35 |
model_type = "olmoe"
|
|
@@ -64,7 +64,7 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 64 |
norm_topk_prob=False,
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
small_expert_count=64,
|
| 67 |
-
|
| 68 |
**kwargs,
|
| 69 |
):
|
| 70 |
self.vocab_size = vocab_size
|
|
@@ -97,7 +97,7 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 97 |
# Small expert parameters
|
| 98 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 99 |
self.small_expert_count = small_expert_count
|
| 100 |
-
self.
|
| 101 |
|
| 102 |
# Validate the correctness of rotary position embeddings parameters
|
| 103 |
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
|
@@ -546,65 +546,60 @@ OLMOE_ATTENTION_CLASSES = {
|
|
| 546 |
}
|
| 547 |
|
| 548 |
|
| 549 |
-
|
| 550 |
class OlmoeSparseMoeBlock(nn.Module):
|
| 551 |
def __init__(self, config, layer_idx: int):
|
| 552 |
super().__init__()
|
| 553 |
self.layer_idx = layer_idx
|
| 554 |
self.num_experts = config.num_experts
|
|
|
|
| 555 |
self.top_k = config.num_experts_per_tok
|
| 556 |
self.norm_topk_prob = config.norm_topk_prob
|
| 557 |
-
self.
|
| 558 |
-
self.
|
| 559 |
-
|
| 560 |
-
# Track which experts are small
|
| 561 |
-
self.small_expert_indices = list(range(config.num_experts - config.small_expert_count, config.num_experts))
|
| 562 |
-
self.experts = nn.ModuleList()
|
| 563 |
|
| 564 |
-
for
|
| 565 |
-
# Small experts are now at the end indices
|
| 566 |
-
is_small = i in self.small_expert_indices
|
| 567 |
-
self.experts.append(OlmoeMLP(config, is_small=is_small))
|
| 568 |
-
|
| 569 |
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
|
| 570 |
-
self.
|
|
|
|
| 571 |
|
| 572 |
-
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 573 |
batch_size, sequence_length, hidden_dim = hidden_states.shape
|
| 574 |
hidden_states = hidden_states.view(-1, hidden_dim)
|
|
|
|
|
|
|
| 575 |
router_logits = self.gate(hidden_states)
|
| 576 |
-
|
| 577 |
-
|
|
|
|
|
|
|
|
|
|
| 578 |
routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
|
| 579 |
|
| 580 |
if self.norm_topk_prob:
|
| 581 |
-
routing_weights
|
| 582 |
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
)
|
| 589 |
-
|
| 590 |
-
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
|
| 591 |
-
|
| 592 |
-
# Calculate small expert load balancing loss
|
| 593 |
-
small_expert_mask = torch.zeros_like(expert_mask)
|
| 594 |
-
for idx in self.small_expert_indices:
|
| 595 |
-
small_expert_mask[idx] = expert_mask[idx]
|
| 596 |
|
| 597 |
-
|
| 598 |
-
|
| 599 |
idx, top_x = torch.where(expert_mask[expert_idx])
|
| 600 |
-
if top_x.
|
| 601 |
continue
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
|
| 606 |
-
|
| 607 |
-
return final_hidden_states, router_logits
|
| 608 |
|
| 609 |
class OlmoeDecoderLayer(nn.Module):
|
| 610 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|
|
@@ -1042,42 +1037,38 @@ class MyOlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
|
|
| 1042 |
if labels is not None:
|
| 1043 |
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
| 1044 |
#
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
|
| 1053 |
-
|
| 1054 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1055 |
)
|
|
|
|
| 1056 |
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
for idx in range(self.config.num_experts - self.config.small_expert_count,
|
| 1063 |
-
self.config.num_experts):
|
| 1064 |
-
small_expert_mask = small_expert_mask.scatter(-1, torch.tensor([idx]), 1.0)
|
| 1065 |
|
| 1066 |
-
|
| 1067 |
-
masked_router_logits
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
| 1075 |
-
if labels is not None:
|
| 1076 |
-
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
| 1077 |
-
if aux_loss is not None:
|
| 1078 |
-
loss += self.router_aux_loss_coef * aux_loss.to(loss.device)
|
| 1079 |
-
if total_small_expert_loss is not None:
|
| 1080 |
-
loss += total_small_expert_loss.to(loss.device)
|
| 1081 |
#
|
| 1082 |
return MoeCausalLMOutputWithPast(
|
| 1083 |
loss=loss,
|
|
|
|
| 29 |
Ratio of intermediate size for small experts compared to regular experts.
|
| 30 |
small_expert_count (`int`, *optional*, defaults to 64):
|
| 31 |
Frequency of small experts - every Nth expert will be small.
|
| 32 |
+
small_expert_sparsity_coef (`float`, *optional*, defaults to 0.1):
|
| 33 |
Coefficient for small expert load balancing loss.
|
| 34 |
"""
|
| 35 |
model_type = "olmoe"
|
|
|
|
| 64 |
norm_topk_prob=False,
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
small_expert_count=64,
|
| 67 |
+
small_expert_sparsity_coef=0.1,
|
| 68 |
**kwargs,
|
| 69 |
):
|
| 70 |
self.vocab_size = vocab_size
|
|
|
|
| 97 |
# Small expert parameters
|
| 98 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 99 |
self.small_expert_count = small_expert_count
|
| 100 |
+
self.small_expert_sparsity_coef = small_expert_sparsity_coef
|
| 101 |
|
| 102 |
# Validate the correctness of rotary position embeddings parameters
|
| 103 |
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
|
|
|
| 546 |
}
|
| 547 |
|
| 548 |
|
|
|
|
class OlmoeSparseMoeBlock(nn.Module):
    """Sparse MoE block with two expert pools: regular experts and "small"
    experts (reduced intermediate size, per ``config.small_expert_intermediate_ratio``
    — presumably; confirm against OlmoeMLP). One gate scores each pool; the
    concatenated logits are routed jointly with a single top-k selection, so a
    token may mix regular and small experts.
    """

    def __init__(self, config, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.num_experts = config.num_experts
        self.num_small_experts = config.small_expert_count
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # Regular experts occupy combined indices [0, num_experts); small
        # experts occupy [num_experts, num_experts + num_small_experts).
        self.experts = nn.ModuleList([OlmoeMLP(config) for _ in range(self.num_experts)])
        self.small_experts = nn.ModuleList(
            [OlmoeMLP(config, is_small=True) for _ in range(self.num_small_experts)]
        )

        # Gates for both expert types
        self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
        self.small_gate = nn.Linear(config.hidden_size, self.num_small_experts, bias=False)
        self.small_expert_sparsity_coef = config.small_expert_sparsity_coef

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Route each token to its top-k experts across both pools.

        Args:
            hidden_states: tensor of shape (batch, seq_len, hidden_dim).

        Returns:
            A pair of (mixed hidden states with the input's shape, combined
            router logits of shape (batch*seq_len, num_experts + num_small_experts)).
        """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)

        # Get logits for both expert types and combine them for joint routing.
        router_logits = self.gate(hidden_states)
        small_router_logits = self.small_gate(hidden_states)
        combined_logits = torch.cat([router_logits, small_router_logits], dim=-1)

        # Softmax in float32 for numerical stability; logits are 2-D so dim=1
        # is the expert dimension.
        routing_probs = F.softmax(combined_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)

        if self.norm_topk_prob:
            # Renormalize so the k selected weights sum to 1 for every token.
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

        final_hidden_states = torch.zeros_like(hidden_states)
        # Mask of shape (total_experts, top_k, num_tokens): which tokens picked
        # each expert at which top-k slot.
        expert_mask = torch.nn.functional.one_hot(
            selected_experts, num_classes=self.num_experts + self.num_small_experts
        ).permute(2, 1, 0)

        # Process all experts (regular + small); skip experts no token selected.
        for expert_idx in range(self.num_experts + self.num_small_experts):
            idx, top_x = torch.where(expert_mask[expert_idx])
            if top_x.shape[0] == 0:
                continue

            if expert_idx < self.num_experts:
                expert = self.experts[expert_idx]
            else:
                expert = self.small_experts[expert_idx - self.num_experts]

            current_states = hidden_states[top_x]
            current_output = expert(current_states) * routing_weights[top_x, idx, None]
            # Scatter-accumulate each expert's weighted output back to its tokens.
            final_hidden_states.index_add_(0, top_x, current_output.to(hidden_states.dtype))

        return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
|
|
|
| 603 |
|
| 604 |
class OlmoeDecoderLayer(nn.Module):
|
| 605 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|
|
|
|
| 1037 |
if labels is not None:
|
| 1038 |
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
| 1039 |
#
|
| 1040 |
+
total_aux_loss = 0
|
| 1041 |
+
if output_router_logits and outputs.router_logits is not None:
|
| 1042 |
+
# Regular load balancing loss
|
| 1043 |
+
total_aux_loss += load_balancing_loss_func(
|
| 1044 |
+
outputs.router_logits,
|
| 1045 |
+
num_experts=self.config.num_experts + self.config.small_expert_count,
|
| 1046 |
+
top_k=self.config.num_experts_per_tok,
|
| 1047 |
+
attention_mask=attention_mask
|
| 1048 |
+
)
|
| 1049 |
+
|
| 1050 |
+
# Small expert sparsity loss
|
| 1051 |
+
small_expert_mask = torch.zeros(
|
| 1052 |
+
self.config.num_experts + self.config.small_expert_count,
|
| 1053 |
+
device=outputs.router_logits[0].device
|
| 1054 |
)
|
| 1055 |
+
small_expert_mask[self.config.num_experts:] = 1.0
|
| 1056 |
|
| 1057 |
+
masked_router_logits = []
|
| 1058 |
+
for logits in outputs.router_logits:
|
| 1059 |
+
# Apply mask to emphasize small experts
|
| 1060 |
+
masked_logits = logits * small_expert_mask * self.config.small_expert_sparsity_coef
|
| 1061 |
+
masked_router_logits.append(masked_logits)
|
|
|
|
|
|
|
|
|
|
| 1062 |
|
| 1063 |
+
total_aux_loss += load_balancing_loss_func(
|
| 1064 |
+
tuple(masked_router_logits),
|
| 1065 |
+
num_experts=self.config.num_experts + self.config.small_expert_count,
|
| 1066 |
+
top_k=self.config.num_experts_per_tok,
|
| 1067 |
+
attention_mask=attention_mask
|
| 1068 |
+
)
|
| 1069 |
+
|
| 1070 |
+
if loss is not None:
|
| 1071 |
+
loss += self.router_aux_loss_coef * total_aux_loss.to(loss.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1072 |
#
|
| 1073 |
return MoeCausalLMOutputWithPast(
|
| 1074 |
loss=loss,
|