Add per-layer expert usage statistics tracking
Browse files- myolmoe/modeling_myolmoe.py +23 -11
myolmoe/modeling_myolmoe.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import math
|
| 2 |
from typing import List, Optional, Tuple, Union
|
|
|
|
| 3 |
import torch
|
| 4 |
import torch.nn.functional as F
|
| 5 |
import torch.utils.checkpoint
|
|
@@ -558,20 +559,17 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 558 |
self.top_k = config.num_experts_per_tok
|
| 559 |
self.norm_topk_prob = config.norm_topk_prob
|
| 560 |
|
| 561 |
-
# Determine if this block is in the second half
|
| 562 |
in_second_half = layer_idx >= self.total_layers // 2
|
| 563 |
|
| 564 |
-
# Determine small expert count for this layer
|
| 565 |
if in_second_half:
|
| 566 |
second_half_idx = layer_idx - (self.total_layers // 2)
|
| 567 |
num_second_half_blocks = self.total_layers - (self.total_layers // 2)
|
| 568 |
-
|
| 569 |
if config.small_expert_strategy == "constant":
|
| 570 |
self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
|
| 571 |
elif config.small_expert_strategy == "increment":
|
| 572 |
-
# Linearly scale small experts from 1 to max_small_expert_count
|
| 573 |
self.num_small_experts = (
|
| 574 |
-
(second_half_idx + 1) * config.max_small_expert_count //
|
|
|
|
| 575 |
)
|
| 576 |
else:
|
| 577 |
raise ValueError(f"Unknown strategy: {config.small_expert_strategy}")
|
|
@@ -584,20 +582,19 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 584 |
]) if self.num_small_experts > 0 else None
|
| 585 |
|
| 586 |
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
self.small_gate = nn.Linear(config.hidden_size, self.num_small_experts, bias=False)
|
| 590 |
-
else:
|
| 591 |
-
self.small_gate = None
|
| 592 |
|
| 593 |
self.small_expert_sparsity_coef = config.small_expert_sparsity_coef
|
| 594 |
|
|
|
|
|
|
|
|
|
|
| 595 |
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 596 |
batch_size, sequence_length, hidden_dim = hidden_states.shape
|
| 597 |
hidden_states = hidden_states.view(-1, hidden_dim)
|
| 598 |
|
| 599 |
router_logits = self.gate(hidden_states)
|
| 600 |
-
|
| 601 |
if self.num_small_experts > 0:
|
| 602 |
small_router_logits = self.small_gate(hidden_states)
|
| 603 |
combined_logits = torch.cat([router_logits, small_router_logits], dim=-1)
|
|
@@ -607,6 +604,12 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 607 |
routing_probs = F.softmax(combined_logits, dim=1, dtype=torch.float)
|
| 608 |
routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
|
| 609 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
if self.norm_topk_prob:
|
| 611 |
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
|
| 612 |
|
|
@@ -632,6 +635,15 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 632 |
|
| 633 |
return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
|
| 634 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
|
| 636 |
class OlmoeDecoderLayer(nn.Module):
|
| 637 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|
|
|
|
| 1 |
import math
|
| 2 |
from typing import List, Optional, Tuple, Union
|
| 3 |
+
from collections import defaultdict
|
| 4 |
import torch
|
| 5 |
import torch.nn.functional as F
|
| 6 |
import torch.utils.checkpoint
|
|
|
|
| 559 |
self.top_k = config.num_experts_per_tok
|
| 560 |
self.norm_topk_prob = config.norm_topk_prob
|
| 561 |
|
|
|
|
| 562 |
in_second_half = layer_idx >= self.total_layers // 2
|
| 563 |
|
|
|
|
| 564 |
if in_second_half:
|
| 565 |
second_half_idx = layer_idx - (self.total_layers // 2)
|
| 566 |
num_second_half_blocks = self.total_layers - (self.total_layers // 2)
|
|
|
|
| 567 |
if config.small_expert_strategy == "constant":
|
| 568 |
self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
|
| 569 |
elif config.small_expert_strategy == "increment":
|
|
|
|
| 570 |
self.num_small_experts = (
|
| 571 |
+
(second_half_idx + 1) * config.max_small_expert_count //
|
| 572 |
+
((num_second_half_blocks * (num_second_half_blocks + 1)) // 2)
|
| 573 |
)
|
| 574 |
else:
|
| 575 |
raise ValueError(f"Unknown strategy: {config.small_expert_strategy}")
|
|
|
|
| 582 |
]) if self.num_small_experts > 0 else None
|
| 583 |
|
| 584 |
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
|
| 585 |
+
self.small_gate = nn.Linear(config.hidden_size, self.num_small_experts, bias=False) \
|
| 586 |
+
if self.num_small_experts > 0 else None
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
self.small_expert_sparsity_coef = config.small_expert_sparsity_coef
|
| 589 |
|
| 590 |
+
# Usage tracking (not a buffer, no gradient)
|
| 591 |
+
self.expert_usage = defaultdict(int)
|
| 592 |
+
|
| 593 |
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 594 |
batch_size, sequence_length, hidden_dim = hidden_states.shape
|
| 595 |
hidden_states = hidden_states.view(-1, hidden_dim)
|
| 596 |
|
| 597 |
router_logits = self.gate(hidden_states)
|
|
|
|
| 598 |
if self.num_small_experts > 0:
|
| 599 |
small_router_logits = self.small_gate(hidden_states)
|
| 600 |
combined_logits = torch.cat([router_logits, small_router_logits], dim=-1)
|
|
|
|
| 604 |
routing_probs = F.softmax(combined_logits, dim=1, dtype=torch.float)
|
| 605 |
routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
|
| 606 |
|
| 607 |
+
# Track expert usage
|
| 608 |
+
for i in range(selected_experts.size(0)):
|
| 609 |
+
for j in range(self.top_k):
|
| 610 |
+
expert_id = selected_experts[i, j].item()
|
| 611 |
+
self.expert_usage[expert_id] += 1
|
| 612 |
+
|
| 613 |
if self.norm_topk_prob:
|
| 614 |
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
|
| 615 |
|
|
|
|
| 635 |
|
| 636 |
return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
|
| 637 |
|
| 638 |
+
def __del__(self):
|
| 639 |
+
if self.expert_usage:
|
| 640 |
+
print(f"\n[Expert Usage Report for Layer {self.layer_idx}]")
|
| 641 |
+
total = sum(self.expert_usage.values())
|
| 642 |
+
for expert_id in sorted(self.expert_usage):
|
| 643 |
+
count = self.expert_usage[expert_id]
|
| 644 |
+
percent = 100.0 * count / total if total > 0 else 0.0
|
| 645 |
+
print(f" Expert {expert_id:2d}: {count} times ({percent:.2f}%)")
|
| 646 |
+
|
| 647 |
|
| 648 |
class OlmoeDecoderLayer(nn.Module):
|
| 649 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|