Charlie81 committed on
Commit
170c7d7
·
1 Parent(s): be9d959

refactor sparse

Browse files
Files changed (1) hide show
  1. modeling_myolmoe.py +11 -12
modeling_myolmoe.py CHANGED
@@ -223,6 +223,7 @@ class MyOLMoERouting(nn.Module):
223
  self.hidden_size = config.hidden_size
224
  self.routing_type = getattr(config, "routing_type", "sparse")
225
  self.router_temperature = getattr(config, "router_temperature", 1.0)
 
226
 
227
  # Shared components
228
  self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
@@ -230,20 +231,13 @@ class MyOLMoERouting(nn.Module):
230
  # For non-deterministic routing
231
  self.gumbel_noise = getattr(config, "gumbel_noise", 0.1)
232
 
233
- def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
234
  batch_size, sequence_length, hidden_dim = hidden_states.shape
 
235
  hidden_states = hidden_states.view(-1, hidden_dim)
 
236
  router_logits = self.gate(hidden_states)
237
 
238
- # Always use softmax, even for "dense" routing
239
- routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
240
- routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
241
-
242
- if self.norm_topk_prob:
243
- routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
244
-
245
- routing_weights = routing_weights.to(hidden_states.dtype)
246
-
247
  if self.routing_type == "dense":
248
  # Dense routing - use all experts equally
249
  routing_weights = torch.ones_like(router_logits) / self.num_experts
@@ -262,11 +256,16 @@ class MyOLMoERouting(nn.Module):
262
 
263
  else: # Default sparse routing
264
  # Standard sparse top-k routing
265
- routing_weights = F.softmax(router_logits, dim=-1)
266
  routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
267
 
 
 
 
 
 
268
  return routing_weights, selected_experts, router_logits
269
-
270
  class OlmoeRotaryEmbedding(nn.Module):
271
  def __init__(self, config: OlmoeConfig, device=None):
272
  super().__init__()
 
223
  self.hidden_size = config.hidden_size
224
  self.routing_type = getattr(config, "routing_type", "sparse")
225
  self.router_temperature = getattr(config, "router_temperature", 1.0)
226
+ self.norm_topk_prob = getattr(config, "norm_topk_prob", False)
227
 
228
  # Shared components
229
  self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
 
231
  # For non-deterministic routing
232
  self.gumbel_noise = getattr(config, "gumbel_noise", 0.1)
233
 
234
+ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
235
  batch_size, sequence_length, hidden_dim = hidden_states.shape
236
+ print("TEST testtest123")
237
  hidden_states = hidden_states.view(-1, hidden_dim)
238
+ print("TEST 4564645testtest123")
239
  router_logits = self.gate(hidden_states)
240
 
 
 
 
 
 
 
 
 
 
241
  if self.routing_type == "dense":
242
  # Dense routing - use all experts equally
243
  routing_weights = torch.ones_like(router_logits) / self.num_experts
 
256
 
257
  else: # Default sparse routing
258
  # Standard sparse top-k routing
259
+ routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float)
260
  routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
261
 
262
+ if self.norm_topk_prob:
263
+ routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
264
+
265
+ routing_weights = routing_weights.to(hidden_states.dtype)
266
+
267
  return routing_weights, selected_experts, router_logits
268
+
269
  class OlmoeRotaryEmbedding(nn.Module):
270
  def __init__(self, config: OlmoeConfig, device=None):
271
  super().__init__()