Charlie81 committed on
Commit
2daadcc
·
1 Parent(s): 7bf23fe

match transformers sparse block

Browse files
Files changed (1) hide show
  1. modeling_myolmoe.py +12 -9
modeling_myolmoe.py CHANGED
@@ -319,22 +319,25 @@ class MyOLMoESparseMoeBlock(nn.Module):
319
 
320
  def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
321
  print(f"DEBUG: MoE forward start - hidden_states shape: {hidden_states.shape}")
322
- batch_size, seq_len, _ = hidden_states.shape
323
  print("absolute precision")
324
- hidden_states = hidden_states.view(-1, self.hidden_size)
325
 
326
  # Get routing weights and selected experts
327
  print(f"DEBUG: 123: {self.router(hidden_states).shape}")
328
  routing_weights, selected_experts, router_logits = self.router(hidden_states)
329
- print(f"DEBUG: MoE forward mid - routing_weights shape: {routing_weights.shape}, selected_experts shape: {selected_experts.shape}")
330
-
 
 
331
  if self.norm_topk_prob:
332
- routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
333
-
 
 
 
334
  final_hidden_states = torch.zeros(
335
- (batch_size * seq_len, self.hidden_size),
336
- dtype=hidden_states.dtype,
337
- device=hidden_states.device
338
  )
339
 
340
  # One-hot expert mask
 
319
 
320
  def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
321
  print(f"DEBUG: MoE forward start - hidden_states shape: {hidden_states.shape}")
322
+ batch_size, seq_len, hidden_dim = hidden_states.shape
323
  print("absolute precision")
324
+ hidden_states = hidden_states.view(-1, hidden_dim)
325
 
326
  # Get routing weights and selected experts
327
  print(f"DEBUG: 123: {self.router(hidden_states).shape}")
328
  routing_weights, selected_experts, router_logits = self.router(hidden_states)
329
+ router_logits = self.gate(hidden_states)
330
+
331
+ routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
332
+ routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
333
  if self.norm_topk_prob:
334
+ routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
335
+ # we cast back to the input dtype
336
+ routing_weights = routing_weights.to(hidden_states.dtype)
337
+ print(f"DEBUG: MoE forward mid - routing_weights shape: {routing_weights.shape}, selected_experts shape: {selected_experts.shape}")
338
+
339
  final_hidden_states = torch.zeros(
340
+ (batch_size * seq_len, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
 
 
341
  )
342
 
343
  # One-hot expert mask