Charlie81
/

ThinExperts

Model card · Files and versions

Charlie81 committed on Jun 9, 2025

Commit

3aa53b4

·

1 Parent(s): 4d16af6

set to multinomial top k

Files changed (1) hide show

myolmoe/modeling_myolmoe.py +5 -5

myolmoe/modeling_myolmoe.py CHANGED Viewed

@@ -462,11 +462,11 @@ class OlmoeSparseMoeBlock(nn.Module):
         hidden_states = hidden_states.view(-1, hidden_dim)
         router_logits = self.gate(hidden_states)
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        routing_weights, selected_experts = torch.topk(
-            routing_weights, self.top_k, dim=-1
-        )
-        # selected_experts = torch.multinomial(routing_weights, self.top_k, replacement=False)
-        # routing_weights = routing_weights.gather(1, selected_experts)
         if self.norm_topk_prob:
             routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

         hidden_states = hidden_states.view(-1, hidden_dim)
         router_logits = self.gate(hidden_states)
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        # routing_weights, selected_experts = torch.topk(
+        #     routing_weights, self.top_k, dim=-1
+        # )
+        selected_experts = torch.multinomial(routing_weights, self.top_k, replacement=False)
+        routing_weights = routing_weights.gather(1, selected_experts)
         if self.norm_topk_prob:
             routing_weights /= routing_weights.sum(dim=-1, keepdim=True)