multinomial8
Browse files- myolmoe/config.json +1 -1
- myolmoe/modeling_myolmoe.py +3 -3
myolmoe/config.json — CHANGED (+1 −1)

@@ -15,7 +15,7 @@
   "norm_topk_prob": false,
   "num_attention_heads": 16,
   "num_experts": 64,
-  "num_experts_per_tok":
+  "num_experts_per_tok": 8,
   "num_hidden_layers": 16,
   "num_key_value_heads": 16,
   "output_router_logits": false,

NOTE(review): the removed line's old value did not survive extraction — only the key `"num_experts_per_tok":` is visible; confirm the previous value against the repository history.
myolmoe/modeling_myolmoe.py — CHANGED (+3 −3)

@@ -462,9 +462,9 @@ class OlmoeSparseMoeBlock(nn.Module):
         hidden_states = hidden_states.view(-1, hidden_dim)
         router_logits = self.gate(hidden_states)
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        (three removed lines — their content was lost in extraction)
-
-
+        selected_experts = torch.multinomial(routing_weights, self.top_k, replacement=False)
+        routing_weights = routing_weights.gather(1, selected_experts)
+
         if self.norm_topk_prob:
             routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
         routing_weights = routing_weights.to(hidden_states.dtype)

NOTE(review): the three removed lines did not survive extraction (only bare `-` markers remain); presumably they performed the previous expert-selection step that the added `torch.multinomial` sampling replaces — verify against the repository history.