Charlie81
/

ThinExperts

Model card Files Files and versions

Charlie81 committed on Jun 9, 2025

Commit

dff49bf

·

1 Parent(s): e868dbf

corrected depth routing

Files changed (1) hide show

myolmoe/modeling_myolmoe.py +9 -2

myolmoe/modeling_myolmoe.py CHANGED Viewed

@@ -483,11 +483,18 @@ class OlmoeSparseMoeBlock(nn.Module):
             selected_experts = sorted_indices[:, ::self.n_step][:, :self.top_k]
             routing_weights = routing_probs.gather(1, selected_experts)
         elif self.routing_type == "depthconstant":
-            effective_top_k = max(1, self.top_k - (self.layer_idx // 2))
             routing_weights, selected_experts = torch.topk(routing_probs, effective_top_k, dim=-1)
         elif self.routing_type == "depthlatter":
-            effective_top_k = self.top_k if self.layer_idx < 8 else max(1, self.top_k + 8 - self.layer_idx)
             routing_weights, selected_experts = torch.topk(routing_probs, effective_top_k, dim=-1)
         else:
             raise ValueError(f"Unknown routing type: {self.routing_type}")

             selected_experts = sorted_indices[:, ::self.n_step][:, :self.top_k]
             routing_weights = routing_probs.gather(1, selected_experts)
         elif self.routing_type == "depthconstant":
+            # Assumes there are 16 layers
+            slope = (self.top_k - 1) / 15
+            effective_top_k = max(1, round(self.top_k - self.layer_idx * slope))
             routing_weights, selected_experts = torch.topk(routing_probs, effective_top_k, dim=-1)
         elif self.routing_type == "depthlatter":
+            if self.layer_idx < 8:
+                effective_top_k = self.top_k
+            else:
+                slope = (self.top_k - 1) / 7
+                effective_top_k = max(1, round(self.top_k - (self.layer_idx - 8) * slope))
             routing_weights, selected_experts = torch.topk(routing_probs, effective_top_k, dim=-1)
         else:
             raise ValueError(f"Unknown routing type: {self.routing_type}")