Update modeling_super_linear.py
Browse files — modeling_super_linear.py (+3 −2)
modeling_super_linear.py
CHANGED
|
@@ -354,6 +354,9 @@ class SparseNoisyMoE(nn.Module):
|
|
| 354 |
output = torch.sum(self.topk_gates.unsqueeze(2) * sparse_expert_outputs, dim=1)
|
| 355 |
|
| 356 |
load_balancing_loss = self.calculate_load_balancing_loss(self.gate_outputs, batch_size)
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
if get_prob:
|
| 359 |
expert_probs = F.softmax(self.gate_outputs, dim=1)
|
|
@@ -509,8 +512,6 @@ class superLinear(nn.Module):
|
|
| 509 |
else:
|
| 510 |
out, self.moe_loss = self.moe(x)
|
| 511 |
|
| 512 |
-
print(out.shape)
|
| 513 |
-
|
| 514 |
|
| 515 |
if self.auto_regressive and self.max_horizon < self.inf_pred_len:
|
| 516 |
# TODO(review): leftover commented-out debug print — remove
|
|
|
|
| 354 |
output = torch.sum(self.topk_gates.unsqueeze(2) * sparse_expert_outputs, dim=1)
|
| 355 |
|
| 356 |
load_balancing_loss = self.calculate_load_balancing_loss(self.gate_outputs, batch_size)
|
| 357 |
+
|
| 358 |
+
expert_probs = F.softmax(self.gate_outputs, dim=1)
|
| 359 |
+
print(expert_probs.shape)
|
| 360 |
|
| 361 |
if get_prob:
|
| 362 |
expert_probs = F.softmax(self.gate_outputs, dim=1)
|
|
|
|
| 512 |
else:
|
| 513 |
out, self.moe_loss = self.moe(x)
|
| 514 |
|
|
|
|
|
|
|
| 515 |
|
| 516 |
if self.auto_regressive and self.max_horizon < self.inf_pred_len:
|
| 517 |
# TODO(review): leftover commented-out debug print — remove
|