anthonym21/Eve-2-MoE-IT-272M

@@ -175,11 +175,10 @@ class SharedMoE(nn.Module):
             mask = flat_indices == i
             batch_idx, rank_idx = torch.where(mask)
-            if batch_idx.numel() > 0:
-                expert_input = flat_x[batch_idx]
-                expert_output = expert(expert_input)
-                weight = flat_weights[batch_idx, rank_idx].unsqueeze(-1)
-                routed_out.view(-1, C).index_add_(0, batch_idx, expert_output * weight)
         return shared_out + routed_out, aux_loss

             mask = flat_indices == i
             batch_idx, rank_idx = torch.where(mask)
+            expert_input = flat_x[batch_idx]
+            expert_output = expert(expert_input)
+            weight = flat_weights[batch_idx, rank_idx].unsqueeze(-1)
+            routed_out.view(-1, C).index_add_(0, batch_idx, expert_output * weight)
         return shared_out + routed_out, aux_loss