Update modeling_moe_mistral.py
Browse files- modeling_moe_mistral.py +5 -6
modeling_moe_mistral.py
CHANGED
|
@@ -215,17 +215,16 @@ class MoE(nn.Module):
|
|
| 215 |
orig_shape = x.shape
|
| 216 |
x = x.view(-1, x.shape[-1])
|
| 217 |
|
| 218 |
-
scores = self.gate(x)
|
| 219 |
expert_weights, expert_indices = torch.topk(scores, self.num_experts_per_token, dim=-1)
|
| 220 |
-
expert_weights = expert_weights.softmax(dim=-1)
|
| 221 |
flat_expert_indices = expert_indices.view(-1)
|
| 222 |
|
| 223 |
x = x.repeat_interleave(self.num_experts_per_token, dim=0)
|
| 224 |
-
|
| 225 |
for i, expert in enumerate(self.experts):
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
return
|
| 229 |
|
| 230 |
|
| 231 |
# Copied from transformers.models.llama.modeling_llama.repeat_kv
|
|
|
|
| 215 |
orig_shape = x.shape
|
| 216 |
x = x.view(-1, x.shape[-1])
|
| 217 |
|
| 218 |
+
scores = self.gate(x).softmax(dim=-1)
|
| 219 |
expert_weights, expert_indices = torch.topk(scores, self.num_experts_per_token, dim=-1)
|
|
|
|
| 220 |
flat_expert_indices = expert_indices.view(-1)
|
| 221 |
|
| 222 |
x = x.repeat_interleave(self.num_experts_per_token, dim=0)
|
| 223 |
+
y = torch.empty_like(x)
|
| 224 |
for i, expert in enumerate(self.experts):
|
| 225 |
+
y[flat_expert_indices == i] = expert(x[flat_expert_indices == i])
|
| 226 |
+
y = (y.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
|
| 227 |
+
return y.view(*orig_shape)
|
| 228 |
|
| 229 |
|
| 230 |
# Copied from transformers.models.llama.modeling_llama.repeat_kv
|