Tiiny
/

TurboSparse-Mixtral

Feature Extraction

turbosparsemixtral

Model card · Files and versions

syx committed on May 31, 2024

Commit

8f55951

·

1 Parent(s): d7c6bda

minor

Files changed (2) hide show

config.json +1 -1
modeling_supersparsemixtral.py +1 -1

config.json CHANGED Viewed

@@ -15,7 +15,7 @@
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
-  "model_type": "mixtral",
   "num_attention_heads": 32,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 32,

   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
+  "model_type": "supersparsemixtral",
   "num_attention_heads": 32,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 32,

modeling_supersparsemixtral.py CHANGED Viewed

@@ -1280,7 +1280,7 @@ class SuperSparseMixtralBlockSparseTop2MLP(nn.Module):
     def forward(self, hidden_states):
         mask = self.predictor(hidden_states)
-        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
         hard_mask = torch.round(mask)
         mask = mask + (hard_mask - mask).detach()
         current_hidden_states = torch.mul(current_hidden_states, mask)

     def forward(self, hidden_states):
         mask = self.predictor(hidden_states)
+        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.act_fn(self.w3(hidden_states))
         hard_mask = torch.round(mask)
         mask = mask + (hard_mask - mask).detach()
         current_hidden_states = torch.mul(current_hidden_states, mask)