Crystalcareai
/

GemMoE-Base-Random

@@ -670,16 +670,11 @@ class GemmoeBlockSparseTop2MLP(nn.Module):
         self.act_fn = approx_gelu
     def forward(self, hidden_states):
         current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
-        current_hidden_states = self.w2(current_hidden_states)
         return current_hidden_states
-class GemmoeBlockSparseTop2MLP(GemmoeBlockSparseTop2MLP):
-    def __init__(self, *args, **kwargs):
-        logger.warning_once(
-            "GemmoeBLockSparseTop2MLP is deprecated by GemmoeBlockSparseTop2MLP and will be removed in v4.40."
-        )
-        super().__init__(*args, **kwargs)
 class GemmoeSparseMoeBlock(nn.Module):
     def __init__(self, config):
@@ -699,8 +694,9 @@ class GemmoeSparseMoeBlock(nn.Module):
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (batch * sequence_length, n_experts)
-        router_logits = self.gate(hidden_states)
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
         topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
         topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
@@ -715,7 +711,7 @@ class GemmoeSparseMoeBlock(nn.Module):
         for i in range(self.num_experts):
             expert = self.experts[i]
             expert_output = expert(hidden_states[flat_topk_idx == i])
-            y[flat_topk_idx == i] = expert_output.to(y.dtype)  # Cast expert_output to the same dtype as y
         y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
@@ -983,7 +979,6 @@ class GemmoeModel(GemmoePreTrainedModel):
         self.embed_tokens = value
     @add_start_docstrings_to_model_forward(GEMMOE_INPUTS_DOCSTRING)
-    # Ignore copy
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -994,7 +989,7 @@ class GemmoeModel(GemmoePreTrainedModel):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        output_router_logits: Optional[bool] = None,  # Add this line
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple, MoeModelOutputWithPast]:
@@ -1023,7 +1018,6 @@ class GemmoeModel(GemmoePreTrainedModel):
         # Fix for precision issue when casting to bfloat16
         hidden_size_sqrt = math.sqrt(self.config.hidden_size)
         if inputs_embeds.dtype == torch.bfloat16:
             pass
         hidden_states = inputs_embeds * hidden_size_sqrt
@@ -1110,10 +1104,6 @@ class GemmoeModel(GemmoePreTrainedModel):
             attentions=all_self_attns,
         )
-    # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-    # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-    # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-    # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
     def _update_causal_mask(self, attention_mask, input_tensor):
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
@@ -1135,7 +1125,7 @@ class GemmoeModel(GemmoePreTrainedModel):
         causal_mask = self.causal_mask[None, None, :, :].to(dtype=dtype, device=device) * min_dtype
         causal_mask = causal_mask.expand(batch_size, 1, -1, -1)
         if attention_mask is not None:
-            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             if attention_mask.dim() == 2:
                 mask_length = attention_mask.shape[-1]
                 padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)

         self.act_fn = approx_gelu
     def forward(self, hidden_states):
         """Apply the gated expert MLP: ``w2(act_fn(w1(x)) * w3(x))``.

         The input is upcast to float32 before the ``w1``/``w3`` projections,
         and the gated product is cast back to the caller's dtype before it is
         fed to the ``w2`` projection.

         Args:
             hidden_states: Tensor of activations handed to this expert.

         Returns:
             The tensor produced by ``self.w2``.
         """
         # Capture the incoming dtype *before* rebinding `hidden_states`.
         # Reading `hidden_states.dtype` after the upcast always yields
         # float32, which would make the "cast back" below a silent no-op.
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)  # compute the gate in float32
         current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
         # Return to the caller's dtype for the output projection.
         current_hidden_states = self.w2(current_hidden_states.to(input_dtype))
         return current_hidden_states
 class GemmoeSparseMoeBlock(nn.Module):
     def __init__(self, config):
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (batch * sequence_length, n_experts)
+        hidden_states_float = hidden_states.float()  # Cast to float32
+        router_logits = self.gate(hidden_states_float)
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
         topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
         topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
         for i in range(self.num_experts):
             expert = self.experts[i]
             expert_output = expert(hidden_states[flat_topk_idx == i])
+            y[flat_topk_idx == i] = expert_output
         y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
         self.embed_tokens = value
     @add_start_docstrings_to_model_forward(GEMMOE_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple, MoeModelOutputWithPast]:
         # Fix for precision issue when casting to bfloat16
         hidden_size_sqrt = math.sqrt(self.config.hidden_size)
         if inputs_embeds.dtype == torch.bfloat16:
             pass
         hidden_states = inputs_embeds * hidden_size_sqrt
             attentions=all_self_attns,
         )
     def _update_causal_mask(self, attention_mask, input_tensor):
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
         causal_mask = self.causal_mask[None, None, :, :].to(dtype=dtype, device=device) * min_dtype
         causal_mask = causal_mask.expand(batch_size, 1, -1, -1)
         if attention_mask is not None:
+            causal_mask = causal_mask.clone()
             if attention_mask.dim() == 2:
                 mask_length = attention_mask.shape[-1]
                 padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)