modified MLP

modeling_edgellm.py  (+6 -936)
@@ -366,8 +366,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    return q_embed, k_embed


-
-# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Edgellm
class EdgellmMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -375,15 +373,14 @@ class EdgellmMLP(nn.Module):
        self.intermediate_size = config.intermediate_size
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-
-        return torch.pow(F.relu(x), 2)
-        self.act_fn = squared_relu
+        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
-
-
-
-
+        h = self.up_proj(hidden_state)
+        h = self.act_fn(h)
+        h = self.down_proj(h)
+        return h
+

# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
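For reference, the activation dropped in the hunk above is a squared ReLU: the deleted lines assign `self.act_fn = squared_relu`, whose body is `return torch.pow(F.relu(x), 2)` (the enclosing `def squared_relu(x):` line and the old `forward` body did not survive in this rendering of the diff). A minimal sketch of what that activation computed, reconstructed from the deleted lines:

import torch
import torch.nn.functional as F

def squared_relu(x: torch.Tensor) -> torch.Tensor:
    # ReLU followed by an element-wise square, matching the deleted
    # `return torch.pow(F.relu(x), 2)` line.
    return torch.pow(F.relu(x), 2)

x = torch.tensor([-2.0, -0.5, 0.0, 1.5, 3.0])
print(squared_relu(x))  # tensor([0.0000, 0.0000, 0.0000, 2.2500, 9.0000])

After this commit the activation is no longer hard-coded: it is looked up from `ACT2FN` by the `config.hidden_act` string, so the non-linearity is chosen by the model config.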
@@ -398,418 +395,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


-# class EdgellmAttention(nn.Module):
-#     """
-#     Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
-#     and "Generating Long Sequences with Sparse Transformers".
-#     """
-#
-#     def __init__(self, config: EdgellmConfig, layer_idx: Optional[int] = None):
-#         ...
-#
-#     def forward(self, hidden_states, attention_mask=None, position_ids=None, past_key_value=None,
-#                 output_attentions=False, use_cache=False, cache_position=None):
-#         ...
-
-
-# class EdgellmFlashAttention2(EdgellmAttention):
-#     """
-#     Edgellm flash attention module, following Edgellm attention module. This module inherits from `EdgellmAttention`
-#     as the weights of the module stays untouched. The only required change would be on the forward pass
-#     where it needs to correctly call the public API of flash attention and deal with padding tokens
-#     in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
-#     config.max_window_layers layers.
-#     """
-#
-#     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
-#     def __init__(self, *args, **kwargs):
-#         ...
-#
-#     def forward(self, hidden_states, attention_mask=None, position_ids=None, past_key_value=None,
-#                 output_attentions=False, use_cache=False, cache_position=None):
-#         ...
-#
-#     def _flash_attention_forward(self, query_states, key_states, value_states, attention_mask,
-#                                  query_length, dropout=0.0, softmax_scale=None, use_sliding_windows=False):
-#         ...
-#
-#     # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
-#     def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
-#         ...
-
# Copied from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py
# DeepseekV2Attention with DeepseekV2->Edgellm

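The function named in the hunk header above, `repeat_kv` (copied from `transformers.models.llama.modeling_llama.repeat_kv`), expands grouped key/value heads so they match the number of query heads before the attention product. A small, self-contained shape check of the reshape shown in the context line (tensor values are arbitrary, sizes are illustrative):

import torch

batch, num_key_value_heads, slen, head_dim, n_rep = 1, 2, 5, 8, 3
kv = torch.randn(batch, num_key_value_heads, slen, head_dim)
expanded = kv[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
print(expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim).shape)
# torch.Size([1, 6, 5, 8])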
@@ -1036,522 +621,7 @@ class EdgellmAttention(nn.Module):
        attn_weights = None

        return attn_output, attn_weights, past_key_value
-# class EdgellmAttention(nn.Module):
-#     """Multi-headed attention from 'Attention Is All You Need' paper"""
-#
-#     def __init__(self, config: EdgellmConfig, layer_idx: Optional[int] = None):
-#         ...
-#
-#     def _init_rope(self):
-#         ...
-#
-#     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-#         ...
-#
-#     def forward(self, hidden_states, attention_mask=None, position_ids=None, past_key_value=None,
-#                 output_attentions=False, use_cache=False, **kwargs):
-#         ...
-
-# Copied from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py
-# DeepseekV2Attention with DeepseekV2->Edgellm
-# class EdgellmFlashAttention2(EdgellmAttention):
-#     """
-#     Edgellm flash attention module. This module inherits from `EdgellmAttention` as the weights of the module stays
-#     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-#     flash attention and deal with padding tokens in case the input contains any of them.
-#     """
-#
-#     def __init__(self, *args, **kwargs):
-#         ...
-#
-#     def forward(self, hidden_states, attention_mask=None, position_ids=None, past_key_value=None,
-#                 output_attentions=False, use_cache=False, **kwargs):
-#         ...
-#
-#     def _flash_attention_forward(self, query_states, key_states, value_states, attention_mask,
-#                                  query_length, dropout=0.0, softmax_scale=None):
-#         ...
-#
-#     def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
-#         ...

+
class EdgellmFlashAttention2(EdgellmAttention):
    """
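Putting the MLP hunk together, the class reads as follows after this commit. This is a reconstruction for illustration only: the `self.hidden_size = config.hidden_size` line and the `ACT2FN` import live in unchanged parts of the file and are assumed here, and the config object in the usage example is made up.

import torch
import torch.nn as nn
from transformers.activations import ACT2FN  # assumed import path

class EdgellmMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size              # assumed from the unchanged part of __init__
        self.intermediate_size = config.intermediate_size
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]            # activation now selected by the config

    def forward(self, hidden_state):
        # Plain up-projection, non-linearity, down-projection; there is no gate projection.
        h = self.up_proj(hidden_state)
        h = self.act_fn(h)
        h = self.down_proj(h)
        return h

# Illustrative usage with a made-up config.
from types import SimpleNamespace
cfg = SimpleNamespace(hidden_size=64, intermediate_size=256, hidden_act="silu")
mlp = EdgellmMLP(cfg)
print(mlp(torch.randn(2, 10, 64)).shape)  # torch.Size([2, 10, 64])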