Update modeling_Llamoe.py
modeling_Llamoe.py  (+133 -162)  CHANGED
@@ -33,10 +33,9 @@ from transformers.utils import (
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
-from .

from math import sqrt as math_sqrt
-_CONFIG_FOR_DOC = "LlamoeConfig"


if is_flash_attn_2_available():

@@ -53,8 +52,10 @@ if is_torch_fx_available():

_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)

-
-

def load_balancing_loss_func(
    gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
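The body of load_balancing_loss_func is not shown in this hunk. For orientation only, here is a simplified, hypothetical sketch of the Mixtral-style auxiliary loss that this signature matches (single layer of gate logits, attention_mask ignored); the real helper in the file may differ in details:

import torch
import torch.nn.functional as F

def load_balancing_loss_sketch(gate_logits: torch.Tensor, num_experts: int, top_k: int = 2) -> torch.Tensor:
    # Fraction of tokens routed to each expert (per top-k slot) times the mean
    # router probability of that expert, summed and scaled by num_experts.
    routing_weights = F.softmax(gate_logits, dim=-1)                  # (tokens, num_experts)
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)  # (tokens, top_k)
    expert_mask = F.one_hot(selected_experts, num_experts).float()    # (tokens, top_k, num_experts)
    tokens_per_expert = expert_mask.mean(dim=0)                       # (top_k, num_experts)
    router_prob_per_expert = routing_weights.mean(dim=0)              # (num_experts,)
    return (tokens_per_expert * router_prob_per_expert.unsqueeze(0)).sum() * num_experts

print(load_balancing_loss_sketch(torch.randn(10, 4), num_experts=4))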
@@ -130,36 +131,40 @@ def load_balancing_loss_func(



def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )

class LlamoeRMSNorm(nn.Module):
-    def __init__(self,
-        """
-        LlamaRMSNorm is equivalent to T5LayerNorm
-        """
        super().__init__()
-        self.
-        self.
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)


-


class LlamoeRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
@@ -205,63 +210,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

-
-
-class LlamoeBlockSparseTop2MLP(nn.Module):
-    def __init__(self, config: LlamoeConfig):
-        super().__init__()
-        self.ffn_dim = config.intermediate_size
-        self.hidden_dim = config.hidden_size
-
-        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
-        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
-        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
-
-        self.act_fn = approx_gelu
-
-    def forward(self, hidden_states):
-        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
-        current_hidden_states = self.w2(current_hidden_states)
-        return current_hidden_states.to(hidden_states.dtype)
-
-class LlamoeSparseMoeBlock(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.hidden_dim = config.hidden_size
-        self.ffn_dim = config.intermediate_size
-        self.num_experts = config.num_local_experts
-        self.top_k = 2
-
-        # gating
-        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
-
-        self.experts = nn.ModuleList([LlamoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
-
-    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        batch_size, sequence_length, hidden_dim = hidden_states.shape
-        hidden_states = hidden_states.view(-1, hidden_dim)
-
-        # router_logits: (batch * sequence_length, n_experts)
-        router_logits = self.gate(hidden_states)
-        routing_weights = F.softmax(router_logits, dim=1)
-        topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
-        topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
-
-        hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
-
-        y = torch.empty_like(hidden_states)
-
-        flat_topk_idx = topk_idx.view(-1)
-        for i in range(self.num_experts):
-            expert = self.experts[i]
-            expert_output = expert(hidden_states[flat_topk_idx == i])
-            y[flat_topk_idx == i] = expert_output
-
-        y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
-
-        final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
-        return final_hidden_states.to(hidden_states.dtype), router_logits.to(hidden_states.dtype)
-
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
@@ -273,10 +223,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

-
class LlamoeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: LlamoeConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config

@@ -291,14 +241,14 @@ class LlamoeAttention(nn.Module):
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
-        self.head_dim =
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

-        if
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."

@@ -307,19 +257,12 @@
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
-        self.o_proj = nn.Linear(self.
-        self.
-
-
-
-
-                self.head_dim,
-                max_position_embeddings=self.max_position_embeddings,
-                base=self.rope_theta,
-            )
-
-        else:
-            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def forward(
        self,
@@ -334,35 +277,17 @@
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

-
-
-
-                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
-            )
-            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
-            query_states = torch.cat(query_states, dim=-1)
-
-            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
-            key_states = torch.cat(key_states, dim=-1)
-
-            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
-            value_states = torch.cat(value_states, dim=-1)
-
-        else:
-            query_states = self.q_proj(hidden_states)
-            key_states = self.k_proj(hidden_states)
-            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        past_key_value = getattr(self, "past_key_value", past_key_value)
-        cos, sin = self.rotary_emb(value_states, position_ids)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache

@@ -375,9 +300,10 @@
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
-            causal_mask = attention_mask
            if cache_position is not None:
                causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32

@@ -393,14 +319,8 @@

        attn_output = attn_output.transpose(1, 2).contiguous()

-        attn_output = attn_output.
-
-        if self.config.pretraining_tp > 1:
-            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
-            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
-            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
-        else:
-            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None
@@ -408,9 +328,10 @@
        return attn_output, attn_weights, past_key_value


-
    """
-
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

@@ -423,6 +344,7 @@ class LlamoeFlashAttention2(LlamoeAttention):
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -449,8 +371,8 @@
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

-        cos, sin = self.rotary_emb(value_states, position_ids)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        past_key_value = getattr(self, "past_key_value", past_key_value)


@@ -471,7 +393,7 @@
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:

@@ -497,7 +419,7 @@
            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
        )

-        attn_output = attn_output.reshape(bsz, q_len,
        attn_output = self.o_proj(attn_output)

        if not output_attentions:

@@ -511,7 +433,6 @@
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
-
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API

@@ -530,7 +451,7 @@
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
@@ -603,6 +524,7 @@
        )


class LlamoeSdpaAttention(LlamoeAttention):
    """
    Gemmoe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from

@@ -624,7 +546,7 @@
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
-                "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(

@@ -660,9 +582,9 @@
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

-
-
-

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.

@@ -671,11 +593,6 @@
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

-        print("query:",query_states.shape)
-        print("keys:",key_states.shape)
-        print("values:",value_states.shape)
-        print("causal_mask:",causal_mask.shape)
-
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
@@ -690,23 +607,80 @@
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
-

LLAMOE_ATTENTION_CLASSES = {
    "eager": LlamoeAttention,
    "flash_attention_2": LlamoeFlashAttention2,
-    "sdpa": LlamoeSdpaAttention
}


class LlamoeDecoderLayer(nn.Module):
    def __init__(self, config: LlamoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

-        self.self_attn =

-        self.block_sparse_moe =
        self.input_layernorm = LlamoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

@@ -777,18 +751,15 @@
        return outputs


-
-LLAMOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
-
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
-
    Parameters:
-        config ([`
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.

@@ -796,11 +767,11 @@


@add_start_docstrings(
-    "The bare
-
)

-class
    config_class = LlamoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

@@ -844,7 +815,7 @@ class LlammoePreTrainedModel(PreTrainedModel):
                layer.self_attn.past_key_value = None


-
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide

@@ -908,14 +879,14 @@ LLAMOE_INPUTS_DOCSTRING = r"""

@add_start_docstrings(
    "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
-
)

-class LlamoeModel(
    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`
    Args:
-        config:
    """

    def __init__(self, config: LlamoeConfig):

@@ -945,7 +916,7 @@ class LlamoeModel(LlammoePreTrainedModel):
    def set_input_embeddings(self, value):
        self.embed_tokens = value

-    @add_start_docstrings_to_model_forward(
    def forward(
        self,
        input_ids: torch.LongTensor = None,

@@ -1121,12 +1092,12 @@ class LlamoeModel(LlammoePreTrainedModel):

        return causal_mask

-class LlamoeForCausalLM(
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
-        self.model =
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef

@@ -1153,7 +1124,7 @@ class LlamoeForCausalLM(LlammoePreTrainedModel):
    def get_decoder(self):
        return self.model

-    @add_start_docstrings_to_model_forward(
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # Ignore copy
    def forward(
@@ -33,10 +33,9 @@ from transformers.utils import (
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_torch_fx_available
+from .configuration_gemmoe import LlamoeConfig

from math import sqrt as math_sqrt


if is_flash_attn_2_available():

@@ -53,8 +52,10 @@ if is_torch_fx_available():

_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)

+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "LlamoeConfig"

def load_balancing_loss_func(
    gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None

@@ -130,36 +131,40 @@ def load_balancing_loss_func(



+def approx_gelu(x):
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)))
+
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )

+
+
class LlamoeRMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))

+    def _norm(self, x):
+        x_float = x.float()
+        normed_x = x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + self.eps)
+        return normed_x

+    def forward(self, x):
+        normed_x = self._norm(x)
+        # Downcast the result to the original dtype at the end
+        normed_x = normed_x.type_as(x)
+        return normed_x * (self.weight + 1)

+ALL_LAYERNORM_LAYERS.append(LlamoeRMSNorm)

class LlamoeRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
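The rewritten LlamoeRMSNorm above follows the Gemma convention: the scale parameter is initialized to zeros and applied as (weight + 1), and the normalization itself runs in float32 before being cast back to the input dtype. A minimal standalone sketch of the same computation (the hidden size of 8 is arbitrary):

import torch
import torch.nn as nn

class RMSNormSketch(nn.Module):
    # Mirrors the diff's LlamoeRMSNorm: zero-initialized weight applied as (weight + 1),
    # with the mean-square normalization computed in float32.
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_float = x.float()
        normed = x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + self.eps)
        return normed.type_as(x) * (self.weight + 1)

x = torch.randn(2, 4, 8)
print(RMSNormSketch(8)(x).shape)  # torch.Size([2, 4, 8])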
@@ -205,63 +210,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,

@@ -273,10 +223,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

class LlamoeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

+    # Ignore copy
    def __init__(self, config: LlamoeConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config

@@ -291,14 +241,14 @@ class LlamoeAttention(nn.Module):
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
+        self.head_dim = config.head_dim
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

+        if self.hidden_size % self.num_heads != 0:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
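repeat_kv, shown in the hunk above, is what makes the grouped-query arithmetic line up: each of the num_key_value_heads key/value heads is repeated num_key_value_groups times so it matches the num_attention_heads query heads. A standalone paraphrase with a shape check (the 2-head/8-head sizes are arbitrary):

import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Same effect as torch.repeat_interleave(x, dim=1, repeats=n_rep).
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

kv = torch.randn(1, 2, 5, 16)   # 2 key/value heads
print(repeat_kv(kv, 4).shape)   # torch.Size([1, 8, 5, 16]) -> matches 8 query heads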
@@ -307,19 +257,12 @@
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self.rotary_emb = LlamoeRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )

    def forward(
        self,

@@ -334,35 +277,17 @@
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        past_key_value = getattr(self, "past_key_value", past_key_value)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache

@@ -375,9 +300,10 @@
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            if cache_position is not None:
                causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]]
+            else:
+                causal_mask = attention_mask
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
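The eager path above is the textbook scaled-dot-product computation: scores scaled by 1/sqrt(head_dim), an additive causal mask, and a softmax upcast to float32. A tiny self-contained sketch of that sequence (single head, toy sizes; not the module's actual forward):

import math
import torch

bsz, num_heads, q_len, head_dim = 1, 1, 4, 8
q = torch.randn(bsz, num_heads, q_len, head_dim)
k = torch.randn(bsz, num_heads, q_len, head_dim)
v = torch.randn(bsz, num_heads, q_len, head_dim)

attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_dim)
causal_mask = torch.triu(torch.full((q_len, q_len), float("-inf")), diagonal=1)  # additive mask
attn_weights = attn_weights + causal_mask
attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
attn_output = torch.matmul(attn_weights, v)                       # (bsz, num_heads, q_len, head_dim)
attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
print(attn_output.shape)  # torch.Size([1, 4, 8])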
@@ -393,14 +319,8 @@

        attn_output = attn_output.transpose(1, 2).contiguous()

+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

@@ -408,9 +328,10 @@
        return attn_output, attn_weights, past_key_value


+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Gemmoe
+class LlamoeFlashAttention2(LlamoeAttention):
    """
+    Gemmoe flash attention module. This module inherits from `GemmoeAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

@@ -423,6 +344,7 @@ class LlamoeFlashAttention2(LlamoeAttention):
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

+    # Ignore copy
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -449,8 +371,8 @@
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None)

        past_key_value = getattr(self, "past_key_value", past_key_value)


@@ -471,7 +393,7 @@
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (GemmoeRMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:

@@ -497,7 +419,7 @@
            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
        )

+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:

@@ -511,7 +433,6 @@
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API

@@ -530,7 +451,7 @@
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in GemmoeFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
@@ -603,6 +524,7 @@
        )


+# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Gemmoe
class LlamoeSdpaAttention(LlamoeAttention):
    """
    Gemmoe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from

@@ -624,7 +546,7 @@
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
+                "GemmoeModel is using GemmoeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(

@@ -660,9 +582,9 @@
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

+        causal_mask = attention_mask
+        if attention_mask is not None and cache_position is not None:
+            causal_mask = causal_mask[:, :, cache_position, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.

@@ -671,11 +593,6 @@
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
@@ -690,23 +607,80 @@
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
+

LLAMOE_ATTENTION_CLASSES = {
    "eager": LlamoeAttention,
    "flash_attention_2": LlamoeFlashAttention2,
+    "sdpa": LlamoeSdpaAttention,
}

+class LlamoeBlockSparseTop2MLP(nn.Module):
+    def __init__(self, config: LlamoeConfig):
+        super().__init__()
+        self.ffn_dim = config.intermediate_size
+        self.hidden_dim = config.hidden_size
+
+        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
+        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
+        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
+
+        self.act_fn = approx_gelu
+
+    def forward(self, hidden_states):
+        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
+        current_hidden_states = self.w2(current_hidden_states)
+        return current_hidden_states.to(hidden_states.dtype)
+
+
+class LlamoeSparseMoeBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_local_experts
+        self.top_k = 2
+
+        # gating
+        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+
+        self.experts = nn.ModuleList([LlamoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
+
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1)
+        topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
+        topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
+
+        hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
+
+        y = torch.empty_like(hidden_states)
+
+        flat_topk_idx = topk_idx.view(-1)
+        for i in range(self.num_experts):
+            expert = self.experts[i]
+            expert_output = expert(hidden_states[flat_topk_idx == i])
+            y[flat_topk_idx == i] = expert_output
+
+        y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+
+        final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states.to(hidden_states.dtype), router_logits.to(hidden_states.dtype)

+
+# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with LLAMA->GEMMOE,Llama->Gemmoe
class LlamoeDecoderLayer(nn.Module):
    def __init__(self, config: LlamoeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

+        self.self_attn = LLAMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

+        self.block_sparse_moe = LlamoeSparseMoeBlock(config)
        self.input_layernorm = LlamoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

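The sparse MoE block added above routes every token to its top-2 experts: softmax the gate logits, keep the two largest weights, renormalize them, run each duplicated token through its chosen expert, and sum the two weighted results. A tiny numeric sketch of just the routing and recombination arithmetic (identity "experts", arbitrary sizes):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_experts, top_k = 4, 2
tokens = torch.randn(3, 8)                       # (batch * seq_len, hidden_dim)
router_logits = torch.randn(3, num_experts)      # stand-in for self.gate(tokens)

routing_weights = F.softmax(router_logits, dim=1)
topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)
topk_weight = topk_weight / topk_weight.sum(dim=-1, keepdim=True)   # renormalize the two kept weights

# Each token is duplicated once per selected expert, processed, then recombined.
expanded = tokens.repeat_interleave(top_k, dim=0)                   # (3 * top_k, hidden_dim)
expert_out = expanded                                               # identity experts, for illustration only
combined = (expert_out.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
print(combined.shape)  # torch.Size([3, 8])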
@@ -777,18 +751,15 @@
        return outputs


+Llamoe_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
    Parameters:
+        config ([`GemmoeConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.

@@ -796,11 +767,11 @@


@add_start_docstrings(
+    "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
+    Llamoe_START_DOCSTRING,
)

+class LlamoePreTrainedModel(PreTrainedModel):
    config_class = LlamoeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

@@ -844,7 +815,7 @@ class LlammoePreTrainedModel(PreTrainedModel):
                layer.self_attn.past_key_value = None


+Llamoe_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide

@@ -908,14 +879,14 @@ LLAMOE_INPUTS_DOCSTRING = r"""

@add_start_docstrings(
    "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
+    Llamoe_START_DOCSTRING,
)

+class LlamoeModel(LlamoePreTrainedModel):
    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GemmoeDecoderLayer`]
    Args:
+        config: GemmoeConfig
    """

    def __init__(self, config: LlamoeConfig):

@@ -945,7 +916,7 @@ class LlamoeModel(LlammoePreTrainedModel):
    def set_input_embeddings(self, value):
        self.embed_tokens = value

+    @add_start_docstrings_to_model_forward(Llamoe_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,

@@ -1121,12 +1092,12 @@ class LlamoeModel(LlammoePreTrainedModel):

        return causal_mask

+class LlamoeForCausalLM(LlamoePreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
+        self.model = LlamoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef

@@ -1153,7 +1124,7 @@ class LlamoeForCausalLM(LlammoePreTrainedModel):
    def get_decoder(self):
        return self.model

+    @add_start_docstrings_to_model_forward(Llamoe_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # Ignore copy
    def forward(