Commit: Update modeling_neollm.py
Changed file: modeling_neollm.py (+60 lines added, −53 lines removed)
|
@@ -12,7 +12,6 @@ import torch
|
|
| 12 |
import torch.nn.functional as F
|
| 13 |
from torch import nn
|
| 14 |
from cut_cross_entropy import linear_cross_entropy
|
| 15 |
-
from .configuration_neollm import NeoLLMConfig
|
| 16 |
|
| 17 |
from transformers.activations import ACT2FN
|
| 18 |
from transformers.generation import GenerationMixin
|
|
@@ -29,8 +28,8 @@ from transformers.utils.import_utils import (
|
|
| 29 |
is_causal_conv1d_available,
|
| 30 |
is_flash_linear_attention_available,
|
| 31 |
)
|
|
|
|
| 32 |
|
| 33 |
-
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
|
| 34 |
|
| 35 |
if is_causal_conv1d_available():
|
| 36 |
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
|
|
@@ -43,6 +42,7 @@ if is_flash_linear_attention_available():
|
|
| 43 |
else:
|
| 44 |
chunk_gated_delta_rule, fused_recurrent_gated_delta_rule = None, None
|
| 45 |
FusedRMSNormGated = None
|
|
|
|
| 46 |
|
| 47 |
logger = logging.get_logger(__name__)
|
| 48 |
|
|
@@ -737,25 +737,28 @@ class PolyNorm(torch.nn.Module):
|
|
| 737 |
|
| 738 |
def forward(self, x):
|
| 739 |
return self.weight[0] * self._norm(x**3) + self.weight[1] * self._norm(x**2) + self.weight[2] * self._norm(x) + self.bias
|
| 740 |
-
|
| 741 |
class NeoLLMMLP(nn.Module):
|
| 742 |
def __init__(self, config):
|
| 743 |
super().__init__()
|
| 744 |
self.config = config
|
| 745 |
self.hidden_size = config.hidden_size
|
| 746 |
self.intermediate_size = config.intermediate_size
|
| 747 |
-
|
| 748 |
-
|
|
|
|
|
|
|
|
|
|
| 749 |
self.act_fn = PolyNorm()
|
| 750 |
|
| 751 |
# Dropout for MLP hidden layer
|
| 752 |
self.dropout = nn.Dropout(config.dropout_rate)
|
| 753 |
|
| 754 |
def forward(self, x):
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
|
|
|
| 759 |
|
| 760 |
class NeoLLMDecoderLayer(GradientCheckpointingLayer):
|
| 761 |
def __init__(self, config: NeoLLMConfig, layer_idx: int):
|
|
@@ -931,46 +934,44 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 931 |
if attention_mask is not None and torch.all(attention_mask == 1):
|
| 932 |
linear_attn_mask = None
|
| 933 |
return linear_attn_mask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
|
| 935 |
class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
| 936 |
_tied_weights_keys = ["lm_head.weight"]
|
| 937 |
-
|
| 938 |
def __init__(self, config):
|
| 939 |
super().__init__(config)
|
| 940 |
self.model = NeoLLMModel(config)
|
| 941 |
self.vocab_size = config.vocab_size
|
| 942 |
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 943 |
-
|
| 944 |
-
# Initialize weights and apply final processing
|
| 945 |
self.post_init()
|
| 946 |
|
| 947 |
-
@torch.compiler.disable
|
| 948 |
-
def _compute_cce_loss(self, hidden_states, labels):
|
| 949 |
-
"""
|
| 950 |
-
CCE loss computation excluded from compilation.
|
| 951 |
-
Preprocesses labels to eliminate torch.compile warnings.
|
| 952 |
-
"""
|
| 953 |
-
# Ensure labels are on the correct device
|
| 954 |
-
processed_labels = labels.to(hidden_states.device)
|
| 955 |
-
|
| 956 |
-
# Handle pad tokens: convert pad_token_id to -100 for proper masking
|
| 957 |
-
if self.config.pad_token_id is not None:
|
| 958 |
-
processed_labels = torch.where(
|
| 959 |
-
processed_labels == self.config.pad_token_id,
|
| 960 |
-
torch.tensor(-100, dtype=processed_labels.dtype, device=processed_labels.device),
|
| 961 |
-
processed_labels
|
| 962 |
-
)
|
| 963 |
-
|
| 964 |
-
return linear_cross_entropy(
|
| 965 |
-
hidden_states,
|
| 966 |
-
self.lm_head.weight,
|
| 967 |
-
processed_labels, # Use preprocessed labels
|
| 968 |
-
bias=getattr(self.lm_head, 'bias', None),
|
| 969 |
-
shift=1,
|
| 970 |
-
impl="cce",
|
| 971 |
-
reduction="mean"
|
| 972 |
-
)
|
| 973 |
-
|
| 974 |
def forward(
|
| 975 |
self,
|
| 976 |
input_ids: Optional[torch.LongTensor] = None,
|
|
@@ -981,14 +982,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 981 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 982 |
**kwargs: Unpack[TransformersKwargs],
|
| 983 |
) -> CausalLMOutputWithPast:
|
| 984 |
-
r"""
|
| 985 |
-
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 986 |
-
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 987 |
-
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
| 988 |
-
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
| 989 |
-
"""
|
| 990 |
-
|
| 991 |
-
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
| 992 |
outputs: BaseModelOutputWithPast = self.model(
|
| 993 |
input_ids=input_ids,
|
| 994 |
attention_mask=attention_mask,
|
|
@@ -996,19 +989,25 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 996 |
inputs_embeds=inputs_embeds,
|
| 997 |
**kwargs,
|
| 998 |
)
|
| 999 |
-
|
| 1000 |
hidden_states = outputs.last_hidden_state
|
| 1001 |
-
|
| 1002 |
# CCE Loss computation for training
|
| 1003 |
if labels is not None:
|
| 1004 |
-
loss =
|
| 1005 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1006 |
else:
|
| 1007 |
# Inference mode - compute logits normally
|
| 1008 |
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
| 1009 |
logits = self.lm_head(hidden_states[:, slice_indices, :])
|
| 1010 |
loss = None
|
| 1011 |
-
|
| 1012 |
return CausalLMOutputWithPast(
|
| 1013 |
loss=loss,
|
| 1014 |
logits=logits,
|
|
@@ -1016,9 +1015,17 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1016 |
hidden_states=outputs.hidden_states,
|
| 1017 |
attentions=outputs.attentions,
|
| 1018 |
)
|
| 1019 |
-
|
| 1020 |
# ==================== AUTOMODEL REGISTRATION ====================
|
| 1021 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
# Register the configuration and model for AutoClass support
|
| 1023 |
AutoConfig.register("neollm", NeoLLMConfig)
|
| 1024 |
AutoModel.register(NeoLLMConfig, NeoLLMModel)
|
|
|
|
| 12 |
import torch.nn.functional as F
|
| 13 |
from torch import nn
|
| 14 |
from cut_cross_entropy import linear_cross_entropy
|
|
|
|
| 15 |
|
| 16 |
from transformers.activations import ACT2FN
|
| 17 |
from transformers.generation import GenerationMixin
|
|
|
|
| 28 |
is_causal_conv1d_available,
|
| 29 |
is_flash_linear_attention_available,
|
| 30 |
)
|
| 31 |
+
from .configuration_neollm import NeoLLMConfig
|
| 32 |
|
|
|
|
| 33 |
|
| 34 |
if is_causal_conv1d_available():
|
| 35 |
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
|
|
|
|
| 42 |
else:
|
| 43 |
chunk_gated_delta_rule, fused_recurrent_gated_delta_rule = None, None
|
| 44 |
FusedRMSNormGated = None
|
| 45 |
+
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
|
| 46 |
|
| 47 |
logger = logging.get_logger(__name__)
|
| 48 |
|
|
|
|
| 737 |
|
| 738 |
def forward(self, x):
|
| 739 |
return self.weight[0] * self._norm(x**3) + self.weight[1] * self._norm(x**2) + self.weight[2] * self._norm(x) + self.bias
|
|
|
|
| 740 |
class NeoLLMMLP(nn.Module):
    """Gated feed-forward block (SwiGLU-style, as in Motif) with a PolyNorm gate.

    The hidden state is projected through two parallel bias-free branches
    (gate and up), the activated gate is multiplied elementwise with the up
    branch, dropout is applied to the gated activations, and the result is
    projected back down to the model width.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Gated (SwiGLU-like) projections, without bias as in the original model.
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        # PolyNorm plays the role the gate activation (e.g. SiLU) usually does.
        self.act_fn = PolyNorm()

        # Dropout on the gated intermediate activations.
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        """Apply the gated MLP: down_proj(dropout(act_fn(gate_proj(x)) * up_proj(x)))."""
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(self.dropout(gated))
|
| 762 |
|
| 763 |
class NeoLLMDecoderLayer(GradientCheckpointingLayer):
|
| 764 |
def __init__(self, config: NeoLLMConfig, layer_idx: int):
|
|
|
|
| 934 |
if attention_mask is not None and torch.all(attention_mask == 1):
|
| 935 |
linear_attn_mask = None
|
| 936 |
return linear_attn_mask
|
| 937 |
+
@torch.compiler.disable
def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
    """Compute the Cut Cross-Entropy (CCE) loss, excluded from torch.compile.

    Labels are first moved to the device of ``hidden_states``, and any
    positions equal to ``pad_token_id`` are remapped to the ignore index
    (-100) so the loss skips padding; doing this up front also eliminates
    torch.compile warnings about label preprocessing.

    Args:
        hidden_states: final hidden states fed to the LM head.
        labels: integer target token ids.
        lm_head_weight: weight matrix of the LM head.
        lm_head_bias: optional LM-head bias passed through to the loss.
        pad_token_id: optional pad id to mask out of the targets.

    Returns:
        Mean next-token cross-entropy loss (scalar tensor).
    """
    targets = labels.to(hidden_states.device)

    # Replace padding positions with the ignore index before computing the loss.
    if pad_token_id is not None:
        targets = targets.masked_fill(targets == pad_token_id, -100)

    # shift=1 -> next-token prediction; impl="cce" selects the fused
    # linear + cross-entropy kernel so full logits are never materialized.
    return linear_cross_entropy(
        hidden_states,
        lm_head_weight,
        targets,
        bias=lm_head_bias,
        shift=1,
        impl="cce",
        reduction="mean",
    )
|
| 963 |
+
|
| 964 |
|
| 965 |
class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
| 966 |
_tied_weights_keys = ["lm_head.weight"]
|
| 967 |
+
|
| 968 |
def __init__(self, config):
    """Build the causal-LM model: decoder backbone plus a bias-free LM head.

    NOTE(review): ``lm_head.weight`` appears in ``_tied_weights_keys``, so
    ``post_init`` may tie it to the input embeddings depending on the config
    — confirm the tying behavior against the transformers base class.
    """
    super().__init__(config)
    self.vocab_size = config.vocab_size
    # Decoder backbone producing the final hidden states.
    self.model = NeoLLMModel(config)
    # Projection from hidden size to vocabulary logits (no bias).
    self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
    # Weight initialization and any weight tying are handled here.
    self.post_init()
|
| 974 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 975 |
def forward(
|
| 976 |
self,
|
| 977 |
input_ids: Optional[torch.LongTensor] = None,
|
|
|
|
| 982 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 983 |
**kwargs: Unpack[TransformersKwargs],
|
| 984 |
) -> CausalLMOutputWithPast:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 985 |
outputs: BaseModelOutputWithPast = self.model(
|
| 986 |
input_ids=input_ids,
|
| 987 |
attention_mask=attention_mask,
|
|
|
|
| 989 |
inputs_embeds=inputs_embeds,
|
| 990 |
**kwargs,
|
| 991 |
)
|
| 992 |
+
|
| 993 |
hidden_states = outputs.last_hidden_state
|
| 994 |
+
|
| 995 |
# CCE Loss computation for training
|
| 996 |
if labels is not None:
|
| 997 |
+
loss = compute_cce_loss(
|
| 998 |
+
hidden_states,
|
| 999 |
+
labels,
|
| 1000 |
+
self.lm_head.weight,
|
| 1001 |
+
getattr(self.lm_head, 'bias', None),
|
| 1002 |
+
self.config.pad_token_id
|
| 1003 |
+
)
|
| 1004 |
+
logits = None
|
| 1005 |
else:
|
| 1006 |
# Inference mode - compute logits normally
|
| 1007 |
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
| 1008 |
logits = self.lm_head(hidden_states[:, slice_indices, :])
|
| 1009 |
loss = None
|
| 1010 |
+
|
| 1011 |
return CausalLMOutputWithPast(
|
| 1012 |
loss=loss,
|
| 1013 |
logits=logits,
|
|
|
|
| 1015 |
hidden_states=outputs.hidden_states,
|
| 1016 |
attentions=outputs.attentions,
|
| 1017 |
)
|
|
|
|
| 1018 |
# ==================== AUTOMODEL REGISTRATION ====================

# Explicit public API of this module.
# NOTE(review): "FANLayer" is exported here but its definition is not visible
# in this chunk — confirm it exists elsewhere in the file.
__all__ = [
    "NeoLLMForCausalLM",
    "NeoLLMModel",
    "NeoLLMPreTrainedModel",
    "NeoLLMConfig",
    "FANLayer",
]


# Register the config and model under the "neollm" model type so
# AutoConfig / AutoModel can resolve this architecture by name.
AutoConfig.register("neollm", NeoLLMConfig)
AutoModel.register(NeoLLMConfig, NeoLLMModel)