KitsuVp
/

NeoLLM

@@ -631,7 +631,6 @@ class NeoLLMMLP(nn.Module):
         hidden = self.dropout(hidden)
         return self.down_proj(hidden)
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections.
@@ -677,8 +676,9 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> torch.FloatTensor:
         # ============================================================
         # Attention Block with standard residual connection
         # ============================================================
@@ -691,7 +691,8 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         hidden_states = self.lns_attn(hidden_states)
         # Self Attention with ResFormer feature residual connections and learnable multipliers
-        hidden_states, _, self.current_layer_fan = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             position_embeddings=position_embeddings,
@@ -723,7 +724,11 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply GPAS after MLP residual connection
         hidden_states = self.gpas_mlp(hidden_states)
-        return hidden_states
 class NeoLLMPreTrainedModel(PreTrainedModel):
@@ -788,6 +793,7 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
             # scale adaptations from data without initial bias
             if hasattr(module, 'multiplier'):
                 module.multiplier.data.fill_(1.0)
 class NeoLLMModel(NeoLLMPreTrainedModel):
     """
     NeoLLM base model with transformer decoder architecture.
@@ -842,6 +848,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         position_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
@@ -849,6 +856,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             output_hidden_states if output_hidden_states is not None
             else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -875,6 +886,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         hidden_states = inputs_embeds
         all_hidden_states = () if output_hidden_states else None
         # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
@@ -886,14 +898,20 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
-            hidden_states = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
                 attention_mask=causal_mask,
                 first_layer_fan=self.first_layer_fan,  # Pass H_fan_1 to all layers
                 **kwargs,
             )
             # ResFormer: capture H_fan_1 from the first layer
             if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
                 self.first_layer_fan = decoder_layer.current_layer_fan
@@ -905,12 +923,13 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             all_hidden_states = all_hidden_states + (hidden_states,)
         if not return_dict:
-            return tuple(v for v in [hidden_states, None, all_hidden_states] if v is not None)
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=None,
             hidden_states=all_hidden_states,
         )

         hidden = self.dropout(hidden)
         return self.down_proj(hidden)
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections.
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
         **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
         # ============================================================
         # Attention Block with standard residual connection
         # ============================================================
         hidden_states = self.lns_attn(hidden_states)
         # Self Attention with ResFormer feature residual connections and learnable multipliers
+        # Keep attn_weights so the layer can return them when output_attentions is set
+        hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             position_embeddings=position_embeddings,
         # Apply GPAS after MLP residual connection
         hidden_states = self.gpas_mlp(hidden_states)
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
 class NeoLLMPreTrainedModel(PreTrainedModel):
             # scale adaptations from data without initial bias
             if hasattr(module, 'multiplier'):
                 module.multiplier.data.fill_(1.0)
 class NeoLLMModel(NeoLLMPreTrainedModel):
     """
     NeoLLM base model with transformer decoder architecture.
         position_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
             output_hidden_states if output_hidden_states is not None
             else self.config.output_hidden_states
         )
+        output_attentions = (
+            output_attentions if output_attentions is not None
+            else self.config.output_attentions
+        )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if (input_ids is None) ^ (inputs_embeds is not None):
         hidden_states = inputs_embeds
         all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
         # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
+            layer_outputs = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
                 attention_mask=causal_mask,
                 first_layer_fan=self.first_layer_fan,  # Pass H_fan_1 to all layers
+                output_attentions=output_attentions,
                 **kwargs,
             )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
             # ResFormer: capture H_fan_1 from the first layer
             if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
                 self.first_layer_fan = decoder_layer.current_layer_fan
             all_hidden_states = all_hidden_states + (hidden_states,)
         if not return_dict:
+            return tuple(v for v in [hidden_states, None, all_hidden_states, all_attentions] if v is not None)
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=None,
             hidden_states=all_hidden_states,
+            attentions=all_attentions,
         )