Update modeling_neollm.py
Browse files- modeling_neollm.py +65 -7
modeling_neollm.py
CHANGED
|
@@ -37,7 +37,7 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
|
| 37 |
from transformers.processing_utils import Unpack
|
| 38 |
from transformers.utils import TransformersKwargs, logging
|
| 39 |
from transformers.utils.generic import check_model_inputs
|
| 40 |
-
from
|
| 41 |
|
| 42 |
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
|
| 43 |
|
|
@@ -1119,6 +1119,9 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1119 |
output_hidden_states: Optional[bool] = None,
|
| 1120 |
output_attentions: Optional[bool] = None,
|
| 1121 |
return_dict: Optional[bool] = None,
|
|
|
|
|
|
|
|
|
|
| 1122 |
**kwargs: Unpack[TransformersKwargs],
|
| 1123 |
) -> BaseModelOutputWithPast:
|
| 1124 |
output_hidden_states = (
|
|
@@ -1157,9 +1160,12 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1157 |
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
| 1158 |
|
| 1159 |
# ResFormer with first-layer feature propagation
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
-
|
|
|
|
|
|
|
|
|
|
| 1163 |
|
| 1164 |
for decoder_layer in self.layers:
|
| 1165 |
if output_hidden_states:
|
|
@@ -1186,7 +1192,13 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1186 |
stack_mask = layer_outputs[3]
|
| 1187 |
|
| 1188 |
# ResFormer: capture H_fan_1 from the first layer
|
|
|
|
|
|
|
| 1189 |
if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1190 |
self.first_layer_fan = decoder_layer.current_layer_fan
|
| 1191 |
|
| 1192 |
# Apply SeeDNorm for final normalization
|
|
@@ -1194,13 +1206,23 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1194 |
|
| 1195 |
if output_hidden_states:
|
| 1196 |
all_hidden_states = all_hidden_states + (hidden_states,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1197 |
|
| 1198 |
if not return_dict:
|
| 1199 |
-
return tuple(v for v in [hidden_states,
|
| 1200 |
|
| 1201 |
return BaseModelOutputWithPast(
|
| 1202 |
last_hidden_state=hidden_states,
|
| 1203 |
-
past_key_values=
|
| 1204 |
hidden_states=all_hidden_states,
|
| 1205 |
attentions=all_attentions,
|
| 1206 |
)
|
|
@@ -1255,6 +1277,36 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1255 |
|
| 1256 |
self.post_init()
|
| 1257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1258 |
def forward(
|
| 1259 |
self,
|
| 1260 |
input_ids: Optional[torch.LongTensor] = None,
|
|
@@ -1265,6 +1317,9 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1265 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 1266 |
output_hidden_states: Optional[bool] = None,
|
| 1267 |
return_dict: Optional[bool] = None,
|
|
|
|
|
|
|
|
|
|
| 1268 |
**kwargs: Unpack[TransformersKwargs],
|
| 1269 |
) -> CausalLMOutputWithPast:
|
| 1270 |
outputs: BaseModelOutputWithPast = self.model(
|
|
@@ -1274,6 +1329,9 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1274 |
inputs_embeds=inputs_embeds,
|
| 1275 |
output_hidden_states=output_hidden_states,
|
| 1276 |
return_dict=return_dict,
|
|
|
|
|
|
|
|
|
|
| 1277 |
**kwargs,
|
| 1278 |
)
|
| 1279 |
|
|
@@ -1298,7 +1356,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1298 |
return CausalLMOutputWithPast(
|
| 1299 |
loss=loss,
|
| 1300 |
logits=logits,
|
| 1301 |
-
past_key_values=
|
| 1302 |
hidden_states=outputs.hidden_states,
|
| 1303 |
attentions=outputs.attentions,
|
| 1304 |
)
|
|
|
|
| 37 |
from transformers.processing_utils import Unpack
|
| 38 |
from transformers.utils import TransformersKwargs, logging
|
| 39 |
from transformers.utils.generic import check_model_inputs
|
| 40 |
+
from configuration_neollm import NeoLLMConfig
|
| 41 |
|
| 42 |
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
|
| 43 |
|
|
|
|
| 1119 |
output_hidden_states: Optional[bool] = None,
|
| 1120 |
output_attentions: Optional[bool] = None,
|
| 1121 |
return_dict: Optional[bool] = None,
|
| 1122 |
+
past_stack_state: Optional[torch.Tensor] = None,
|
| 1123 |
+
past_stack_mask: Optional[torch.Tensor] = None,
|
| 1124 |
+
past_first_layer_fan: Optional[torch.Tensor] = None,
|
| 1125 |
**kwargs: Unpack[TransformersKwargs],
|
| 1126 |
) -> BaseModelOutputWithPast:
|
| 1127 |
output_hidden_states = (
|
|
|
|
| 1160 |
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
| 1161 |
|
| 1162 |
# ResFormer with first-layer feature propagation
|
| 1163 |
+
# Retrieve persistent ResFormer state if provided (for inference)
|
| 1164 |
+
self.first_layer_fan = past_first_layer_fan
|
| 1165 |
+
|
| 1166 |
+
# Initialize Stack states
|
| 1167 |
+
stack_state = past_stack_state
|
| 1168 |
+
stack_mask = past_stack_mask
|
| 1169 |
|
| 1170 |
for decoder_layer in self.layers:
|
| 1171 |
if output_hidden_states:
|
|
|
|
| 1192 |
stack_mask = layer_outputs[3]
|
| 1193 |
|
| 1194 |
# ResFormer: capture H_fan_1 from the first layer
|
| 1195 |
+
# If we didn't have it (and it wasn't passed via past), capture it now.
|
| 1196 |
+
# For inference, if we just computed the prompt/first token, we keep it.
|
| 1197 |
if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
|
| 1198 |
+
# Crucial: For persistence, we might want to slice this if it's the prompt?
|
| 1199 |
+
# But logic says: reuse same tensor. If seq_len > 1, it's prompt.
|
| 1200 |
+
# If seq_len == 1, it's generation.
|
| 1201 |
+
# If we are starting fresh (None), we capture what we have.
|
| 1202 |
self.first_layer_fan = decoder_layer.current_layer_fan
|
| 1203 |
|
| 1204 |
# Apply SeeDNorm for final normalization
|
|
|
|
| 1206 |
|
| 1207 |
if output_hidden_states:
|
| 1208 |
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 1209 |
+
|
| 1210 |
+
# Construct the persistence tuple (Stack + Fan)
|
| 1211 |
+
# Note: We do not implement full KV cache yet, but we persist these states.
|
| 1212 |
+
next_cache = None
|
| 1213 |
+
if self.use_stack or self.first_layer_fan is not None:
|
| 1214 |
+
# Capture the first token's FAN for ResFormer persistence
|
| 1215 |
+
# If we have a sequence, we probably want to keep the FIRST token's fan for consistency?
|
| 1216 |
+
# Or just keep the whole thing? The requirement is "reutilizar".
|
| 1217 |
+
# We keep the object self.first_layer_fan.
|
| 1218 |
+
next_cache = (stack_state, stack_mask, self.first_layer_fan)
|
| 1219 |
|
| 1220 |
if not return_dict:
|
| 1221 |
+
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
|
| 1222 |
|
| 1223 |
return BaseModelOutputWithPast(
|
| 1224 |
last_hidden_state=hidden_states,
|
| 1225 |
+
past_key_values=next_cache,
|
| 1226 |
hidden_states=all_hidden_states,
|
| 1227 |
attentions=all_attentions,
|
| 1228 |
)
|
|
|
|
| 1277 |
|
| 1278 |
self.post_init()
|
| 1279 |
|
| 1280 |
+
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
    """Assemble the model inputs for one decoding step of `generate()`.

    ``past_key_values`` is reused here as a container for the model's custom
    persistent state rather than a standard KV cache.  The expected layout is
    the 3-tuple ``(stack_state, stack_mask, first_layer_fan)`` that the model's
    forward pass emits as ``past_key_values``.

    Args:
        input_ids: ``(batch, seq_len)`` token ids accumulated so far.
            NOTE(review): assumed 2-D — TODO confirm against the generation loop.
        past_key_values: optional 3-tuple of persistent ResFormer/stack states.
        attention_mask: optional mask, forwarded unchanged.
        inputs_embeds: optional precomputed embeddings, forwarded unchanged.
        **kwargs: may carry ``use_cache`` and ``position_ids``.

    Returns:
        dict of keyword arguments for the next ``forward`` call.
    """
    past_stack_state = None
    past_stack_mask = None
    past_first_layer_fan = None

    if past_key_values is not None:
        # The state container should be the 3-tuple produced by the model.
        # Guard on type AND length so an unexpected cache object (e.g. a
        # future standard transformers Cache, which may not support len())
        # is silently ignored instead of raising a TypeError here.
        if isinstance(past_key_values, (tuple, list)) and len(past_key_values) == 3:
            past_stack_state, past_stack_mask, past_first_layer_fan = past_key_values

        # With persistent state available, only the newest token needs to be
        # fed through the model on this step.
        input_ids = input_ids[:, -1:]

    model_inputs = {
        "input_ids": input_ids,
        "past_stack_state": past_stack_state,
        "past_stack_mask": past_stack_mask,
        "past_first_layer_fan": past_first_layer_fan,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": kwargs.get("position_ids", None),
        "attention_mask": attention_mask,
        "inputs_embeds": inputs_embeds,
    }
    return model_inputs
|
| 1309 |
+
|
| 1310 |
def forward(
|
| 1311 |
self,
|
| 1312 |
input_ids: Optional[torch.LongTensor] = None,
|
|
|
|
| 1317 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 1318 |
output_hidden_states: Optional[bool] = None,
|
| 1319 |
return_dict: Optional[bool] = None,
|
| 1320 |
+
past_stack_state: Optional[torch.Tensor] = None,
|
| 1321 |
+
past_stack_mask: Optional[torch.Tensor] = None,
|
| 1322 |
+
past_first_layer_fan: Optional[torch.Tensor] = None,
|
| 1323 |
**kwargs: Unpack[TransformersKwargs],
|
| 1324 |
) -> CausalLMOutputWithPast:
|
| 1325 |
outputs: BaseModelOutputWithPast = self.model(
|
|
|
|
| 1329 |
inputs_embeds=inputs_embeds,
|
| 1330 |
output_hidden_states=output_hidden_states,
|
| 1331 |
return_dict=return_dict,
|
| 1332 |
+
past_stack_state=past_stack_state,
|
| 1333 |
+
past_stack_mask=past_stack_mask,
|
| 1334 |
+
past_first_layer_fan=past_first_layer_fan,
|
| 1335 |
**kwargs,
|
| 1336 |
)
|
| 1337 |
|
|
|
|
| 1356 |
return CausalLMOutputWithPast(
|
| 1357 |
loss=loss,
|
| 1358 |
logits=logits,
|
| 1359 |
+
past_key_values=outputs.past_key_values,
|
| 1360 |
hidden_states=outputs.hidden_states,
|
| 1361 |
attentions=outputs.attentions,
|
| 1362 |
)
|