Update modeling_neollm.py
Browse files — modeling_neollm.py (+8 lines, −25 lines)
modeling_neollm.py
CHANGED
|
@@ -1121,7 +1121,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1121 |
return_dict: Optional[bool] = None,
|
| 1122 |
past_stack_state: Optional[torch.Tensor] = None,
|
| 1123 |
past_stack_mask: Optional[torch.Tensor] = None,
|
| 1124 |
-
past_first_layer_fan: Optional[torch.Tensor] = None,
|
| 1125 |
**kwargs: Unpack[TransformersKwargs],
|
| 1126 |
) -> BaseModelOutputWithPast:
|
| 1127 |
output_hidden_states = (
|
|
@@ -1160,8 +1159,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1160 |
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
| 1161 |
|
| 1162 |
# ResFormer with first-layer feature propagation
|
| 1163 |
-
|
| 1164 |
-
self.first_layer_fan = past_first_layer_fan
|
| 1165 |
|
| 1166 |
# Initialize Stack states
|
| 1167 |
stack_state = past_stack_state
|
|
@@ -1192,13 +1190,8 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1192 |
stack_mask = layer_outputs[3]
|
| 1193 |
|
| 1194 |
# ResFormer: capture H_fan_1 from the first layer
|
| 1195 |
-
#
|
| 1196 |
-
# For inference, if we just computed the prompt/first token, we keep it.
|
| 1197 |
if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
|
| 1198 |
-
# Crucial: For persistence, we might want to slice this if it's the prompt?
|
| 1199 |
-
# But logic says: reuse same tensor. If seq_len > 1, it's prompt.
|
| 1200 |
-
# If seq_len == 1, it's generation.
|
| 1201 |
-
# If we are starting fresh (None), we capture what we have.
|
| 1202 |
self.first_layer_fan = decoder_layer.current_layer_fan
|
| 1203 |
|
| 1204 |
# Apply SeeDNorm for final normalization
|
|
@@ -1207,15 +1200,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1207 |
if output_hidden_states:
|
| 1208 |
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 1209 |
|
| 1210 |
-
# Construct the persistence tuple (stack_state, stack_mask, first_layer_fan)
|
| 1211 |
-
# Note: We do not implement full KV cache yet, but we persist these states.
|
| 1212 |
next_cache = None
|
| 1213 |
-
if self.use_stack:
|
| 1214 |
-
|
| 1215 |
-
# If we have a sequence, we probably want to keep the FIRST token's fan for consistency?
|
| 1216 |
-
# Or just keep the whole thing? The requirement is "reutilizar".
|
| 1217 |
-
# We keep the object self.first_layer_fan.
|
| 1218 |
-
next_cache = (stack_state, stack_mask, self.first_layer_fan)
|
| 1219 |
|
| 1220 |
if not return_dict:
|
| 1221 |
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
|
|
@@ -1281,16 +1269,14 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1281 |
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
|
| 1282 |
):
|
| 1283 |
# Extract custom states from past_key_values if present
|
| 1284 |
-
# Structure: (stack_state, stack_mask, past_first_layer_fan)
|
| 1285 |
past_stack_state = None
|
| 1286 |
past_stack_mask = None
|
| 1287 |
-
past_first_layer_fan = None
|
| 1288 |
|
| 1289 |
if past_key_values is not None:
|
| 1290 |
# We use the past_key_values as a container for our custom states
|
| 1291 |
-
|
| 1292 |
-
|
| 1293 |
-
past_stack_state, past_stack_mask, past_first_layer_fan = past_key_values
|
| 1294 |
|
| 1295 |
# Helper for generation loop: input_ids should be just the last token if we have past
|
| 1296 |
input_ids = input_ids[:, -1:]
|
|
@@ -1299,7 +1285,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1299 |
"input_ids": input_ids,
|
| 1300 |
"past_stack_state": past_stack_state,
|
| 1301 |
"past_stack_mask": past_stack_mask,
|
| 1302 |
-
"past_first_layer_fan": past_first_layer_fan,
|
| 1303 |
"use_cache": kwargs.get("use_cache"),
|
| 1304 |
"position_ids": kwargs.get("position_ids", None),
|
| 1305 |
"attention_mask": attention_mask,
|
|
@@ -1319,7 +1304,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1319 |
return_dict: Optional[bool] = None,
|
| 1320 |
past_stack_state: Optional[torch.Tensor] = None,
|
| 1321 |
past_stack_mask: Optional[torch.Tensor] = None,
|
| 1322 |
-
past_first_layer_fan: Optional[torch.Tensor] = None,
|
| 1323 |
**kwargs: Unpack[TransformersKwargs],
|
| 1324 |
) -> CausalLMOutputWithPast:
|
| 1325 |
outputs: BaseModelOutputWithPast = self.model(
|
|
@@ -1331,7 +1315,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
|
|
| 1331 |
return_dict=return_dict,
|
| 1332 |
past_stack_state=past_stack_state,
|
| 1333 |
past_stack_mask=past_stack_mask,
|
| 1334 |
-
past_first_layer_fan=past_first_layer_fan,
|
| 1335 |
**kwargs,
|
| 1336 |
)
|
| 1337 |
|
|
|
|
| 1121 |
return_dict: Optional[bool] = None,
|
| 1122 |
past_stack_state: Optional[torch.Tensor] = None,
|
| 1123 |
past_stack_mask: Optional[torch.Tensor] = None,
|
|
|
|
| 1124 |
**kwargs: Unpack[TransformersKwargs],
|
| 1125 |
) -> BaseModelOutputWithPast:
|
| 1126 |
output_hidden_states = (
|
|
|
|
| 1159 |
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
| 1160 |
|
| 1161 |
# ResFormer with first-layer feature propagation
|
| 1162 |
+
self.first_layer_fan = None
|
|
|
|
| 1163 |
|
| 1164 |
# Initialize Stack states
|
| 1165 |
stack_state = past_stack_state
|
|
|
|
| 1190 |
stack_mask = layer_outputs[3]
|
| 1191 |
|
| 1192 |
# ResFormer: capture H_fan_1 from the first layer
|
| 1193 |
+
# Dynamically capture for the current pass
|
|
|
|
| 1194 |
if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1195 |
self.first_layer_fan = decoder_layer.current_layer_fan
|
| 1196 |
|
| 1197 |
# Apply SeeDNorm for final normalization
|
|
|
|
| 1200 |
if output_hidden_states:
|
| 1201 |
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 1202 |
|
| 1203 |
+
# Construct the persistence tuple (Stack only)
|
|
|
|
| 1204 |
next_cache = None
|
| 1205 |
+
if self.use_stack:
|
| 1206 |
+
next_cache = (stack_state, stack_mask)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1207 |
|
| 1208 |
if not return_dict:
|
| 1209 |
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
|
|
|
|
| 1269 |
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
|
| 1270 |
):
|
| 1271 |
# Extract custom states from past_key_values if present
|
| 1272 |
+
# Structure: (stack_state, stack_mask)
|
| 1273 |
past_stack_state = None
|
| 1274 |
past_stack_mask = None
|
|
|
|
| 1275 |
|
| 1276 |
if past_key_values is not None:
|
| 1277 |
# We use the past_key_values as a container for our custom states
|
| 1278 |
+
if len(past_key_values) == 2:
|
| 1279 |
+
past_stack_state, past_stack_mask = past_key_values
|
|
|
|
| 1280 |
|
| 1281 |
# Helper for generation loop: input_ids should be just the last token if we have past
|
| 1282 |
input_ids = input_ids[:, -1:]
|
|
|
|
| 1285 |
"input_ids": input_ids,
|
| 1286 |
"past_stack_state": past_stack_state,
|
| 1287 |
"past_stack_mask": past_stack_mask,
|
|
|
|
| 1288 |
"use_cache": kwargs.get("use_cache"),
|
| 1289 |
"position_ids": kwargs.get("position_ids", None),
|
| 1290 |
"attention_mask": attention_mask,
|
|
|
|
| 1304 |
return_dict: Optional[bool] = None,
|
| 1305 |
past_stack_state: Optional[torch.Tensor] = None,
|
| 1306 |
past_stack_mask: Optional[torch.Tensor] = None,
|
|
|
|
| 1307 |
**kwargs: Unpack[TransformersKwargs],
|
| 1308 |
) -> CausalLMOutputWithPast:
|
| 1309 |
outputs: BaseModelOutputWithPast = self.model(
|
|
|
|
| 1315 |
return_dict=return_dict,
|
| 1316 |
past_stack_state=past_stack_state,
|
| 1317 |
past_stack_mask=past_stack_mask,
|
|
|
|
| 1318 |
**kwargs,
|
| 1319 |
)
|
| 1320 |
|