KitsuVp committed on
Commit
ac61dcd
·
verified ·
1 Parent(s): 7e57200

Update modeling_neollm.py

Browse files
Files changed (1) hide show
  1. modeling_neollm.py +139 -55
modeling_neollm.py CHANGED
@@ -37,7 +37,7 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
37
  from transformers.processing_utils import Unpack
38
  from transformers.utils import TransformersKwargs, logging
39
  from transformers.utils.generic import check_model_inputs
40
- from .configuration_neollm import NeoLLMConfig
41
 
42
  from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
43
 
@@ -325,8 +325,6 @@ class SeeDNorm(nn.Module):
325
  # ==================== STACK MEMORY MODULE ====================
326
  class StackMemory(nn.Module):
327
  """
328
- Differentiable Hidden State Stack for modeling Chomsky hierarchy grammars.
329
-
330
  From "Improving Formal Reasoning of Transformer with State Stack":
331
  Implements a multi-head differentiable stack with soft push, pop, and no-op operations.
332
  Each head maintains its own stack and mask, which are updated based on learned action
@@ -354,8 +352,8 @@ class StackMemory(nn.Module):
354
 
355
  # Dimension reduction projections for efficiency
356
  # Uses standard nn.Linear
357
- self.down_proj = nn.Linear(config.hidden_size, self.stack_d_model, bias=False)
358
- self.up_proj = nn.Linear(self.stack_d_model, config.hidden_size, bias=False)
359
 
360
  # Action prediction: generates push/pop/no-op probabilities for each head
361
  self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
@@ -365,6 +363,20 @@ class StackMemory(nn.Module):
365
 
366
  # Residual weight for gating stack contribution
367
  self.res_weight = nn.Parameter(torch.ones(1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
  def _vectorized_update(
370
  self,
@@ -393,8 +405,10 @@ class StackMemory(nn.Module):
393
  batch_size, seq_len = actions.shape[:2]
394
 
395
  # Expand stack and mask along sequence dimension for parallel processing
396
- stack = stack.unsqueeze(1).expand(-1, seq_len, -1, -1, -1)
397
- mask = mask.unsqueeze(1).expand(-1, seq_len, -1, -1)
 
 
398
 
399
  # Generate pushed stack: new value at top, shift others down
400
  push_stack = torch.cat([
@@ -476,33 +490,93 @@ class StackMemory(nn.Module):
476
  new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
477
 
478
  # Global reading via query-over-stack attention
479
-
480
- # FIX: Project the raw stack content directly.
481
- # Previously, masking before projection killed gradients for "empty" slots
482
- # preventing them from ever becoming "full".
483
  gate_scores = self.gate_proj(new_stack).squeeze(-1) # [batch, seq, heads, slots]
484
 
485
- # Apply mask to the SCORES, not the features.
486
- # Mask out invalid positions (add large negative value where mask is 0)
487
- gate_scores = gate_scores + (1 - new_mask) * -1e9
488
-
489
- # Softmax to get attention weights
490
- gate_weights = F.softmax(gate_scores, dim=-1)
491
 
492
  # Weighted sum over stack slots
493
- # new_stack contains the features, gate_weights contains the validity/relevance
494
  memory_output = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
495
  memory_output = memory_output.view(batch_size, seq_len, -1)
496
-
497
- # Project back to original dimension
498
  memory_output = self.up_proj(memory_output)
499
 
500
- # Gated residual connection
501
  output = memory_output * self.res_weight + hidden_states
502
 
503
- # Return output and updated stack state (use last timestep's state)
 
 
 
504
  return output, new_stack[:, -1], new_mask[:, -1]
505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
  # ==================== ROTARY EMBEDDING ====================
507
  class NeoLLMRotaryEmbedding(nn.Module):
508
  inv_freq: torch.Tensor # fix linting for `register_buffer`
@@ -1119,8 +1193,8 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
1119
  output_hidden_states: Optional[bool] = None,
1120
  output_attentions: Optional[bool] = None,
1121
  return_dict: Optional[bool] = None,
1122
- past_stack_state: Optional[torch.Tensor] = None,
1123
- past_stack_mask: Optional[torch.Tensor] = None,
1124
  **kwargs: Unpack[TransformersKwargs],
1125
  ) -> BaseModelOutputWithPast:
1126
  output_hidden_states = (
@@ -1152,6 +1226,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
1152
  )
1153
 
1154
  hidden_states = inputs_embeds
 
1155
  all_hidden_states = () if output_hidden_states else None
1156
  all_attentions = () if output_attentions else None
1157
 
@@ -1161,9 +1236,17 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
1161
  # ResFormer with first-layer feature propagation
1162
  self.first_layer_fan = None
1163
 
1164
- # Initialize Stack states
1165
- stack_state = past_stack_state
1166
- stack_mask = past_stack_mask
 
 
 
 
 
 
 
 
1167
 
1168
  for decoder_layer in self.layers:
1169
  if output_hidden_states:
@@ -1186,6 +1269,9 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
1186
  all_attentions = all_attentions + (layer_outputs[1],)
1187
 
1188
  if self.use_stack:
 
 
 
1189
  stack_state = layer_outputs[2]
1190
  stack_mask = layer_outputs[3]
1191
 
@@ -1199,18 +1285,13 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
1199
 
1200
  if output_hidden_states:
1201
  all_hidden_states = all_hidden_states + (hidden_states,)
1202
-
1203
- # Construct the persistence tuple (Stack only)
1204
- next_cache = None
1205
- if self.use_stack:
1206
- next_cache = (stack_state, stack_mask)
1207
 
1208
  if not return_dict:
1209
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
1210
 
1211
  return BaseModelOutputWithPast(
1212
  last_hidden_state=hidden_states,
1213
- past_key_values=next_cache,
1214
  hidden_states=all_hidden_states,
1215
  attentions=all_attentions,
1216
  )
@@ -1268,29 +1349,34 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
1268
  def prepare_inputs_for_generation(
1269
  self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1270
  ):
1271
- # Extract custom states from past_key_values if present
1272
- # Structure: (stack_state, stack_mask)
1273
- past_stack_state = None
1274
- past_stack_mask = None
1275
-
1276
- if past_key_values is not None:
1277
- # We use the past_key_values as a container for our custom states
1278
- if len(past_key_values) == 2:
1279
- past_stack_state, past_stack_mask = past_key_values
1280
 
1281
- # Helper for generation loop: input_ids should be just the last token if we have past
1282
- input_ids = input_ids[:, -1:]
1283
-
1284
- model_inputs = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1285
  "input_ids": input_ids,
1286
- "past_stack_state": past_stack_state,
1287
- "past_stack_mask": past_stack_mask,
1288
  "use_cache": kwargs.get("use_cache"),
1289
- "position_ids": kwargs.get("position_ids", None),
1290
  "attention_mask": attention_mask,
1291
  "inputs_embeds": inputs_embeds,
1292
  }
1293
- return model_inputs
1294
 
1295
  def forward(
1296
  self,
@@ -1302,8 +1388,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
1302
  logits_to_keep: Union[int, torch.Tensor] = 0,
1303
  output_hidden_states: Optional[bool] = None,
1304
  return_dict: Optional[bool] = None,
1305
- past_stack_state: Optional[torch.Tensor] = None,
1306
- past_stack_mask: Optional[torch.Tensor] = None,
1307
  **kwargs: Unpack[TransformersKwargs],
1308
  ) -> CausalLMOutputWithPast:
1309
  outputs: BaseModelOutputWithPast = self.model(
@@ -1313,8 +1398,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
1313
  inputs_embeds=inputs_embeds,
1314
  output_hidden_states=output_hidden_states,
1315
  return_dict=return_dict,
1316
- past_stack_state=past_stack_state,
1317
- past_stack_mask=past_stack_mask,
1318
  **kwargs,
1319
  )
1320
 
 
37
  from transformers.processing_utils import Unpack
38
  from transformers.utils import TransformersKwargs, logging
39
  from transformers.utils.generic import check_model_inputs
40
+ from configuration_neollm import NeoLLMConfig
41
 
42
  from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
43
 
 
325
  # ==================== STACK MEMORY MODULE ====================
326
  class StackMemory(nn.Module):
327
  """
 
 
328
  From "Improving Formal Reasoning of Transformer with State Stack":
329
  Implements a multi-head differentiable stack with soft push, pop, and no-op operations.
330
  Each head maintains its own stack and mask, which are updated based on learned action
 
352
 
353
  # Dimension reduction projections for efficiency
354
  # Uses standard nn.Linear
355
+ self.down_proj = nn.Linear(config.hidden_size, self.stack_d_model, bias=True)
356
+ self.up_proj = nn.Linear(self.stack_d_model, config.hidden_size, bias=True)
357
 
358
  # Action prediction: generates push/pop/no-op probabilities for each head
359
  self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
 
363
 
364
  # Residual weight for gating stack contribution
365
  self.res_weight = nn.Parameter(torch.ones(1))
366
+
367
+ # Cache for autoregressive generation (matches OLMo reference)
368
+ self.cache_size = getattr(config, "cache_size", 2048)
369
+ # Initialization fix: Register buffers for cache
370
+ # Default to batch_size=1 if forward_bs is not in config (standard inference)
371
+ forward_bs = getattr(config, 'forward_bs', 1)
372
+ self.register_buffer("k_cache", torch.zeros(forward_bs, self.cache_size, self.num_stack_heads, self.head_dim))
373
+ self.register_buffer("action_cache", torch.zeros(forward_bs, self.cache_size, self.num_stack_heads, 3))
374
+
375
+ self.cache_position = 0
376
+ self.enable_cache = False
377
+
378
+ def reset_cache(self):
379
+ self.cache_position = 0
380
 
381
  def _vectorized_update(
382
  self,
 
405
  batch_size, seq_len = actions.shape[:2]
406
 
407
  # Expand stack and mask along sequence dimension for parallel processing
408
+ # Only expand if checking against initial state dimensions (4D)
409
+ if stack.dim() == 4:
410
+ stack = stack.unsqueeze(1).expand(-1, seq_len, -1, -1, -1)
411
+ mask = mask.unsqueeze(1).expand(-1, seq_len, -1, -1)
412
 
413
  # Generate pushed stack: new value at top, shift others down
414
  push_stack = torch.cat([
 
490
  new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
491
 
492
  # Global reading via query-over-stack attention
 
 
 
 
493
  gate_scores = self.gate_proj(new_stack).squeeze(-1) # [batch, seq, heads, slots]
494
 
495
+ gate_weights = F.softmax(gate_scores + (1 - new_mask) * -1e9, dim=-1)
 
 
 
 
 
496
 
497
  # Weighted sum over stack slots
 
498
  memory_output = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
499
  memory_output = memory_output.view(batch_size, seq_len, -1)
500
+
 
501
  memory_output = self.up_proj(memory_output)
502
 
503
+ # Residual Connection
504
  output = memory_output * self.res_weight + hidden_states
505
 
506
+ # Update Cache Logic
507
+ if self.enable_cache:
508
+ self._update_cache(k_values.detach(), actions.detach())
509
+
510
  return output, new_stack[:, -1], new_mask[:, -1]
511
 
512
+ def _update_cache(self, k_values: torch.Tensor, actions: torch.Tensor):
513
+ seq_len = k_values.shape[1]
514
+ if self.cache_position + seq_len <= self.cache_size:
515
+ # Assumes standard batch processing for inference (usually batch_size=1)
516
+ self.k_cache[:, self.cache_position:self.cache_position+seq_len] = k_values
517
+ self.action_cache[:, self.cache_position:self.cache_position+seq_len] = actions
518
+ self.cache_position += seq_len
519
+ else:
520
+ self.reset_cache()
521
+
522
+ def step(self, hidden_state: torch.Tensor, stack: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
523
+ if not self.enable_cache:
524
+ return self.forward(hidden_state.unsqueeze(1), stack, mask)
525
+
526
+ batch_size = hidden_state.shape[0]
527
+
528
+ # Compute features for current token
529
+ new_hidden_states = self.down_proj(hidden_state)
530
+
531
+ action_logits = self.action_head(new_hidden_states) / math.sqrt(self.head_dim)
532
+ current_actions = F.softmax(
533
+ action_logits.view(batch_size, 1, self.num_stack_heads, 3),
534
+ dim=-1
535
+ )
536
+
537
+ current_k = new_hidden_states.view(batch_size, 1, self.num_stack_heads, self.head_dim)
538
+
539
+ # Reconstruct History
540
+ if self.cache_position > 0:
541
+ cached_k = self.k_cache[:, :self.cache_position]
542
+ cached_actions = self.action_cache[:, :self.cache_position]
543
+
544
+ k_values = torch.cat([cached_k, current_k], dim=1)
545
+ actions = torch.cat([cached_actions, current_actions], dim=1)
546
+ else:
547
+ k_values = current_k
548
+ actions = current_actions
549
+
550
+ # Dimension Fix: Pass sequences directly without unsqueeze(0)
551
+ # k_values is [batch, seq_len_total, heads, dim]
552
+ # actions is [batch, seq_len_total, heads, 3]
553
+
554
+ new_stack_seq, new_mask_seq = self._vectorized_update(
555
+ stack, # Initial stack [batch, heads, slots, dim]
556
+ mask,
557
+ actions,
558
+ k_values
559
+ )
560
+
561
+ # Extract last step
562
+ current_stack = new_stack_seq[:, -1]
563
+ current_mask = new_mask_seq[:, -1]
564
+
565
+ gate_scores = self.gate_proj(current_stack).squeeze(-1)
566
+ gate_weights = F.softmax(gate_scores + (1 - current_mask) * -1e9, dim=-1)
567
+
568
+ memory_output = (current_stack * gate_weights.unsqueeze(-1)).sum(dim=2)
569
+ memory_output = memory_output.view(batch_size, -1)
570
+
571
+ memory_output_proj = self.up_proj(memory_output)
572
+
573
+ self._update_cache(current_k, current_actions)
574
+
575
+ return (
576
+ memory_output_proj * self.res_weight + hidden_state,
577
+ current_stack,
578
+ current_mask
579
+ )
580
  # ==================== ROTARY EMBEDDING ====================
581
  class NeoLLMRotaryEmbedding(nn.Module):
582
  inv_freq: torch.Tensor # fix linting for `register_buffer`
 
1193
  output_hidden_states: Optional[bool] = None,
1194
  output_attentions: Optional[bool] = None,
1195
  return_dict: Optional[bool] = None,
1196
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1197
+ use_cache: Optional[bool] = None,
1198
  **kwargs: Unpack[TransformersKwargs],
1199
  ) -> BaseModelOutputWithPast:
1200
  output_hidden_states = (
 
1226
  )
1227
 
1228
  hidden_states = inputs_embeds
1229
+ next_decoder_cache = None
1230
  all_hidden_states = () if output_hidden_states else None
1231
  all_attentions = () if output_attentions else None
1232
 
 
1236
  # ResFormer with first-layer feature propagation
1237
  self.first_layer_fan = None
1238
 
1239
+ # Initialize Stack states (always None at start of forward, rebuilt via cache step or vertical flow)
1240
+ stack_state = None
1241
+ stack_mask = None
1242
+
1243
+ # Propagate use_cache and reset if starting a new sequence
1244
+ if self.use_stack:
1245
+ for layer in self.layers:
1246
+ if hasattr(layer, 'stack_memory'):
1247
+ layer.stack_memory.enable_cache = use_cache if use_cache is not None else False
1248
+ if past_key_values is None:
1249
+ layer.stack_memory.reset_cache()
1250
 
1251
  for decoder_layer in self.layers:
1252
  if output_hidden_states:
 
1269
  all_attentions = all_attentions + (layer_outputs[1],)
1270
 
1271
  if self.use_stack:
1272
+ # Vertical memory logic:
1273
+ # The layer returns updated stack for the next layer to use (Vertical passing)
1274
+ # But we do NOT persist it temporally here. The Module's internal cache handles temporal.
1275
  stack_state = layer_outputs[2]
1276
  stack_mask = layer_outputs[3]
1277
 
 
1285
 
1286
  if output_hidden_states:
1287
  all_hidden_states = all_hidden_states + (hidden_states,)
 
 
 
 
 
1288
 
1289
  if not return_dict:
1290
+ return tuple(v for v in [hidden_states, next_decoder_cache, all_hidden_states, all_attentions] if v is not None)
1291
 
1292
  return BaseModelOutputWithPast(
1293
  last_hidden_state=hidden_states,
1294
+ past_key_values=next_decoder_cache,
1295
  hidden_states=all_hidden_states,
1296
  attentions=all_attentions,
1297
  )
 
1349
  def prepare_inputs_for_generation(
1350
  self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1351
  ):
1352
+ if past_key_values:
1353
+ past_length = past_key_values[0][0].shape[2]
 
 
 
 
 
 
 
1354
 
1355
+ # If past_length > input_ids length, we are likely generating token by token
1356
+ if input_ids.shape[1] > past_length:
1357
+ remove_prefix_length = past_length
1358
+ else:
1359
+ # Default standard HF behavior
1360
+ remove_prefix_length = input_ids.shape[1] - 1
1361
+
1362
+ input_ids = input_ids[:, remove_prefix_length:]
1363
+
1364
+ position_ids = kwargs.get("position_ids", None)
1365
+ if attention_mask is not None and position_ids is None:
1366
+ # create position_ids on the fly for batch generation
1367
+ position_ids = attention_mask.long().cumsum(-1) - 1
1368
+ position_ids.masked_fill_(attention_mask == 0, 1)
1369
+ if past_key_values:
1370
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1371
+
1372
+ return {
1373
  "input_ids": input_ids,
1374
+ "past_key_values": past_key_values,
 
1375
  "use_cache": kwargs.get("use_cache"),
1376
+ "position_ids": position_ids,
1377
  "attention_mask": attention_mask,
1378
  "inputs_embeds": inputs_embeds,
1379
  }
 
1380
 
1381
  def forward(
1382
  self,
 
1388
  logits_to_keep: Union[int, torch.Tensor] = 0,
1389
  output_hidden_states: Optional[bool] = None,
1390
  return_dict: Optional[bool] = None,
1391
+
 
1392
  **kwargs: Unpack[TransformersKwargs],
1393
  ) -> CausalLMOutputWithPast:
1394
  outputs: BaseModelOutputWithPast = self.model(
 
1398
  inputs_embeds=inputs_embeds,
1399
  output_hidden_states=output_hidden_states,
1400
  return_dict=return_dict,
1401
+
 
1402
  **kwargs,
1403
  )
1404