added hf url
models/inference_memory_wrapper.py CHANGED
@@ -94,6 +94,7 @@ class InferenceMemoryWrapper(PreTrainedModel):
     def apply_surprise_update(self):
         """ Applies the TITANS-style surprise update rule using self.memory_buffer.grad """
         if self.memory_buffer.grad is None:
+            print("DEBUG: apply_surprise_update called but memory_buffer.grad is None.")
             return

         # Ensure surprise_state is on the same device and dtype
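Note on the hunk above: apply_surprise_update consumes whatever gradient the associative loss (added later in this diff) leaves on self.memory_buffer; its body is not shown in this commit. The snippet below is only a generic sketch of a TITANS-style update in that spirit. titans_surprise_update and the hyperparameters surprise_lr, surprise_momentum and memory_decay are made-up names; only the memory buffer, the surprise state and the use of .grad come from the surrounding code.

import torch

def titans_surprise_update(memory_buffer: torch.nn.Parameter,
                           surprise_state: torch.Tensor,
                           surprise_lr: float = 0.1,
                           surprise_momentum: float = 0.9,
                           memory_decay: float = 0.01) -> None:
    """Momentum over the negative gradient of the associative loss, plus a forgetting term."""
    if memory_buffer.grad is None:
        return
    with torch.no_grad():
        # Momentary surprise is the negative gradient; smooth it with momentum.
        surprise_state.mul_(surprise_momentum).add_(memory_buffer.grad, alpha=-surprise_lr)
        # Forget a little of the old memory, then write in the accumulated surprise.
        memory_buffer.mul_(1.0 - memory_decay).add_(surprise_state)
        memory_buffer.grad.zero_()

The momentum term lets a single surprising token keep influencing the memory for a few steps, while the decay term keeps the buffer bounded.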
@@ -182,12 +183,14 @@ class InferenceMemoryWrapper(PreTrainedModel):
             print("Warning: update_rule='surprise' requires use_memory=True.")
             update_rule = 'none'

+        # Ensure buffer requires grad only when needed
+        original_requires_grad = self.memory_buffer.requires_grad
         if update_rule == 'surprise':
             self.memory_buffer.requires_grad_(True)
+            print(f"DEBUG: Set memory_buffer.requires_grad = {self.memory_buffer.requires_grad}")
         else:
-
-
-            pass # No specific action needed if not surprise
+            self.memory_buffer.requires_grad_(False)
+

         bsz, seq_len_start = input_ids.shape
         device = input_ids.device
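Note: the lines added here save the buffer's requires_grad flag, force it on for update_rule == 'surprise' and off otherwise; the flag is restored in the last hunk of this diff. An equivalent way to express the same save/toggle/restore pattern is a context manager; the helper below is a sketch, not code from this repository.

import contextlib
import torch

@contextlib.contextmanager
def buffer_grad(buffer: torch.nn.Parameter, enabled: bool):
    """Temporarily toggle requires_grad on the memory buffer, restoring it on exit."""
    original = buffer.requires_grad
    buffer.requires_grad_(enabled)
    try:
        yield buffer
    finally:
        buffer.requires_grad_(original)

# Hypothetical usage around the decoding loop:
# with buffer_grad(self.memory_buffer, update_rule == 'surprise'):
#     ...decode tokens...

The try/finally variant also restores the flag if generation raises part-way through, which the explicit restore before the final return does not.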
@@ -278,16 +281,19 @@ class InferenceMemoryWrapper(PreTrainedModel):
             # We need past_key_values AND not be doing surprise update AND base model supports caching
             use_kv_cache_this_step = past_key_values is not None and update_rule != 'surprise' and self.llama.config.use_cache

-
-
-
-
-
-
-
-
-
-
+            # Ensure context manager enables grads only when needed
+            context = torch.enable_grad() if update_rule == 'surprise' else torch.no_grad()
+            with context:
+                outputs = self.llama(
+                    input_ids=cur_input_ids_for_llama, # None if using embeds
+                    inputs_embeds=model_inputs_embeds,
+                    attention_mask=current_mask, # Pass the correctly shaped mask for this step
+                    position_ids=position_ids, # Pass adjusted position IDs
+                    past_key_values=past_key_values,
+                    use_cache=use_kv_cache_this_step,
+                    output_hidden_states=True, # Needed for query/target/update
+                    return_dict=True,
+                )

             # --- Associative Loss Calculation (if surprise update) ---
             if update_rule == 'surprise' and use_memory and retrieved_mem is not None:
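Note: the forward pass is now wrapped in torch.enable_grad() when the surprise rule is active, so autograd can reach self.memory_buffer through the retrieved memory, and in torch.no_grad() otherwise, so ordinary decoding builds no graph. The same selection can be exercised in isolation; the snippet below is a standalone illustration with a throwaway linear layer, not repository code.

import torch

def forward_with_optional_grad(module: torch.nn.Module, x: torch.Tensor, needs_grad: bool) -> torch.Tensor:
    """Run a forward pass, building an autograd graph only when needs_grad is True."""
    context = torch.enable_grad() if needs_grad else torch.no_grad()
    with context:
        return module(x)

layer = torch.nn.Linear(4, 4)
x = torch.randn(2, 4)
print(forward_with_optional_grad(layer, x, needs_grad=True).requires_grad)   # True
print(forward_with_optional_grad(layer, x, needs_grad=False).requires_grad)  # False

This pairs with use_kv_cache_this_step above, which is already defined to be False whenever update_rule == 'surprise'.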
@@ -299,12 +305,33 @@ class InferenceMemoryWrapper(PreTrainedModel):
                 # pred_repr comes from retrieve_memory, should already match buffer dtype
                 pred_repr = retrieved_mem.squeeze(1) # (B, C)

-
+                # --- DEBUG PRINTS ---
+                print(f"\n--- Surprise Update Debug (Step {step}) ---")
+                print(f" memory_buffer requires_grad: {self.memory_buffer.requires_grad}")
+                print(f" retrieved_mem requires_grad: {retrieved_mem.requires_grad if retrieved_mem is not None else 'N/A'}")
+                print(f" pred_repr requires_grad: {pred_repr.requires_grad if pred_repr is not None else 'N/A'}")
+                print(f" target_repr requires_grad: {target_repr.requires_grad}") # Should be False due to .detach() below
+                # --- END DEBUG ---
+
+                assoc_loss = F.mse_loss(pred_repr, target_repr.detach()) # TARGET IS DETACHED
+                print(f" assoc_loss: {assoc_loss.item():.4f}, requires_grad: {assoc_loss.requires_grad}")
+

                 if self.memory_buffer.grad is not None:
+                    print(" Zeroing existing memory_buffer gradient.")
                     self.memory_buffer.grad.zero_()
-
-
+
+                if assoc_loss.requires_grad:
+                    print(" Calling assoc_loss.backward()")
+                    assoc_loss.backward() # Compute grads for memory_buffer
+                    print(f" memory_buffer.grad is None after backward: {self.memory_buffer.grad is None}")
+                    if self.memory_buffer.grad is not None:
+                        print(f" memory_buffer.grad norm: {torch.norm(self.memory_buffer.grad).item():.4f}")
+                    self.apply_surprise_update() # Apply update and zero grad
+                else:
+                    print(" ERROR: assoc_loss does not require grad! Skipping backward and update.")
+                print("--- End Surprise Update Debug ---")
+

             # --- Standard Generation Logic ---
             # Get logits for the very last position in the output sequence (corresponds to the token we just fed in)
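Note: the heart of the surprise path added above is an MSE between the memory-derived prediction and a detached target, a backward pass whose only trainable leaf is the memory buffer, and then apply_surprise_update consuming the resulting gradient. A self-contained toy version of that sequence, with dummy shapes and a plain gradient step standing in for the real update rule:

import torch
import torch.nn.functional as F

# Toy stand-ins: a 4-slot memory of width 8, a fake "retrieval" that mixes the slots,
# and a random target playing the role of the detached hidden state.
memory_buffer = torch.nn.Parameter(torch.randn(4, 8))
read_weights = torch.softmax(torch.randn(1, 4), dim=-1)   # (B, slots), carries no grad of its own
target_repr = torch.randn(1, 8)

memory_buffer.requires_grad_(True)
pred_repr = read_weights @ memory_buffer                    # (B, 8), depends on the buffer
assoc_loss = F.mse_loss(pred_repr, target_repr.detach())   # detached target: gradient flows only into the memory

if memory_buffer.grad is not None:
    memory_buffer.grad.zero_()

if assoc_loss.requires_grad:                                # same guard as the diff's ERROR branch
    assoc_loss.backward()                                    # leaves a gradient on memory_buffer only
    with torch.no_grad():                                    # plain SGD step standing in for apply_surprise_update
        memory_buffer -= 0.1 * memory_buffer.grad
        memory_buffer.grad.zero_()

Detaching the target matters: without it, backward() would also propagate through the base model's graph and accumulate gradients on its weights, not just on the memory buffer.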
@@ -362,7 +389,8 @@ class InferenceMemoryWrapper(PreTrainedModel):
             if eos_token_id is not None and (next_token == eos_token_id).all():
                 break

-        #
+        # Restore original requires_grad state
+        self.memory_buffer.requires_grad_(original_requires_grad)

         return generated_ids

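General note: this commit adds a number of unconditional print statements along the surprise path. A likely follow-up (not part of this commit) is to gate them behind the logging module so normal generation stays quiet; a minimal sketch:

import logging

logger = logging.getLogger("inference_memory_wrapper")

def log_surprise_debug(step: int, assoc_loss, memory_buffer) -> None:
    """Emit the per-step surprise diagnostics only when DEBUG logging is enabled."""
    if not logger.isEnabledFor(logging.DEBUG):
        return
    grad = memory_buffer.grad
    logger.debug("Surprise step %d: loss=%.4f, grad_norm=%s",
                 step, assoc_loss.item(),
                 "None" if grad is None else f"{grad.norm().item():.4f}")

# logging.basicConfig(level=logging.DEBUG)  # switch the diagnostics on when investigating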