Upload folder using huggingface_hub
- README.md +79 -0
- base_model/config.json +26 -0
- base_model/model.safetensors +3 -0
- code/config.py +75 -0
- code/memory.py +187 -0
- code/model.py +368 -0
- code/modules.py +196 -0
- code/test_model.py +536 -0
- code/train.py +389 -0
- enhanced_config.json +15 -0
- gating_mechanism.pth +3 -0
- model-index.json +50 -0
- qa_head.pth +3 -0
- retroactive_layer.pth +3 -0
- special_tokens_map.json +7 -0
- step_controller.pth +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +56 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,79 @@
# Retroactive Reasoning Network (RRN) for Question Answering

## Model Description

This model implements an Enhanced Retroactive Reasoning Network (RRN) for question answering. The RRN architecture enables multi-step reasoning through an iterative refinement process that retroactively updates hidden states.

### Key Features

- **Multi-step Reasoning**: The model performs 3 reasoning steps to iteratively refine its predictions.
- **Dynamic Reasoning Steps**: Enabled; a learned controller chooses the number of steps (min: 1, max: 5).
- **Gating Mechanism**: Selectively applies updates to hidden states.
- **Delta Magnitude Constraint**: Prevents destabilizing updates with a target delta-to-hidden-state norm ratio of 0.2 (see the sketch after this list).
- **Active Memory**: Stores and retrieves past examples to enhance reasoning.
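The per-step update combines the delta constraint and the gate. The following is a minimal sketch of that rule; `delta_fn` and `gate_fn` are hypothetical stand-ins for the repository's `CrossAttentionDelta` and `GatingMechanism` modules, and the full logic lives in `code/model.py`:

```python
import torch

def reasoning_step(h, h0, delta_fn, gate_fn, target_ratio=0.2):
    """One retroactive update: cap the delta norm relative to H(0), then gate it."""
    delta = delta_fn(h)                                         # candidate update
    ratio = delta.norm(dim=-1, keepdim=True) / (h0.norm(dim=-1, keepdim=True) + 1e-9)
    delta = delta * torch.clamp(target_ratio / ratio, max=1.0)  # shrink oversized deltas only
    gate = gate_fn(h, delta)                                    # per-token gate in (0, 1)
    return h + gate * delta
```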
## Usage

```python
from transformers import AutoTokenizer
from model import EnhancedRRN_QA_Model

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("[MODEL_REPO_ID]")
model = EnhancedRRN_QA_Model("[MODEL_REPO_ID]/base_model")

# Load custom components
import torch
import os

model.qa_head.load_state_dict(torch.load(os.path.join("[MODEL_REPO_ID]", "qa_head.pth"), map_location="cpu"))
model.retroactive_update_layer.load_state_dict(torch.load(os.path.join("[MODEL_REPO_ID]", "retroactive_layer.pth"), map_location="cpu"))
model.gating_mechanism.load_state_dict(torch.load(os.path.join("[MODEL_REPO_ID]", "gating_mechanism.pth"), map_location="cpu"))

# If using learned dynamic steps
if os.path.exists(os.path.join("[MODEL_REPO_ID]", "step_controller.pth")) and hasattr(model, "step_controller"):
    model.step_controller.load_state_dict(torch.load(os.path.join("[MODEL_REPO_ID]", "step_controller.pth"), map_location="cpu"))

# Example usage
inputs = tokenizer("What is the capital of France?", "Paris is the capital of France.", return_tensors="pt")
outputs = model(**inputs)
```
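
After a forward pass, the model also stashes per-step diagnostics on `model.custom_outputs` (see `code/model.py`). A quick, optional way to inspect the reasoning behavior:

```python
diag = model.custom_outputs
print("reasoning steps taken:", diag["steps_taken"])
print("mean gate value per step:", [g.mean().item() for g in diag["all_gates"]])
print("mean delta norm per step:", [d.norm(dim=-1).mean().item() for d in diag["all_deltas"]])
```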

## Training

This model was trained on the SQuAD dataset using a multi-step reasoning approach. The training code is included in the `code` directory of this repository.

To train your own model:

```bash
python code/train.py
```

To evaluate the model:

```bash
python code/test_model.py
```

## Model Architecture

The RRN architecture consists of:

1. A base language model (BERT)
2. A retroactive update layer that computes delta updates
3. A gating mechanism for selective updates
4. An enhanced QA head for answer prediction
5. A step controller for dynamic reasoning steps (if enabled)

A sketch of how these components fit together appears after this list.
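The following is a simplified sketch of the dataflow using the repository's own modules. It assumes `code/` is on the Python path, and it omits the delta magnitude constraint, active memory, and loss terms handled in `code/model.py`:

```python
from transformers import AutoModel, AutoTokenizer
from modules import CrossAttentionDelta, GatingMechanism, EnhancedQAHead

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
base = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
dim = base.config.hidden_size

delta_layer = CrossAttentionDelta(dim)  # computes the retroactive delta
gate_layer = GatingMechanism(dim)       # decides how much of the delta to apply
qa_head = EnhancedQAHead(dim)           # maps hidden states to start/end logits

enc = tok("What is the capital of France?", "Paris is the capital of France.",
          return_tensors="pt")
out = base(**enc)
h, trace = out.last_hidden_state, out.hidden_states  # H(0) and reasoning trace T

for _ in range(3):                        # fixed 3 reasoning steps
    delta, _ = delta_layer(h, trace)
    h = h + gate_layer(h, delta) * delta  # gated retroactive update

logits = qa_head(h)                       # {"start_logits": ..., "end_logits": ...}
```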

## Citation

If you use this model in your research, please cite:

```
@article{rrn_qa_model,
  title={Retroactive Reasoning Networks for Question Answering},
  author={[Authors]},
  journal={[Journal]},
  year={2025}
}
```
base_model/config.json
ADDED
@@ -0,0 +1,26 @@
{
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
base_model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0e649b208cf08b8748542d4303a944dfe14f9aeefc6bfe2bed4fca9dbb7c0ba
size 437951328
code/config.py
ADDED
@@ -0,0 +1,75 @@
# config.py (Updated to Disable PEFT)
import torch

# --- Model Configuration ---
# Base model from Hugging Face (ensure it's suitable for QA)
# Example: 'bert-base-uncased', 'roberta-base', 'bert-large-uncased-whole-word-masking-finetuned-squad'
BASE_MODEL_NAME = "bert-base-uncased"

# --- RRN Specific Configuration ---
# Coherence loss weight
LAMBDA_COHERENCE = 0.1  # Hyperparameter to tune

# --- Delta Constraint Configuration ---
DELTA_TARGET_RATIO = 0.2  # Target ratio of delta norm to h0 norm
LAMBDA_DELTA_REG = 0.5  # Weight for delta regularization loss

# --- Multi-step Reasoning Configuration ---
NUM_REASONING_STEPS = 3  # Default number of reasoning steps (used when dynamic steps disabled)

# --- Dynamic Reasoning Steps Configuration ---
USE_DYNAMIC_STEPS = True  # Enable/disable dynamic reasoning steps
MAX_REASONING_STEPS = 5  # Maximum number of reasoning steps
MIN_REASONING_STEPS = 1  # Minimum number of reasoning steps
REASONING_STEP_TYPE = "learned"  # Options: "fixed", "confidence", "learned"
EARLY_STOP_THRESHOLD = 0.01  # Delta magnitude threshold for early stopping (used with "confidence")

# --- Mixed Precision Configuration ---
USE_MIXED_PRECISION = False  # Enable/disable mixed precision training

# --- Memory Configuration ---
MEMORY_MAX_SIZE = 50  # Max number of entries in the memory
MEMORY_USE_DURING_TRAINING = False  # Whether to use memory during training
MEMORY_RETRIEVAL_K = 3  # Number of examples to retrieve from memory

# --- PEFT (LoRA) Configuration ---
USE_PEFT = False  # <--- SET TO False TO DISABLE PEFT ---

# --- Optional: Comment out or leave the LoRA specific settings ---
# LORA_R = 8
# LORA_ALPHA = 16
# LORA_DROPOUT = 0.1
# LORA_TARGET_MODULES = ["query", "value"]

# --- Testing Configuration ---
BYPASS_DELTA_CALCULATION = False  # Set to True to bypass delta calculation for testing

# --- Training Configuration ---
# <<< --- Device Detection (CUDA prioritized over MPS) --- >>>
if torch.cuda.is_available():
    DEVICE = "cuda"
    print("CUDA GPU acceleration is available.")
elif torch.backends.mps.is_available():
    DEVICE = "mps"
    print("Apple Silicon MPS acceleration is available.")
else:
    DEVICE = "cpu"
    print("No GPU or MPS acceleration available, using CPU.")
# <<< --- End of Device Detection --- >>>

LEARNING_RATE = 1e-5  # Full fine-tuning often uses a smaller LR than PEFT
EPOCHS = 3
# --- Adjust Batch Size for Full Fine-tuning ---
# Full fine-tuning requires significantly more memory
BATCH_SIZE = 4  # Start smaller, adjust based on your CUDA memory
GRADIENT_ACCUMULATION_STEPS = 8  # Increase to compensate for smaller batch size

# --- Dataset Configuration ---
# Example for SQuAD
MAX_SEQ_LENGTH = 320  # Max input sequence length for QA
DOC_STRIDE = 128  # Stride for overlapping chunks for long documents

print(f"Using device: {DEVICE}")
print(f"Base model: {BASE_MODEL_NAME}")
# Update print statement to reflect PEFT status
print(f"Using PEFT (LoRA): {USE_PEFT} - Full Fine-tuning Enabled")
code/memory.py
ADDED
@@ -0,0 +1,187 @@
# memory.py
from collections import deque
import torch
import torch.nn.functional as F
import config

class ActiveMemory:
    """
    An active memory module that stores and retrieves examples to enhance reasoning.
    Supports both logging for analysis and retrieval for improved predictions.
    """
    def __init__(self, max_size=config.MEMORY_MAX_SIZE, retrieval_k=config.MEMORY_RETRIEVAL_K):
        self.max_size = max_size
        self.retrieval_k = retrieval_k
        self.memory = deque(maxlen=max_size)
        self.device = config.DEVICE
        print(f"Initialized ActiveMemory with max size {self.max_size}, retrieval_k={self.retrieval_k}")

    def add(self, input_data, hidden_states, output, reasoning_trace, final_hidden_states=None, final_output=None):
        """
        Adds a new entry to the memory.

        Args:
            input_data: The input to the model (tokenized IDs, attention masks, etc.)
            hidden_states (H0): Initial hidden states from the base model
            output (y0): Initial prediction from the model
            reasoning_trace (T): Reasoning trace (all hidden states)
            final_hidden_states (H1, optional): Final hidden states after retroactive update
            final_output (y1, optional): Final prediction after retroactive update
        """
        # Create a memory entry with detached tensors moved to CPU
        entry = {
            'input_ids': input_data.get('input_ids', None).cpu().detach() if input_data.get('input_ids', None) is not None else None,
            'attention_mask': input_data.get('attention_mask', None).cpu().detach() if input_data.get('attention_mask', None) is not None else None,
            'token_type_ids': input_data.get('token_type_ids', None).cpu().detach() if input_data.get('token_type_ids', None) is not None else None,
            'hidden_states': hidden_states.cpu().detach(),
            'output': {k: v.cpu().detach() for k, v in output.items()} if isinstance(output, dict) else output.cpu().detach(),
            'reasoning_trace': tuple(h.cpu().detach() for h in reasoning_trace) if isinstance(reasoning_trace, tuple) else reasoning_trace.cpu().detach(),
        }

        # Add final states if provided
        if final_hidden_states is not None:
            entry['final_hidden_states'] = final_hidden_states.cpu().detach()
        if final_output is not None:
            entry['final_output'] = {k: v.cpu().detach() for k, v in final_output.items()} if isinstance(final_output, dict) else final_output.cpu().detach()

        # Compute and store a summary vector for efficient retrieval
        # Use mean pooling of hidden states as the summary vector
        if entry['hidden_states'] is not None and entry['attention_mask'] is not None:
            # Mean pooling with attention mask
            mask = entry['attention_mask'].unsqueeze(-1).float()
            masked_embeddings = entry['hidden_states'] * mask
            sum_embeddings = torch.sum(masked_embeddings, dim=1)
            sum_mask = torch.sum(mask, dim=1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            entry['summary_vector'] = (sum_embeddings / sum_mask).squeeze(0)
        else:
            # Fallback to simple mean if attention mask is not available
            entry['summary_vector'] = entry['hidden_states'].mean(dim=1).squeeze(0)

        self.memory.append(entry)

    def retrieve(self, query_hidden_states, query_attention_mask=None, k=None):
        """
        Retrieves the k most similar examples from memory based on hidden state similarity.

        Args:
            query_hidden_states: Hidden states to compare against memory
            query_attention_mask: Attention mask for the query
            k: Number of examples to retrieve (defaults to self.retrieval_k)

        Returns:
            List of retrieved memory entries, ordered by similarity (most similar first)
        """
        if len(self.memory) == 0:
            return []

        if k is None:
            k = self.retrieval_k

        k = min(k, len(self.memory))

        # Compute query summary vector (mean pooling with attention mask)
        if query_attention_mask is not None:
            mask = query_attention_mask.unsqueeze(-1).float()
            masked_embeddings = query_hidden_states * mask
            sum_embeddings = torch.sum(masked_embeddings, dim=1)
            sum_mask = torch.sum(mask, dim=1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            query_vector = (sum_embeddings / sum_mask).squeeze(0)
        else:
            query_vector = query_hidden_states.mean(dim=1).squeeze(0)

        # Move query vector to CPU for comparison with memory
        query_vector = query_vector.cpu().detach()

        # Compute similarities with all memory entries
        similarities = []
        for i, entry in enumerate(self.memory):
            memory_vector = entry['summary_vector']
            # Compute cosine similarity
            similarity = F.cosine_similarity(query_vector, memory_vector, dim=0)
            similarities.append((i, similarity.item()))

        # Sort by similarity (descending) and get top k
        similarities.sort(key=lambda x: x[1], reverse=True)
        top_k_indices = [idx for idx, _ in similarities[:k]]

        # Retrieve the top k entries
        retrieved_entries = [self.memory[idx] for idx in top_k_indices]

        # Move retrieved entries to the same device as the query
        device = query_hidden_states.device
        for entry in retrieved_entries:
            # Only move the tensors we'll actually use (hidden_states and final_hidden_states)
            if 'hidden_states' in entry:
                entry['hidden_states'] = entry['hidden_states'].to(device)
            if 'final_hidden_states' in entry:
                entry['final_hidden_states'] = entry['final_hidden_states'].to(device)

        return retrieved_entries

    def get_memory_context(self, query_hidden_states, query_attention_mask=None):
        """
        Retrieves and processes memory entries to create a context tensor for the model.

        Args:
            query_hidden_states: Hidden states to compare against memory
            query_attention_mask: Attention mask for the query

        Returns:
            memory_context: Tensor of shape (batch_size, seq_len, hidden_dim) containing
                processed memory information, or None if memory is empty
        """
        # Retrieve similar examples from memory
        retrieved = self.retrieve(query_hidden_states, query_attention_mask)

        if not retrieved:
            return None

        # Use the device of the query
        device = query_hidden_states.device
        batch_size, seq_len, hidden_dim = query_hidden_states.shape

        # Process retrieved examples to create memory context
        # Strategy: Average the final hidden states of retrieved examples
        memory_tensors = []
        for entry in retrieved:
            # Prefer final hidden states if available, otherwise use initial hidden states
            if 'final_hidden_states' in entry and entry['final_hidden_states'] is not None:
                memory_tensors.append(entry['final_hidden_states'])
            elif 'hidden_states' in entry:
                memory_tensors.append(entry['hidden_states'])

        if not memory_tensors:
            return None

        # Average the memory tensors
        # First ensure all tensors have the same sequence length by padding or truncating
        padded_tensors = []
        for tensor in memory_tensors:
            if tensor.size(1) < seq_len:
                # Pad
                padding = torch.zeros(1, seq_len - tensor.size(1), hidden_dim, device=device)
                padded_tensor = torch.cat([tensor, padding], dim=1)
                padded_tensors.append(padded_tensor)
            elif tensor.size(1) > seq_len:
                # Truncate
                padded_tensors.append(tensor[:, :seq_len, :])
            else:
                padded_tensors.append(tensor)

        # Stack and average
        memory_context = torch.stack(padded_tensors).mean(dim=0)

        # Expand to match batch size if needed
        if memory_context.size(0) == 1 and batch_size > 1:
            memory_context = memory_context.expand(batch_size, -1, -1)

        return memory_context

    def clear(self):
        """Clears all entries from memory."""
        self.memory.clear()

    def __len__(self):
        return len(self.memory)
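
A minimal usage sketch of `ActiveMemory` (hypothetical shapes; assumes `code/` is on the Python path so that `config.py` is importable):

```python
import torch
from memory import ActiveMemory

mem = ActiveMemory(max_size=10, retrieval_k=2)
h0 = torch.randn(1, 8, 768)                       # (batch, seq_len, hidden_dim)
mask = torch.ones(1, 8, dtype=torch.long)

mem.add(
    input_data={'input_ids': torch.randint(0, 30522, (1, 8)), 'attention_mask': mask},
    hidden_states=h0,
    output={'start_logits': torch.randn(1, 8), 'end_logits': torch.randn(1, 8)},
    reasoning_trace=(h0,),
)

# Average of the most similar stored hidden states, aligned to the query length
ctx = mem.get_memory_context(torch.randn(1, 8, 768), mask)
```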
code/model.py
ADDED
@@ -0,0 +1,368 @@
# model.py (Enhanced RRN Implementation)
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForQuestionAnswering, AutoConfig, AutoModel
from transformers.modeling_outputs import QuestionAnsweringModelOutput

import config
from modules import CrossAttentionDelta, GatingMechanism, EnhancedQAHead
from memory import ActiveMemory

class EnhancedRRN_QA_Model(nn.Module):
    """
    Enhanced Retroactive Reasoning Network for Question Answering.
    Improvements:
    1. Delta magnitude constraint
    2. Gating mechanism
    3. Multi-step reasoning
    4. Active memory usage
    5. Enhanced QA head
    6. Improved cross-attention
    """
    def __init__(self, model_name=config.BASE_MODEL_NAME):
        super().__init__()
        self.model_name = model_name

        # --- Configuration ---
        self.num_reasoning_steps = config.NUM_REASONING_STEPS
        self.delta_target_ratio = config.DELTA_TARGET_RATIO

        # --- Dynamic Reasoning Steps Configuration ---
        self.use_dynamic_steps = config.USE_DYNAMIC_STEPS
        self.max_reasoning_steps = config.MAX_REASONING_STEPS
        self.min_reasoning_steps = config.MIN_REASONING_STEPS
        self.reasoning_step_type = config.REASONING_STEP_TYPE
        self.early_stop_threshold = config.EARLY_STOP_THRESHOLD

        # --- Load Base Model Configuration ---
        self.base_config = AutoConfig.from_pretrained(
            self.model_name,
            output_hidden_states=True,  # Crucial for Reasoning Trace (T)
        )
        self.hidden_dim = self.base_config.hidden_size

        # Add step controller for learned approach (after hidden_dim is defined)
        if self.use_dynamic_steps and self.reasoning_step_type == "learned":
            self.step_controller = nn.Sequential(
                nn.Linear(self.hidden_dim, 128),
                nn.ReLU(),
                nn.Linear(128, self.max_reasoning_steps - self.min_reasoning_steps + 1)
            )
            print(f"Using learned dynamic reasoning steps (min={self.min_reasoning_steps}, max={self.max_reasoning_steps})")

        # --- Load Base Model ---
        self.base_model = AutoModel.from_pretrained(
            self.model_name,
            config=self.base_config
        )
        print(f"Loaded base model: {self.model_name}")
        print(f"Hidden dimension: {self.hidden_dim}")
        print(f"Using {self.num_reasoning_steps} reasoning steps")

        # --- Enhanced RRN Components ---
        # Improved cross-attention delta mechanism
        self.retroactive_update_layer = CrossAttentionDelta(self.hidden_dim)

        # Gating mechanism for selective updates
        self.gating_mechanism = GatingMechanism(self.hidden_dim)

        # Enhanced QA head with deeper architecture and bilinear scoring
        self.qa_head = EnhancedQAHead(self.hidden_dim)

        # --- Active Memory Module ---
        self.memory = ActiveMemory(
            max_size=config.MEMORY_MAX_SIZE,
            retrieval_k=config.MEMORY_RETRIEVAL_K
        )

        # --- Loss Functions ---
        self.coherence_loss_fn = nn.MSELoss()
        self.delta_reg_loss_fn = nn.MSELoss()

    def _apply_delta_constraint(self, delta, h0, is_training=False):
        """
        Apply delta magnitude constraint to prevent destabilizing updates.

        Args:
            delta: The computed delta
            h0: The initial hidden states
            is_training: Whether we're in training mode

        Returns:
            constrained_delta: The constrained delta
            delta_reg_loss: Regularization loss for delta magnitude (if training)
        """
        # Compute delta and h0 norms
        delta_norm = delta.norm(dim=-1, keepdim=True)
        h0_norm = h0.norm(dim=-1, keepdim=True).detach()

        # Compute ratio
        ratio = delta_norm / (h0_norm + 1e-9)

        # Compute regularization loss if in training
        delta_reg_loss = None
        if is_training:
            # Target ratio tensor (same shape as ratio)
            target_ratio = torch.ones_like(ratio) * self.delta_target_ratio
            delta_reg_loss = self.delta_reg_loss_fn(ratio, target_ratio)

        # Apply direct constraint (both during training and inference)
        # Only scale down deltas that are too large
        scale_factor = torch.ones_like(ratio)
        too_large = ratio > self.delta_target_ratio
        if too_large.any():
            scale_factor[too_large] = self.delta_target_ratio / ratio[too_large]

        # Apply scaling
        constrained_delta = delta * scale_factor

        return constrained_delta, delta_reg_loss

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        use_memory=True
    ):
        return_dict = return_dict if return_dict is not None else self.base_config.use_return_dict
        is_training = self.training

        # === 1. Initial Forward Pass ===
        # Determine if token_type_ids should be passed
        include_token_type_ids = token_type_ids is not None

        if include_token_type_ids:
            outputs = self.base_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                output_hidden_states=True,
                output_attentions=output_attentions if output_attentions is not None else self.base_config.output_attentions,
                return_dict=True
            )
        else:
            outputs = self.base_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
                output_attentions=output_attentions if output_attentions is not None else self.base_config.output_attentions,
                return_dict=True
            )

        # H(0): Last hidden state from the base model
        h0 = outputs.last_hidden_state

        # T: Reasoning Trace (all hidden states)
        reasoning_trace_T = outputs.hidden_states

        # y^(0): Initial QA prediction using H(0)
        y0_output = self.qa_head(h0)
        y0_start_logits, y0_end_logits = y0_output["start_logits"], y0_output["end_logits"]

        # === 2. Memory Integration (if enabled) ===
        memory_context = None
        if use_memory and (is_training and config.MEMORY_USE_DURING_TRAINING or not is_training):
            if len(self.memory) > 0:
                memory_context = self.memory.get_memory_context(h0, attention_mask)

        # === 3. Multi-step Reasoning ===
        # Initialize current hidden state
        h_current = h0

        # Store all deltas and gates for loss calculation and analysis
        all_deltas = []
        all_gates = []
        all_hidden_states = [h0]

        # Determine number of reasoning steps to use
        actual_steps_taken = 0

        if hasattr(self, 'use_dynamic_steps') and self.use_dynamic_steps:
            if self.reasoning_step_type == "learned":
                # Pool sequence dimension to get a single vector per example
                pooled_h0 = h0.mean(dim=1)

                # Get step logits from controller
                step_logits = self.step_controller(pooled_h0)

                if is_training:
                    # During training, sample from distribution (exploration)
                    step_probs = F.softmax(step_logits, dim=-1)
                    steps_idx = torch.multinomial(step_probs, 1).squeeze(-1)
                    num_steps = steps_idx + self.min_reasoning_steps
                else:
                    # During inference, take argmax (exploitation)
                    steps_idx = torch.argmax(step_logits, dim=-1)
                    num_steps = steps_idx + self.min_reasoning_steps

                # Store step logits for analysis
                step_probs = F.softmax(step_logits, dim=-1)

                # Get the maximum number of steps across the batch
                max_num_steps = num_steps.max().item()
            elif self.reasoning_step_type == "confidence":
                # For confidence-based, we'll determine dynamically during the loop
                max_num_steps = self.max_reasoning_steps
            else:
                # Fallback to fixed steps
                max_num_steps = self.num_reasoning_steps
        else:
            # Use fixed number of steps
            max_num_steps = self.num_reasoning_steps

        # Perform reasoning steps
        for step in range(max_num_steps):
            # For confidence-based, check if we should continue for each example
            if hasattr(self, 'use_dynamic_steps') and self.use_dynamic_steps and self.reasoning_step_type == "confidence" and step >= self.min_reasoning_steps:
                # Check delta magnitude from previous step
                if len(all_deltas) > 0:
                    prev_delta = all_deltas[-1]
                    delta_norm = prev_delta.norm(dim=-1).mean().item()
                    if delta_norm < self.early_stop_threshold:
                        break

            # For learned approach, check if we've reached the determined number of steps
            if hasattr(self, 'use_dynamic_steps') and self.use_dynamic_steps and self.reasoning_step_type == "learned":
                # Create a mask for examples that should continue
                if step > 0:  # Skip first step check since all examples need at least 1 step
                    # Check which examples should continue
                    continue_mask = (step < num_steps).float().unsqueeze(-1).unsqueeze(-1)

                    # If no examples need more steps, break
                    if continue_mask.sum() == 0:
                        break

            # Compute delta using the current hidden state and reasoning trace
            if config.BYPASS_DELTA_CALCULATION:
                # Bypass delta calculation for testing
                delta = torch.zeros_like(h_current)
                attn_weights = None
            else:
                delta, attn_weights = self.retroactive_update_layer(h_current, reasoning_trace_T)

            # Apply delta magnitude constraint
            constrained_delta, delta_reg_loss = self._apply_delta_constraint(delta, h0, is_training)

            # For learned approach with continue_mask, apply mask to delta
            if hasattr(self, 'use_dynamic_steps') and self.use_dynamic_steps and self.reasoning_step_type == "learned" and step > 0:
                constrained_delta = constrained_delta * continue_mask

            # Compute gate values for selective update
            gate = self.gating_mechanism(h_current, constrained_delta)

            # Apply gated update
            h_current = h_current + gate * constrained_delta

            # Store for later use
            all_deltas.append(constrained_delta)
            all_gates.append(gate)
            all_hidden_states.append(h_current)
            actual_steps_taken = step + 1

        # Final hidden state after all reasoning steps
        h_final = h_current

        # === 4. Final Prediction ===
        y_final_output = self.qa_head(h_final)
        y_final_start_logits, y_final_end_logits = y_final_output["start_logits"], y_final_output["end_logits"]

        # === 5. Loss Calculation ===
        total_loss = None
        loss_components = {}

        if start_positions is not None and end_positions is not None:
            # Prepare ground truth positions
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            ignored_index = y_final_start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Task Loss (QA Loss)
            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(y_final_start_logits, start_positions)
            end_loss = loss_fct(y_final_end_logits, end_positions)
            task_loss = (start_loss + end_loss) / 2
            loss_components["task_loss"] = task_loss.item()

            # Coherence Loss
            coherence_loss_start = self.coherence_loss_fn(y0_start_logits, y_final_start_logits.detach())
            coherence_loss_end = self.coherence_loss_fn(y0_end_logits, y_final_end_logits.detach())
            coherence_loss = (coherence_loss_start + coherence_loss_end) / 2
            loss_components["coherence_loss"] = coherence_loss.item()

            # Delta Regularization Loss (if computed)
            if delta_reg_loss is not None:
                loss_components["delta_reg_loss"] = delta_reg_loss.item()

            # Total Loss
            total_loss = task_loss + config.LAMBDA_COHERENCE * coherence_loss

            # Add delta regularization if computed
            if delta_reg_loss is not None:
                total_loss = total_loss + config.LAMBDA_DELTA_REG * delta_reg_loss

        # === 6. Memory Update ===
        if use_memory:
            # Prepare input data
            input_data = {'input_ids': input_ids, 'attention_mask': attention_mask}
            if token_type_ids is not None:
                input_data['token_type_ids'] = token_type_ids

            # Prepare outputs
            initial_output = {'start_logits': y0_start_logits, 'end_logits': y0_end_logits}
            final_output = {'start_logits': y_final_start_logits, 'end_logits': y_final_end_logits}

            # Add to memory (during both training and inference if enabled)
            if is_training and config.MEMORY_USE_DURING_TRAINING or not is_training:
                self.memory.add(
                    input_data=input_data,
                    hidden_states=h0,
                    output=initial_output,
                    reasoning_trace=reasoning_trace_T,
                    final_hidden_states=h_final,
                    final_output=final_output
                )

        # === 7. Return Outputs ===
        if not return_dict:
            output = (y_final_start_logits, y_final_end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        # Store custom outputs as instance attributes for later access if needed
        # This avoids passing them to QuestionAnsweringModelOutput which doesn't accept them
        self.custom_outputs = {
            "initial_hidden_states": h0,
            "final_hidden_states": h_final,
            "all_hidden_states": all_hidden_states,
            "all_deltas": all_deltas,
            "all_gates": all_gates,
            "y0_start_logits": y0_start_logits,
            "y0_end_logits": y0_end_logits,
            "loss_components": loss_components if total_loss is not None else None,
            "steps_taken": actual_steps_taken
        }

        # Add step controller outputs if using learned approach
        if self.use_dynamic_steps and self.reasoning_step_type == "learned":
            self.custom_outputs["step_probs"] = step_probs
            self.custom_outputs["num_steps"] = num_steps

        # Return standard QuestionAnsweringModelOutput without custom fields
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=y_final_start_logits,
            end_logits=y_final_end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions
        )
code/modules.py
ADDED
@@ -0,0 +1,196 @@
# modules.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import config

class CrossAttentionDelta(nn.Module):
    """
    Enhanced version of CrossAttentionDelta that computes the update delta (Δ) using cross-attention.
    Improvements:
    1. Pre-norm architecture (layer norm before attention)
    2. More sophisticated attention patterns
    3. Ability to incorporate reasoning trace
    """
    def __init__(self, hidden_dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # Pre-norm layer normalization (applied before attention)
        self.pre_norm = nn.LayerNorm(hidden_dim)

        # Cross-attention mechanism
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        # Post-attention layer normalization
        self.post_norm = nn.LayerNorm(hidden_dim)

        # Trace integration module (to incorporate reasoning trace T)
        self.trace_integration = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Enhanced MLP for delta computation
        self.delta_mlp = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim * 4),  # Larger intermediate expansion
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 4, hidden_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim)
        )

        # Final layer normalization
        self.final_norm = nn.LayerNorm(hidden_dim)

    def forward(self, h0, reasoning_trace=None):
        """
        Args:
            h0 (torch.Tensor): Initial hidden states (batch_size, seq_len, hidden_dim).
            reasoning_trace (tuple of torch.Tensor, optional): Reasoning trace from base model.
                Each tensor has shape (batch_size, seq_len, hidden_dim).

        Returns:
            delta (torch.Tensor): The computed update delta (batch_size, seq_len, hidden_dim).
            attn_weights (torch.Tensor): Attention weights from the attention layer.
        """
        batch_size, seq_len, _ = h0.shape

        # --- Pre-norm Architecture ---
        # Apply layer normalization before attention (pre-norm)
        h0_norm = self.pre_norm(h0)

        # --- Enhanced Cross-Attention ---
        # Get attention weights to visualize attention patterns
        attn_output, attn_weights = self.cross_attn(
            query=h0_norm,
            key=h0_norm,
            value=h0_norm,
            need_weights=True
        )

        # Residual connection and post-norm
        c = self.post_norm(h0 + attn_output)

        # --- Reasoning Trace Integration (if provided) ---
        if reasoning_trace is not None and len(reasoning_trace) > 0:
            # Use the last layer from the reasoning trace (most semantic)
            last_layer = reasoning_trace[-1]

            # Integrate the reasoning trace with the current context
            trace_info = self.trace_integration(
                torch.cat([c, last_layer], dim=-1)
            )

            # Add the trace information to the context
            c = c + trace_info

        # --- Enhanced MLP for Delta ---
        # Concatenate original h0 with context c
        mlp_input = torch.cat((h0, c), dim=-1)

        # Compute delta through enhanced MLP
        delta = self.delta_mlp(mlp_input)

        # Apply final normalization
        delta = self.final_norm(delta)

        return delta, attn_weights

class GatingMechanism(nn.Module):
    """
    Gating mechanism to selectively apply updates.
    Learns when to apply the delta update based on the hidden state and delta.
    """
    def __init__(self, hidden_dim, dropout=0.1):
        super().__init__()
        self.gate_network = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()  # Output between 0 and 1
        )

    def forward(self, h0, delta):
        """
        Args:
            h0 (torch.Tensor): Initial hidden states (batch_size, seq_len, hidden_dim).
            delta (torch.Tensor): Computed delta (batch_size, seq_len, hidden_dim).

        Returns:
            gate (torch.Tensor): Gate values between 0 and 1 (batch_size, seq_len, 1).
        """
        # Concatenate h0 and delta
        gate_input = torch.cat([h0, delta], dim=-1)

        # Compute gate values
        gate = self.gate_network(gate_input)

        return gate

class EnhancedQAHead(nn.Module):
    """
    Enhanced Question Answering head with deeper architecture and bilinear scoring.
    """
    def __init__(self, hidden_dim, dropout=0.1):
        super().__init__()

        # Deeper representation before prediction
        self.start_transform = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim)
        )

        self.end_transform = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Bilinear layer for start position scoring
        self.start_bilinear = nn.Bilinear(hidden_dim, hidden_dim, 1)

        # Bilinear layer for end position scoring
        self.end_bilinear = nn.Bilinear(hidden_dim, hidden_dim, 1)

        # Global representation for bilinear scoring
        self.global_rep = nn.Parameter(torch.randn(hidden_dim))

    def forward(self, hidden_states):
        """
        Args:
            hidden_states (torch.Tensor): Hidden states (batch_size, seq_len, hidden_dim).

        Returns:
            dict: Dictionary with start_logits and end_logits.
        """
        batch_size, seq_len, hidden_dim = hidden_states.shape

        # Transform hidden states
        start_rep = self.start_transform(hidden_states)
        end_rep = self.end_transform(hidden_states)

        # Expand global representation for batch processing
        global_rep = self.global_rep.expand(batch_size, seq_len, -1)

        # Compute start and end logits using bilinear scoring
        start_logits = self.start_bilinear(start_rep, global_rep).squeeze(-1)
        end_logits = self.end_bilinear(end_rep, global_rep).squeeze(-1)

        return {"start_logits": start_logits, "end_logits": end_logits}
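
For context, a minimal sketch of turning the head's start/end logits into an answer span. This is greedy decoding under the assumption that the start precedes the end; the full SQuAD post-processing in code/test_model.py is more careful:

```python
import torch
from modules import EnhancedQAHead  # assumes code/ is on the Python path

head = EnhancedQAHead(hidden_dim=768)
hidden = torch.randn(1, 16, 768)          # e.g., H(final) for one example
logits = head(hidden)

start = int(torch.argmax(logits["start_logits"], dim=-1))
end = int(torch.argmax(logits["end_logits"], dim=-1))
if start <= end:
    answer_token_span = (start, end)      # map back to text via offset_mapping
```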
code/test_model.py
ADDED
@@ -0,0 +1,536 @@
| 1 |
+
# test_model.py - RRN QA Model evaluation script with multi-step reasoning support
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data import DataLoader
|
| 4 |
+
from transformers import AutoTokenizer, AutoModel, default_data_collator
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from tqdm.auto import tqdm
|
| 7 |
+
import os
|
| 8 |
+
import evaluate as hf_evaluate # Import with alias to avoid naming conflict
|
| 9 |
+
import collections
|
| 10 |
+
import numpy as np
|
| 11 |
+
import logging
|
| 12 |
+
import multiprocessing # For Windows multiprocessing support
|
| 13 |
+
import json
|
| 14 |
+
import argparse
|
| 15 |
+
import matplotlib.pyplot as plt
|
| 16 |
+
from collections import defaultdict
|
| 17 |
+
|
| 18 |
+
# Import custom modules and config
|
| 19 |
+
import config
|
| 20 |
+
from model import EnhancedRRN_QA_Model # Import the enhanced model
|
| 21 |
+
# Make sure memory.py and modules.py are accessible
|
| 22 |
+
|
| 23 |
+
# --- Configuration ---
|
| 24 |
+
logging.basicConfig(level=logging.INFO)
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
def main():
|
| 28 |
+
# Parse command line arguments
|
| 29 |
+
parser = argparse.ArgumentParser(description="Test RRN QA Model")
|
| 30 |
+
parser.add_argument("--checkpoint", type=str, default="./rrn_qa_model_epoch_3",
|
| 31 |
+
help="Path to checkpoint directory (default: ./rrn_qa_model_epoch_3)")
|
| 32 |
+
parser.add_argument("--batch_size", type=int, default=8,
|
| 33 |
+
help="Evaluation batch size (default: 8)")
|
| 34 |
+
parser.add_argument("--fixed_steps", type=int, default=None,
|
| 35 |
+
help="Override to use fixed number of reasoning steps (default: None, use model's dynamic steps)")
|
| 36 |
+
parser.add_argument("--use_memory", action="store_true",
|
| 37 |
+
help="Enable active memory during evaluation")
|
| 38 |
+
parser.add_argument("--output_dir", type=str, default="./eval_results",
|
| 39 |
+
help="Directory to save evaluation results (default: ./eval_results)")
|
| 40 |
+
parser.add_argument("--visualize", action="store_true",
|
| 41 |
+
help="Generate visualizations of reasoning steps")
|
| 42 |
+
args = parser.parse_args()
|
| 43 |
+
|
| 44 |
+
CHECKPOINT_DIR = args.checkpoint
|
| 45 |
+
EVAL_BATCH_SIZE = args.batch_size
|
| 46 |
+
DEVICE = config.DEVICE
|
| 47 |
+
USE_MEMORY = args.use_memory
|
| 48 |
+
OUTPUT_DIR = args.output_dir
|
| 49 |
+
|
| 50 |
+
# Create output directory if it doesn't exist
|
| 51 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 52 |
+
|
| 53 |
+
logger.info(f"Evaluation configuration:")
|
| 54 |
+
logger.info(f" Checkpoint: {CHECKPOINT_DIR}")
|
| 55 |
+
logger.info(f" Batch size: {EVAL_BATCH_SIZE}")
|
| 56 |
+
logger.info(f" Device: {DEVICE}")
|
| 57 |
+
logger.info(f" Use memory: {USE_MEMORY}")
|
| 58 |
+
logger.info(f" Output directory: {OUTPUT_DIR}")
|
| 59 |
+
if args.fixed_steps is not None:
|
| 60 |
+
logger.info(f" Using fixed {args.fixed_steps} reasoning steps (overriding model config)")
|
| 61 |
+
|
| 62 |
+
    # --- 1. Load Tokenizer and Model from Checkpoint ---
    logger.info(f"Loading tokenizer from {CHECKPOINT_DIR}...")
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

    logger.info("Loading Enhanced RRN QA Model architecture...")
    # Instantiate the enhanced model architecture
    model = EnhancedRRN_QA_Model(config.BASE_MODEL_NAME)

    # Check if we're loading from a checkpoint with the enhanced architecture
    base_model_path = os.path.join(CHECKPOINT_DIR, "base_model")
    qa_head_path = os.path.join(CHECKPOINT_DIR, "qa_head.pth")
    retroactive_layer_path = os.path.join(CHECKPOINT_DIR, "retroactive_layer.pth")
    gating_mechanism_path = os.path.join(CHECKPOINT_DIR, "gating_mechanism.pth")
    step_controller_path = os.path.join(CHECKPOINT_DIR, "step_controller.pth")

    # Check for required components
    if not os.path.exists(base_model_path):
        logger.error(f"Base model directory not found at: {base_model_path}")
        exit()
    if not os.path.exists(qa_head_path):
        logger.error(f"QA head weights not found at: {qa_head_path}")
        exit()
    if not os.path.exists(retroactive_layer_path):
        logger.error(f"Retroactive layer weights not found at: {retroactive_layer_path}")
        exit()

    # Load base model weights
    logger.info(f"Loading base model weights from {base_model_path}...")
    model.base_model = AutoModel.from_pretrained(base_model_path)

    # Check if we're loading from an enhanced checkpoint or a legacy checkpoint
    is_enhanced_checkpoint = os.path.exists(gating_mechanism_path)

    if is_enhanced_checkpoint:
        # Load all enhanced components
        logger.info("Loading enhanced model components...")
        model.qa_head.load_state_dict(torch.load(qa_head_path, map_location='cpu'))
        model.retroactive_update_layer.load_state_dict(torch.load(retroactive_layer_path, map_location='cpu'))
        model.gating_mechanism.load_state_dict(torch.load(gating_mechanism_path, map_location='cpu'))

        # Load step controller if available (for learned dynamic steps)
        if os.path.exists(step_controller_path) and hasattr(model, "step_controller"):
            logger.info("Loading step controller for learned dynamic steps...")
            model.step_controller.load_state_dict(torch.load(step_controller_path, map_location='cpu'))

        logger.info("Enhanced model loaded successfully.")
    else:
        # We're loading from a legacy checkpoint - need to adapt the weights
        logger.info("Loading from legacy checkpoint - adapting weights to enhanced architecture...")

        # For the QA head, we need to initialize the enhanced QA head from scratch,
        # since the architectures are different
        logger.info("Initializing enhanced QA head with random weights...")

        # For the retroactive layer, we can try to load the weights, but adjustments may be needed
        logger.warning("Note: The enhanced model uses a different architecture than the checkpoint.")
        logger.warning("Some components will use random initialization.")

    # Load enhanced config if available
    enhanced_config_path = os.path.join(CHECKPOINT_DIR, "enhanced_config.json")
    if os.path.exists(enhanced_config_path):
        logger.info(f"Loading enhanced configuration from {enhanced_config_path}")
        with open(enhanced_config_path, 'r') as f:
            enhanced_config = json.load(f)

        # Override model configuration with saved values
        if "num_reasoning_steps" in enhanced_config:
            model.num_reasoning_steps = enhanced_config["num_reasoning_steps"]
            logger.info(f"Using {model.num_reasoning_steps} reasoning steps from config")

        if "use_dynamic_steps" in enhanced_config:
            model.use_dynamic_steps = enhanced_config["use_dynamic_steps"]
            if model.use_dynamic_steps:
                model.max_reasoning_steps = enhanced_config.get("max_reasoning_steps", config.MAX_REASONING_STEPS)
                model.min_reasoning_steps = enhanced_config.get("min_reasoning_steps", config.MIN_REASONING_STEPS)
                model.reasoning_step_type = enhanced_config.get("reasoning_step_type", config.REASONING_STEP_TYPE)
                model.early_stop_threshold = enhanced_config.get("early_stop_threshold", config.EARLY_STOP_THRESHOLD)
                logger.info(f"Using dynamic reasoning steps (type: {model.reasoning_step_type})")
                logger.info(f"Min steps: {model.min_reasoning_steps}, Max steps: {model.max_reasoning_steps}")

    # Override with fixed steps if specified
    if args.fixed_steps is not None:
        logger.info(f"Overriding with fixed {args.fixed_steps} reasoning steps")
        model.use_dynamic_steps = False
        model.num_reasoning_steps = args.fixed_steps

    model.to(DEVICE)
    model.eval()  # Set model to evaluation mode
    logger.info("Model loaded successfully and set to evaluation mode.")
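    # For reference, the checkpoint layout this loader expects, inferred from
    # the paths checked above (a sketch; only these names are looked for):
    #
    #   CHECKPOINT_DIR/
    #       base_model/             <- saved via save_pretrained
    #       qa_head.pth
    #       retroactive_layer.pth
    #       gating_mechanism.pth    <- presence marks an "enhanced" checkpoint
    #       step_controller.pth     <- optional; learned dynamic steps only
    #       enhanced_config.json    <- optional; overrides reasoning-step settings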
    # --- 2. Load and Preprocess Validation Dataset ---
    logger.info("Loading SQuAD validation dataset...")
    raw_datasets = load_dataset("squad", split="validation")

    question_column_name = "question"
    context_column_name = "context"
    answer_column_name = "answers"
    pad_on_right = tokenizer.padding_side == "right"

    # Validation preprocessing: keep example_id and offset_mapping
    def prepare_validation_features(examples):
        examples[question_column_name] = [q.strip() for q in examples[question_column_name]]
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=config.MAX_SEQ_LENGTH,
            stride=config.DOC_STRIDE,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # Keep track of which feature belongs to which example
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # Add the example_id to link features to original examples
        tokenized_examples["example_id"] = []
        for i in range(len(tokenized_examples["input_ids"])):
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set offset mapping to None for question tokens to avoid predicting answers there
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    logger.info("Preprocessing validation dataset...")
    # Disable multiprocessing, which can hang on some systems
    logger.info("Using single process for preprocessing to prevent hanging")
    eval_dataset = raw_datasets.map(
        prepare_validation_features,
        batched=True,
        remove_columns=raw_datasets.column_names,
        num_proc=1,  # Disable multiprocessing to avoid hanging
    )

    # Custom collator to handle None values in offset_mapping
    def custom_data_collator(features):
        # First, remove offset_mapping, which contains None values that can't be batched
        offset_mappings = [f.pop("offset_mapping") for f in features]

        # Use default collator for everything else
        batch = default_data_collator(features)

        # Add offset_mapping back as a list since it can't be converted to a tensor
        batch["offset_mapping"] = offset_mappings

        return batch

    # Use custom data collator
    data_collator = custom_data_collator

    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=data_collator,
        batch_size=EVAL_BATCH_SIZE
    )
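    # Illustration (hypothetical numbers, not the repo defaults): with
    # max_length=384 and stride=128, a question+context pair that tokenizes to
    # ~600 tokens is split into two overlapping features. overflow_to_sample_mapping
    # ties each feature back to its source example, and example_id lets the
    # post-processing step merge the per-feature predictions again.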
    # --- 3. Run Inference ---
    logger.info("***** Running Evaluation *****")
    logger.info(f"  Num examples = {len(eval_dataset)}")
    logger.info(f"  Batch size = {EVAL_BATCH_SIZE}")

    all_start_logits = []
    all_end_logits = []
    feature_indices = []  # Keep track of the order

    # Track multi-step reasoning metrics
    reasoning_steps_taken = []
    delta_magnitudes = []
    gate_values = []
    initial_vs_final_changes = []

    with torch.no_grad():
        for step, batch in enumerate(tqdm(eval_dataloader, desc="Evaluating")):
            # Move batch to device
            batch_on_device = {k: v.to(DEVICE) for k, v in batch.items() if isinstance(v, torch.Tensor)}
            # Store feature indices corresponding to this batch
            # Assuming 'input_ids' or a similar key represents features in order
            current_indices = list(range(step * EVAL_BATCH_SIZE, step * EVAL_BATCH_SIZE + len(batch_on_device['input_ids'])))
            feature_indices.extend(current_indices)

            # Forward pass - pass only the inputs needed by model.forward
            outputs = model(
                input_ids=batch_on_device.get("input_ids"),
                attention_mask=batch_on_device.get("attention_mask"),
                token_type_ids=batch_on_device.get("token_type_ids"),
                use_memory=USE_MEMORY,  # Use memory if enabled
                return_dict=True
            )

            # Get the final logits (y1)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            all_start_logits.append(start_logits.cpu().numpy())
            all_end_logits.append(end_logits.cpu().numpy())

            # Collect multi-step reasoning metrics from custom_outputs
            if hasattr(model, 'custom_outputs'):
                # Number of reasoning steps taken
                if 'steps_taken' in model.custom_outputs:
                    reasoning_steps_taken.append(model.custom_outputs['steps_taken'])

                # Delta magnitudes (how much the model updates at each step)
                if 'all_deltas' in model.custom_outputs and len(model.custom_outputs['all_deltas']) > 0:
                    batch_deltas = []
                    for delta in model.custom_outputs['all_deltas']:
                        # Calculate mean delta magnitude across the sequence dimension
                        delta_norm = delta.norm(dim=-1).mean().cpu().item()
                        batch_deltas.append(delta_norm)
                    delta_magnitudes.append(batch_deltas)

                # Gate values (how selective the updates are)
                if 'all_gates' in model.custom_outputs and len(model.custom_outputs['all_gates']) > 0:
                    batch_gates = []
                    for gate in model.custom_outputs['all_gates']:
                        # Calculate mean gate value across the sequence dimension
                        gate_mean = gate.mean().cpu().item()
                        batch_gates.append(gate_mean)
                    gate_values.append(batch_gates)

                # Compare initial vs final predictions
                if 'y0_start_logits' in model.custom_outputs and 'y0_end_logits' in model.custom_outputs:
                    y0_start = model.custom_outputs['y0_start_logits']
                    y0_end = model.custom_outputs['y0_end_logits']

                    # Calculate how much the predictions changed
                    start_change = (start_logits - y0_start).abs().mean().cpu().item()
                    end_change = (end_logits - y0_end).abs().mean().cpu().item()
                    initial_vs_final_changes.append((start_change + end_change) / 2)

    # Concatenate all results
    all_start_logits = np.concatenate(all_start_logits, axis=0)
    all_end_logits = np.concatenate(all_end_logits, axis=0)

    # Ensure the number of predictions matches the number of features
    if len(all_start_logits) != len(eval_dataset):
        logger.warning(f"Mismatch in prediction count ({len(all_start_logits)}) and feature count ({len(eval_dataset)}). Check dataloader/inference loop.")
        # Attempt to slice if predictions exceed features (might happen if the last batch wasn't full)
        all_start_logits = all_start_logits[:len(eval_dataset)]
        all_end_logits = all_end_logits[:len(eval_dataset)]

    # Create dictionary mapping feature index to its logits
    predictions_dict = {
        feature_index: (start_logit, end_logit)
        for feature_index, (start_logit, end_logit) in zip(feature_indices, zip(all_start_logits, all_end_logits))
    }
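    # Note: feature_indices assumes the DataLoader iterates eval_dataset in
    # order (shuffle defaults to False), so feature i in the dataset maps to
    # the i-th prediction collected above.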
    # --- 4. Post-Processing ---
    # (Adapted from the Hugging Face run_qa.py example script)
    def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30, tokenizer=tokenizer):
        all_start_logits, all_end_logits = zip(*raw_predictions.values())

        # Build a map from example ID to list of related feature indices
        example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
        features_per_example = collections.defaultdict(list)
        for i, feature in enumerate(features):
            features_per_example[example_id_to_index[feature["example_id"]]].append(i)

        # Dictionary to store predictions
        predictions = collections.OrderedDict()

        logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

        # Loop over all examples
        for example_index, example in enumerate(tqdm(examples, desc="Post-processing")):
            feature_indices = features_per_example[example_index]  # Indices of features related to this example

            min_null_score = None  # Used to identify impossible answers
            valid_answers = []
            context = example["context"]

            # Loop through features associated with the current example
            for feature_index in feature_indices:
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]
                offset_mapping = features[feature_index]["offset_mapping"]

                # Update minimum null prediction score
                # (keep the smallest CLS score seen across this example's features;
                # the original comparison kept the largest, which contradicted the name)
                cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                if min_null_score is None or min_null_score > feature_null_score:
                    min_null_score = feature_null_score

                # Go through all possibilities for start/end positions
                start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
                end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Skip invalid pairs (start > end, index out of bounds, answer in question part)
                        if start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or \
                                offset_mapping[start_index] is None or offset_mapping[end_index] is None or \
                                end_index < start_index:
                            continue

                        # Check answer length
                        if end_index - start_index + 1 > max_answer_length:
                            continue

                        # Extract text and score
                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        score = start_logits[start_index] + end_logits[end_index]

                        valid_answers.append({
                            "score": score,
                            "text": context[start_char:end_char]
                        })

            # Select the best answer across all features for this example
            if len(valid_answers) > 0:
                best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
            else:
                # Fallback when no valid answers are found
                best_answer = {"text": "", "score": min_null_score}  # Assign CLS score if needed

            # Assign the final prediction (use empty string if the null score is best)
            # Simple version: always take the best-scoring valid answer
            # More sophisticated versions might compare best_answer["score"] vs min_null_score
            predictions[example["id"]] = best_answer["text"]

        return predictions

    logger.info("Starting post-processing...")
    final_predictions = postprocess_qa_predictions(raw_datasets, eval_dataset, predictions_dict)
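    # Complexity note: each feature contributes at most n_best_size ** 2 = 400
    # candidate (start, end) pairs before filtering; reversed spans, spans that
    # fall outside the context, and spans longer than max_answer_length tokens
    # are all discarded above.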
    # --- 5. Compute Metrics ---
    logger.info("Calculating SQuAD metrics...")
    metric = hf_evaluate.load("squad")

    # Format predictions and references for the metric
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
    formatted_references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in raw_datasets]

    results = metric.compute(predictions=formatted_predictions, references=formatted_references)

    logger.info("***** Evaluation Results *****")
    print(results)
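    # `results` is a plain dict, e.g. {"exact_match": ..., "f1": ...}, with
    # both values on a 0-100 scale.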
    # --- 6. Analyze Multi-step Reasoning Metrics ---
    logger.info("\n***** Multi-step Reasoning Analysis *****")

    # Calculate the average number of reasoning steps
    if reasoning_steps_taken:
        avg_steps = sum(reasoning_steps_taken) / len(reasoning_steps_taken)
        logger.info(f"Average reasoning steps: {avg_steps:.2f}")

        # Count frequency of each step count
        step_counts = collections.Counter(reasoning_steps_taken)
        logger.info(f"Step count distribution: {dict(sorted(step_counts.items()))}")

    # Calculate average delta magnitudes per step
    if delta_magnitudes:
        # Transpose to get step-wise averages
        steps_delta_magnitudes = collections.defaultdict(list)
        for batch_deltas in delta_magnitudes:
            for step_idx, delta in enumerate(batch_deltas):
                steps_delta_magnitudes[step_idx].append(delta)

        avg_delta_by_step = {step: sum(deltas) / len(deltas) for step, deltas in steps_delta_magnitudes.items()}
        logger.info(f"Average delta magnitude by step: {avg_delta_by_step}")

    # Calculate average gate values per step
    if gate_values:
        # Transpose to get step-wise averages
        steps_gate_values = collections.defaultdict(list)
        for batch_gates in gate_values:
            for step_idx, gate in enumerate(batch_gates):
                steps_gate_values[step_idx].append(gate)

        avg_gate_by_step = {step: sum(gates) / len(gates) for step, gates in steps_gate_values.items()}
        logger.info(f"Average gate value by step: {avg_gate_by_step}")

    # Calculate average change from initial to final predictions
    if initial_vs_final_changes:
        avg_change = sum(initial_vs_final_changes) / len(initial_vs_final_changes)
        logger.info(f"Average change from initial to final predictions: {avg_change:.4f}")
    # --- 7. Save Results ---
    results_file = os.path.join(OUTPUT_DIR, "eval_results.json")
    with open(results_file, 'w') as f:
        # Combine SQuAD metrics with multi-step reasoning metrics
        full_results = {
            "squad_metrics": results,
            "multi_step_metrics": {
                "avg_reasoning_steps": avg_steps if reasoning_steps_taken else None,
                "step_count_distribution": dict(sorted(step_counts.items())) if reasoning_steps_taken else None,
                "avg_delta_by_step": avg_delta_by_step if delta_magnitudes else None,
                "avg_gate_by_step": avg_gate_by_step if gate_values else None,
                "avg_prediction_change": avg_change if initial_vs_final_changes else None
            }
        }
        json.dump(full_results, f, indent=2)

    logger.info(f"Results saved to {results_file}")
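    # Shape of eval_results.json (a sketch; field values illustrative):
    # {
    #   "squad_metrics": {"exact_match": ..., "f1": ...},
    #   "multi_step_metrics": {
    #     "avg_reasoning_steps": ...,
    #     "step_count_distribution": {"1": ..., "2": ...},
    #     "avg_delta_by_step": {...},
    #     "avg_gate_by_step": {...},
    #     "avg_prediction_change": ...
    #   }
    # }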
    # --- 8. Generate Visualizations (if requested) ---
    if args.visualize and (delta_magnitudes or gate_values or reasoning_steps_taken):
        logger.info("Generating visualizations...")

        # Create visualization directory
        viz_dir = os.path.join(OUTPUT_DIR, "visualizations")
        os.makedirs(viz_dir, exist_ok=True)

        # Plot step distribution
        if reasoning_steps_taken:
            plt.figure(figsize=(10, 6))
            plt.bar(list(step_counts.keys()), list(step_counts.values()))
            plt.xlabel('Number of Reasoning Steps')
            plt.ylabel('Frequency')
            plt.title('Distribution of Reasoning Steps')
            plt.savefig(os.path.join(viz_dir, 'step_distribution.png'))
            plt.close()

        # Plot delta magnitudes by step
        if delta_magnitudes and steps_delta_magnitudes:
            plt.figure(figsize=(10, 6))
            steps = sorted(steps_delta_magnitudes.keys())
            values = [avg_delta_by_step[step] for step in steps]
            plt.plot(steps, values, marker='o')
            plt.xlabel('Reasoning Step')
            plt.ylabel('Average Delta Magnitude')
            plt.title('Delta Magnitude by Reasoning Step')
            plt.grid(True)
            plt.savefig(os.path.join(viz_dir, 'delta_magnitudes.png'))
            plt.close()

        # Plot gate values by step
        if gate_values and steps_gate_values:
            plt.figure(figsize=(10, 6))
            steps = sorted(steps_gate_values.keys())
            values = [avg_gate_by_step[step] for step in steps]
            plt.plot(steps, values, marker='o')
            plt.xlabel('Reasoning Step')
            plt.ylabel('Average Gate Value')
            plt.title('Gate Value by Reasoning Step')
            plt.grid(True)
            plt.savefig(os.path.join(viz_dir, 'gate_values.png'))
            plt.close()

        logger.info(f"Visualizations saved to {viz_dir}")


if __name__ == "__main__":
    # This is required for Windows to properly handle multiprocessing
    multiprocessing.freeze_support()
    main()

# Example usage:
# Test with default settings (epoch 3 checkpoint):
# python test_model.py

# Test with specific checkpoint:
# python test_model.py --checkpoint ./rrn_qa_model_epoch_2

# Test with fixed number of reasoning steps:
# python test_model.py --fixed_steps 3

# Test with active memory:
# python test_model.py --use_memory

# Test with visualizations:
# python test_model.py --visualize
code/train.py
ADDED
@@ -0,0 +1,389 @@
# train.py (Updated for Full Fine-tuning)
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.amp import autocast, GradScaler  # For mixed precision training (updated import)
from transformers import AutoTokenizer, default_data_collator
from datasets import load_dataset
from tqdm.auto import tqdm  # Progress bar
import os
import evaluate  # For metrics
import logging  # Optional: better logging
import multiprocessing  # For Windows multiprocessing support
import argparse  # For command line arguments

# Import our custom modules and config
import config
from model import EnhancedRRN_QA_Model

# Setup basic logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Train RRN QA Model")
    parser.add_argument("--checkpoint", type=str, help="Path to checkpoint directory to resume from")
    parser.add_argument("--start_epoch", type=int, default=0, help="Epoch to start training from")
    parser.add_argument(
        "--subset_percentage",
        type=float,
        default=100.0,
        help="Percentage of training data to use (1.0-100.0). Default: 100.0 (full dataset)"
    )
    parser.add_argument(
        "--bypass_delta",
        action="store_true",
        help="Bypass RRN delta calculation (sets delta = torch.zeros_like(h0))"
    )
    args = parser.parse_args()

    # Set the bypass-delta flag if specified
    if args.bypass_delta:
        logger.info("BYPASS_DELTA_CALCULATION enabled: Setting delta = torch.zeros_like(h0)")
        config.BYPASS_DELTA_CALCULATION = True
    else:
        config.BYPASS_DELTA_CALCULATION = False

    # --- 1. Load Tokenizer and Model ---
    if args.checkpoint:
        logger.info(f"Loading tokenizer from checkpoint: {args.checkpoint}")
        tokenizer = AutoTokenizer.from_pretrained(args.checkpoint)

        logger.info(f"Loading model from checkpoint: {args.checkpoint}")
        # Initialize the model with the base architecture
        model = EnhancedRRN_QA_Model(os.path.join(args.checkpoint, "base_model"))

        # Check for enhanced model components
        gating_mechanism_path = os.path.join(args.checkpoint, "gating_mechanism.pth")
        is_enhanced_checkpoint = os.path.exists(gating_mechanism_path)

        # Load custom module weights
        logger.info("Loading model components...")
        model.qa_head.load_state_dict(torch.load(os.path.join(args.checkpoint, "qa_head.pth")))
        model.retroactive_update_layer.load_state_dict(torch.load(os.path.join(args.checkpoint, "retroactive_layer.pth")))

        # Load gating mechanism if available
        if is_enhanced_checkpoint:
            logger.info("Loading gating mechanism...")
            model.gating_mechanism.load_state_dict(torch.load(gating_mechanism_path))

        # Load step controller if available (for learned dynamic steps)
        step_controller_path = os.path.join(args.checkpoint, "step_controller.pth")
        if os.path.exists(step_controller_path) and hasattr(model, "step_controller"):
            logger.info("Loading step controller for learned dynamic steps...")
            model.step_controller.load_state_dict(torch.load(step_controller_path))
    else:
        logger.info("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(config.BASE_MODEL_NAME)

        logger.info("Instantiating Enhanced RRN QA Model for full fine-tuning...")
        model = EnhancedRRN_QA_Model(config.BASE_MODEL_NAME)

    model.to(config.DEVICE)

    # --- 2. Load and Preprocess Dataset ---
    logger.info("Loading SQuAD dataset...")
    raw_datasets = load_dataset("squad")

    # Handle dataset subsetting
    subset_percentage = args.subset_percentage
    if subset_percentage < 100.0:
        original_train_size = len(raw_datasets["train"])

        # Calculate subset size and validate
        subset_percentage = max(0.1, min(100.0, subset_percentage))  # Clamp between 0.1% and 100%
        train_subset_size = int(original_train_size * subset_percentage / 100)
        train_subset_size = max(100, min(original_train_size, train_subset_size))  # Ensure reasonable bounds

        # Create a reproducible subset with a fixed seed for consistency
        subset_indices = torch.randperm(original_train_size, generator=torch.Generator().manual_seed(42))[:train_subset_size].tolist()
        raw_datasets["train"] = raw_datasets["train"].select(subset_indices)

        logger.info(f"Using {subset_percentage:.1f}% of training data ({train_subset_size}/{original_train_size} examples)")
    else:
        logger.info(f"Using full training dataset ({len(raw_datasets['train'])} examples)")

    question_column_name = "question"
    context_column_name = "context"
    answer_column_name = "answers"
    pad_on_right = tokenizer.padding_side == "right"

    def prepare_train_features(examples):
        examples[question_column_name] = [q.strip() for q in examples[question_column_name]]
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=config.MAX_SEQ_LENGTH,
            stride=config.DOC_STRIDE,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized_examples.pop("offset_mapping")
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            sequence_ids = tokenized_examples.sequence_ids(i)
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]

            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                    token_start_index += 1
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                    token_end_index -= 1
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)
        return tokenized_examples

    logger.info("Preprocessing datasets...")
    # Use a single process on Windows to avoid multiprocessing issues
    tokenized_datasets = raw_datasets.map(
        prepare_train_features,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        num_proc=1  # Single process avoids Windows multiprocessing issues
    )

    data_collator = default_data_collator
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=config.BATCH_SIZE
    )
    # Consider adding validation dataloader setup here as well
    # eval_dataloader = DataLoader(...)
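    # Label convention recap: if an answer does not survive truncation into a
    # given feature, both start and end positions point at the [CLS] token, so
    # every feature still carries a valid training target.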
    # --- 3. Setup Optimizer ---
    logger.info("Setting up optimizer for FULL model fine-tuning...")
    # Optimize all parameters since PEFT is disabled
    optimizer = optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)

    logger.info(f"Optimizer: AdamW with LR={config.LEARNING_RATE}")
    # Calculate total steps, accounting for gradient accumulation
    num_update_steps_per_epoch = len(train_dataloader) // config.GRADIENT_ACCUMULATION_STEPS
    num_training_steps = config.EPOCHS * num_update_steps_per_epoch
    logger.info(f"Total optimization steps: {num_training_steps}")

    # --- 4. Initialize Mixed Precision Training ---
    # Initialize the gradient scaler for mixed precision training
    scaler = GradScaler('cuda', enabled=config.USE_MIXED_PRECISION)  # Updated to fix deprecation warning

    # Log mixed precision and dynamic steps status
    if config.USE_MIXED_PRECISION:
        logger.info("Mixed precision training (FP16) enabled")
    if config.USE_DYNAMIC_STEPS:
        logger.info(f"Dynamic reasoning steps enabled (type: {config.REASONING_STEP_TYPE})")
        logger.info(f"Min steps: {config.MIN_REASONING_STEPS}, Max steps: {config.MAX_REASONING_STEPS}")

    # Log bypass delta calculation status
    if config.BYPASS_DELTA_CALCULATION:
        logger.info("BYPASS_DELTA_CALCULATION enabled: Delta calculation is bypassed (delta = torch.zeros_like(h0))")
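    # Example arithmetic (illustrative values, not the repo defaults): with
    # BATCH_SIZE = 8 and GRADIENT_ACCUMULATION_STEPS = 4, each optimizer step
    # sees an effective batch of 8 * 4 = 32 examples, and
    # num_update_steps_per_epoch = len(train_dataloader) // 4.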
    # --- 5. Training Loop ---
    logger.info("***** Starting Training *****")
    logger.info(f"  Num examples = {len(tokenized_datasets['train'])}")
    logger.info(f"  Num Epochs = {config.EPOCHS}")
    logger.info(f"  Instantaneous batch size per device = {config.BATCH_SIZE}")
    logger.info(f"  Gradient Accumulation steps = {config.GRADIENT_ACCUMULATION_STEPS}")
    logger.info(f"  Total optimization steps = {num_training_steps}")

    # Add a note about subset training if applicable
    if subset_percentage < 100.0:
        logger.info(f"  NOTE: Training on {subset_percentage:.1f}% of data - metrics may not represent full dataset performance")

    model.train()  # Set model to training mode
    global_step = 0
    total_loss = 0.0  # Use float for accumulated loss

    # Start from the specified epoch (default is 0 if not provided)
    start_epoch = args.start_epoch

    for epoch in range(start_epoch, config.EPOCHS):
        logger.info(f"\n--- Starting Epoch {epoch+1}/{config.EPOCHS} ---")
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch")

        for step, batch in enumerate(progress_bar):
            # Move batch to device
            # Ensure only tensors are moved; handle potential non-tensor data if any
            batch_on_device = {}
            for k, v in batch.items():
                if isinstance(v, torch.Tensor):
                    batch_on_device[k] = v.to(config.DEVICE)
                # else: # Handle or skip non-tensor items if necessary
                #     batch_on_device[k] = v

            try:
                # Forward pass with autocast for mixed precision
                with autocast('cuda', enabled=config.USE_MIXED_PRECISION):  # Updated to fix deprecation warning
                    outputs = model(
                        input_ids=batch_on_device.get("input_ids"),
                        attention_mask=batch_on_device.get("attention_mask"),
                        token_type_ids=batch_on_device.get("token_type_ids"),
                        start_positions=batch_on_device.get("start_positions"),
                        end_positions=batch_on_device.get("end_positions"),
                        use_memory=False  # Disable memory during training steps
                    )
                    loss = outputs.loss

                if loss is None:
                    logger.warning(f"Step {step}: Loss is None. Skipping batch.")
                    continue

                # Scale loss for gradient accumulation
                loss = loss / config.GRADIENT_ACCUMULATION_STEPS

                # Accumulate loss value for logging (before backward)
                total_loss += loss.item()

                # Scale loss and perform backward pass with AMP
                scaler.scale(loss).backward()

            except Exception as e:
                logger.error(f"Error during forward/backward pass at step {step}: {e}")
                # Optional: add more detailed error handling or debugging info
                # logger.error(f"Batch keys: {batch.keys()}")
                # logger.error(f"Input IDs shape: {batch_on_device.get('input_ids').shape if batch_on_device.get('input_ids') is not None else 'None'}")
                raise e  # Re-raise the exception to stop training

            # Optimizer step (perform step only after accumulating gradients)
            if (step + 1) % config.GRADIENT_ACCUMULATION_STEPS == 0 or step == len(train_dataloader) - 1:
                # Unscale before the optimizer step (to check for infs/NaNs)
                scaler.unscale_(optimizer)

                # Clip gradients to avoid explosion (optional but recommended with mixed precision)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # Step with scaler
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()  # Reset gradients for the next accumulation cycle
                global_step += 1

                # Log progress periodically
                if global_step % 50 == 0:  # Log every 50 optimization steps
                    avg_loss = total_loss / 50  # Average loss over the last 50 steps
                    logger.info(f"Step: {global_step}, Avg Loss: {avg_loss:.4f}")
                    total_loss = 0.0  # Reset loss accumulator

            # Update progress bar description with current step loss and steps info
            postfix = {
                "Loss": f"{loss.item() * config.GRADIENT_ACCUMULATION_STEPS:.4f}",
                "Step": global_step
            }

            # Add steps info if using dynamic steps
            if config.USE_DYNAMIC_STEPS and hasattr(model, 'custom_outputs'):
                if 'steps_taken' in model.custom_outputs:
                    postfix["Steps"] = model.custom_outputs['steps_taken']

            progress_bar.set_postfix(postfix)

        # --- (Optional) Evaluation at the end of each epoch ---
        # logger.info(f"\n--- Evaluating after Epoch {epoch+1} ---")
        # model.eval()
        # # Add evaluation loop here (requires validation dataloader, postprocessing, metrics)
        # model.train()  # Set back to train mode

        # --- Save Model Checkpoint ---
        output_dir = f"./rrn_qa_model_epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"--- Saving model checkpoint to {output_dir} ---")

        # --- Saving Logic for Enhanced Model ---
        try:
            logger.info(f"Saving enhanced model components to {output_dir}")
            # Save base model using its save_pretrained
            model.base_model.save_pretrained(os.path.join(output_dir, "base_model"))

            # Save all custom modules' state dicts
            torch.save(model.qa_head.state_dict(), os.path.join(output_dir, "qa_head.pth"))
            torch.save(model.retroactive_update_layer.state_dict(), os.path.join(output_dir, "retroactive_layer.pth"))
            torch.save(model.gating_mechanism.state_dict(), os.path.join(output_dir, "gating_mechanism.pth"))

            # Save step controller if using learned dynamic steps
            if config.USE_DYNAMIC_STEPS and config.REASONING_STEP_TYPE == "learned" and hasattr(model, "step_controller"):
                torch.save(model.step_controller.state_dict(), os.path.join(output_dir, "step_controller.pth"))
                logger.info("Saved step controller for learned dynamic steps")

            # Save tokenizer
            tokenizer.save_pretrained(output_dir)

            # Save configuration
            import json
            with open(os.path.join(output_dir, "enhanced_config.json"), "w") as f:
                config_dict = {
                    "num_reasoning_steps": config.NUM_REASONING_STEPS,
                    "delta_target_ratio": config.DELTA_TARGET_RATIO,
                    "lambda_coherence": config.LAMBDA_COHERENCE,
                    "lambda_delta_reg": config.LAMBDA_DELTA_REG,
                    "memory_max_size": config.MEMORY_MAX_SIZE,
                    "memory_retrieval_k": config.MEMORY_RETRIEVAL_K,
                    "use_mixed_precision": config.USE_MIXED_PRECISION,
                    "bypass_delta_calculation": config.BYPASS_DELTA_CALCULATION
                }

                # Add dynamic steps configuration if enabled
                if config.USE_DYNAMIC_STEPS:
                    config_dict.update({
                        "use_dynamic_steps": config.USE_DYNAMIC_STEPS,
                        "max_reasoning_steps": config.MAX_REASONING_STEPS,
                        "min_reasoning_steps": config.MIN_REASONING_STEPS,
                        "reasoning_step_type": config.REASONING_STEP_TYPE,
                        "early_stop_threshold": config.EARLY_STOP_THRESHOLD
                    })

                json.dump(config_dict, f, indent=2)

            logger.info("Enhanced model checkpoint saved successfully.")
        except Exception as e:
            logger.error(f"Error saving checkpoint at epoch {epoch+1}: {e}")

    logger.info("\n***** Training finished *****")


if __name__ == "__main__":
    # This is required for Windows to properly handle multiprocessing
    multiprocessing.freeze_support()
    main()

# Example usage:
# Train on full dataset (default):
# python train.py

# Train on 10% of data for faster iterations:
# python train.py --subset_percentage 10.0

# Train on 1% for very quick testing:
# python train.py --subset_percentage 1.0

# Resume training from checkpoint with subset:
# python train.py --checkpoint ./rrn_qa_model_epoch_1 --start_epoch 1 --subset_percentage 25.0

# Test with bypassed delta calculation (sets delta = torch.zeros_like(h0)):
# python train.py --bypass_delta --subset_percentage 1.0
enhanced_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "num_reasoning_steps": 3,
  "delta_target_ratio": 0.2,
  "lambda_coherence": 0.1,
  "lambda_delta_reg": 0.5,
  "memory_max_size": 50,
  "memory_retrieval_k": 3,
  "use_mixed_precision": false,
  "bypass_delta_calculation": false,
  "use_dynamic_steps": true,
  "max_reasoning_steps": 5,
  "min_reasoning_steps": 1,
  "reasoning_step_type": "learned",
  "early_stop_threshold": 0.01
}
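As a convenience, here is a minimal sketch of reading this file back and applying it to a loaded model, mirroring the override logic in `code/test_model.py`; the attribute names (`num_reasoning_steps`, `use_dynamic_steps`, etc.) are the ones that script sets and may need adjusting for other model classes:

```python
import json

def apply_enhanced_config(model, path="enhanced_config.json"):
    # Sketch only: restores the reasoning-step settings saved by train.py.
    with open(path) as f:
        cfg = json.load(f)
    model.num_reasoning_steps = cfg.get("num_reasoning_steps", 3)
    model.use_dynamic_steps = cfg.get("use_dynamic_steps", False)
    if model.use_dynamic_steps:
        model.max_reasoning_steps = cfg.get("max_reasoning_steps", 5)
        model.min_reasoning_steps = cfg.get("min_reasoning_steps", 1)
        model.reasoning_step_type = cfg.get("reasoning_step_type", "learned")
        model.early_stop_threshold = cfg.get("early_stop_threshold", 0.01)
    return model
```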
gating_mechanism.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a1b4605a1297d5d738dda9e6b739764e6412e27053f75df64ebe99f5f49188c
size 7090146
model-index.json
ADDED
@@ -0,0 +1,50 @@
{
  "name": "Retroactive Reasoning Network for Question Answering",
  "language": "en",
  "license": "apache-2.0",
  "task_categories": [
    "question-answering"
  ],
  "tags": [
    "question-answering",
    "multi-step-reasoning",
    "retroactive-reasoning",
    "squad"
  ],
  "datasets": [
    "squad"
  ],
  "metrics": [
    "exact_match",
    "f1"
  ],
  "model-index": [
    {
      "name": "RRN QA Model",
      "results": [
        {
          "task": {
            "type": "question-answering",
            "name": "Question Answering"
          },
          "dataset": {
            "name": "SQuAD",
            "type": "squad"
          },
          "metrics": [
            {
              "type": "exact_match",
              "value": "TBD",
              "name": "Exact Match"
            },
            {
              "type": "f1",
              "value": "TBD",
              "name": "F1"
            }
          ]
        }
      ]
    }
  ]
}
qa_head.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70288115fb5d3176e59d873418d1d1e7fd4dc9c3d6911f8edeee93354227b82b
size 14175663
retroactive_layer.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1faa8764e567c3b5575c5aff1b8c4c3ed7486935730923c692dae3d4122cefbd
size 59048138
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
step_controller.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cae0d7b2423071c8e36f0d07965a540193fb739638351115bfa8045b9844aac7
size 398480
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,56 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.