Enhance the benchmark and Cortex modules with new training utilities and improved state management. Update the README with example output for Llama-3.2-1B and add a training CLI for Cortex module tuning. Refactor the scoring functions to reset Cortex state between examples so that output stays consistent across independent examples. Normalize task inputs: HellaSwag endings now always begin with a space, and ARC continuations are the answer letters rather than the answer texts.

Browse files

Files changed (13) hide show

README.md +34 -12
benchmark/run_benchmark.py +5 -0
benchmark/runner.py +6 -0
benchmark/scoring.py +11 -0
benchmark/tasks.py +9 -3
benchmark/train_cortex.py +161 -0
benchmark/tuning.py +97 -0
cortex/adaptive_depth.py +21 -9
cortex/backtrack_head.py +6 -1
cortex/core.py +11 -3
cortex/hallucination_gate.py +18 -5
cortex/memory_bank.py +5 -1
test_cortex.py +2 -2

README.md CHANGED Viewed

@@ -181,28 +181,30 @@ python -m benchmark.run_benchmark --n 10 --model meta-llama/Llama-3.2-1B --tasks
 - **Multiple-choice tasks:** Log-likelihood scoring — computes average log-probability the model assigns to each continuation, picks the highest. This is the standard approach used by lm-evaluation-harness and Open LLM Leaderboard.
 - **Generation tasks:** Greedy decode + substring match against expected answer.
-### Example Output (SmolLM2-135M, n=20)
 ```
 ======================================================================
-BENCHMARK SUMMARY: HuggingFaceTB/SmolLM2-135M
-n=20 per task, device=cuda
 ======================================================================
 Task                       Base   Cortex    Delta
 --------------------------------------------------
-hellaswag                0.3500   0.5000  +0.1500 ↑
-piqa                     0.5000   0.5000  +0.0000
-arc-easy                 0.2500   0.4500  +0.2000 ↑
-winogrande               0.6500   0.6500  +0.0000
-passkey                  1.0000   0.8889  -0.1111 ↓
-multi_hop                0.6250   0.2500  -0.3750 ↓
-Cortex overhead: 4,296,134 params (3.19%)
 ======================================================================
 ```
-> **Note:** Cortex modules are untrained at injection (zero-initialized gates). The slight degradation on generation tasks (passkey, multi-hop) is expected — these require module training to improve. Standard log-likelihood tasks remain stable because zero-init gates are nearly transparent.
 ### Programmatic Usage
@@ -328,6 +330,26 @@ surgeon.modules["steering"].set_direction("truthfulness", direction, alpha=10.0)
 ## Training
 ```python
 import torch.optim as optim

 - **Multiple-choice tasks:** Log-likelihood scoring — computes average log-probability the model assigns to each continuation, picks the highest. This is the standard approach used by lm-evaluation-harness and Open LLM Leaderboard.
 - **Generation tasks:** Greedy decode + substring match against expected answer.
+### Example Output (Llama-3.2-1B, n=10)
 ```
 ======================================================================
+BENCHMARK SUMMARY: meta-llama/Llama-3.2-1B
+n=10 per task, device=mps
 ======================================================================
 Task                       Base   Cortex    Delta
 --------------------------------------------------
+hellaswag                0.6000   0.6000  +0.0000
+piqa                     0.2000   0.2000  +0.0000
+arc-easy                 0.4000   0.4000  +0.0000
+arc-challenge            0.5000   0.5000  +0.0000
+winogrande               0.6000   0.6000  +0.0000
+mmlu                     0.4000   0.4000  +0.0000
+passkey                  1.0000   1.0000  +0.0000
+multi_hop                1.0000   1.0000  +0.0000
+Cortex overhead: 53,708,968 params (4.35%)
 ======================================================================
 ```
+> **Note:** Cortex modules are untrained at injection and are initialized as exact no-ops, so a freshly injected model should reproduce the base model's scores (every delta above is +0.0000). Positive deltas require Cortex-specific training or calibrated steering directions.
 ### Programmatic Usage
 ## Training
+For benchmark-style supervised tuning, use the training CLI. It freezes the base
+model, injects Cortex modules, optimizes only Cortex parameters, and saves the
+adapter weights:
+```bash
+python -m benchmark.train_cortex \
+  --model meta-llama/Llama-3.2-1B \
+  --tasks hellaswag piqa arc-easy winogrande \
+  --n-train 32 \
+  --epochs 1 \
+  --output cortex_tuned.pt
+python -m benchmark.run_benchmark \
+  --model meta-llama/Llama-3.2-1B \
+  --cortex-weights cortex_tuned.pt \
+  --n 50
+```
+For custom training loops:
 ```python
 import torch.optim as optim

benchmark/run_benchmark.py CHANGED Viewed

@@ -68,6 +68,10 @@ def main():
         "--output", type=str, default=None,
         help="Path to save JSON results",
     )
     args = parser.parse_args()
@@ -77,6 +81,7 @@ def main():
         model_name=args.model,
         device=args.device,
         dtype=args.dtype,
     )
     n = args.n if args.n > 0 else None

         "--output", type=str, default=None,
         help="Path to save JSON results",
     )
+    parser.add_argument(
+        "--cortex-weights", type=str, default=None,
+        help="Optional Cortex weights file to load before the Cortex phase",
+    )
     args = parser.parse_args()
         model_name=args.model,
         device=args.device,
         dtype=args.dtype,
+        cortex_weights=args.cortex_weights,
     )
     n = args.n if args.n > 0 else None

benchmark/runner.py CHANGED Viewed

@@ -40,8 +40,10 @@ class BenchmarkRunner:
         model_name: str = "HuggingFaceTB/SmolLM2-135M",
         device: str = "auto",
         dtype: str = "float32",
     ):
         self.model_name = model_name
         if device == "auto":
             self.device = resolve_torch_device("auto")
@@ -167,6 +169,10 @@ class BenchmarkRunner:
         ))
         surgeon.operate(freeze_base=True)
         report = surgeon.get_parameter_report()
         total_cortex = sum(info["trainable"] for info in report.values())

         model_name: str = "HuggingFaceTB/SmolLM2-135M",
         device: str = "auto",
         dtype: str = "float32",
+        cortex_weights: Optional[str] = None,
     ):
         self.model_name = model_name
+        self.cortex_weights = cortex_weights
         if device == "auto":
             self.device = resolve_torch_device("auto")
         ))
         surgeon.operate(freeze_base=True)
+        if self.cortex_weights:
+            surgeon.load_cortex_modules(self.cortex_weights)
+            print(f"  Loaded Cortex weights: {self.cortex_weights}")
         report = surgeon.get_parameter_report()
         total_cortex = sum(info["trainable"] for info in report.values())

benchmark/scoring.py CHANGED Viewed

@@ -17,6 +17,15 @@ import re
 from cortex.torch_device import resolve_torch_device
 @torch.no_grad()
 def log_likelihood_score(
     model,
@@ -63,6 +72,7 @@ def log_likelihood_score(
         # Forward pass
         input_ids = torch.tensor([full_ids], device=device)
         # Truncate if too long for model
         max_len = getattr(model.config, "max_position_embeddings", 2048)
@@ -121,6 +131,7 @@ def generate_and_check(
     if device is None:
         device = resolve_torch_device("auto")
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
     # Pad token
     pad_token_id = tokenizer.pad_token_id

 from cortex.torch_device import resolve_torch_device
+def reset_cortex_state(model, batch_size: int = 1):
+    """Reset runtime state for injected Cortex modules between independent examples."""
+    surgeon = getattr(model, "_cortex_surgeon", None)
+    if surgeon is None:
+        return
+    for module in surgeon.modules.values():
+        module.reset_state(batch_size=batch_size)
 @torch.no_grad()
 def log_likelihood_score(
     model,
         # Forward pass
         input_ids = torch.tensor([full_ids], device=device)
+        reset_cortex_state(model, batch_size=input_ids.shape[0])
         # Truncate if too long for model
         max_len = getattr(model.config, "max_position_embeddings", 2048)
     if device is None:
         device = resolve_torch_device("auto")
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
+    reset_cortex_state(model, batch_size=inputs["input_ids"].shape[0])
     # Pad token
     pad_token_id = tokenizer.pad_token_id

benchmark/tasks.py CHANGED Viewed

@@ -75,7 +75,10 @@ class HellaSwag(BenchmarkTask):
         train_examples = []
         for row in few_shot_ds:
             ctx = row["ctx"]
-            endings = row["endings"]
             gold = int(row["label"])
             train_examples.append({
                 "context": ctx,
@@ -85,7 +88,10 @@ class HellaSwag(BenchmarkTask):
         for row in ds:
             ctx = row["ctx"]
-            endings = row["endings"]
             gold = int(row["label"])
             examples.append({
                 "context": ctx,
@@ -132,7 +138,7 @@ class ARC(BenchmarkTask):
             choice_str = " ".join(f"{l}) {t}" for l, t in zip(labels, texts))
             context = f"Question: {question}\n{choice_str}\nAnswer:"
-            continuations = [f" {t}" for t in texts]
             return {
                 "context": context,

         train_examples = []
         for row in few_shot_ds:
             ctx = row["ctx"]
+            endings = [
+                ending if ending.startswith(" ") else f" {ending}"
+                for ending in row["endings"]
+            ]
             gold = int(row["label"])
             train_examples.append({
                 "context": ctx,
         for row in ds:
             ctx = row["ctx"]
+            endings = [
+                ending if ending.startswith(" ") else f" {ending}"
+                for ending in row["endings"]
+            ]
             gold = int(row["label"])
             examples.append({
                 "context": ctx,
             choice_str = " ".join(f"{l}) {t}" for l, t in zip(labels, texts))
             context = f"Question: {question}\n{choice_str}\nAnswer:"
+            continuations = [f" {l}" for l in labels]
             return {
                 "context": context,

benchmark/train_cortex.py ADDED Viewed

	@@ -0,0 +1,161 @@

+#!/usr/bin/env python3
+"""
+Supervised Cortex adapter tuning.
+This trains only Cortex module parameters against the same multiple-choice
+log-likelihood objective used by the benchmark runner. It is intended as a
+small, explicit tuning step before expecting Cortex to outperform the base
+model.
+"""
+import argparse
+import os
+import random
+import sys
+import time
+import torch
+# Ensure parent directory is on path for imports
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from benchmark.runner import BenchmarkRunner
+from benchmark.tasks import TASK_REGISTRY
+from benchmark.tuning import cortex_auxiliary_loss, multiple_choice_loss
+def load_examples(task_names, n_per_task, seed):
+    examples = []
+    for task_name in task_names:
+        task_cls = TASK_REGISTRY[task_name]
+        task = task_cls() if callable(task_cls) else task_cls
+        task_examples = task.load_examples(n=n_per_task, seed=seed)
+        examples.extend((task_name, ex) for ex in task_examples)
+        print(f"Loaded {len(task_examples)} examples for {task_name}")
+    return examples
+def main():
+    parser = argparse.ArgumentParser(description="Train Cortex modules on benchmark-style MC data")
+    parser.add_argument(
+        "--model", type=str, default="HuggingFaceTB/SmolLM2-135M",
+        help="HuggingFace model ID to tune",
+    )
+    parser.add_argument(
+        "--tasks", nargs="+", default=["hellaswag", "piqa", "arc-easy", "winogrande"],
+        help="Tasks to train on",
+    )
+    parser.add_argument(
+        "--n-train", type=int, default=8,
+        help="Examples per task for tuning",
+    )
+    parser.add_argument("--epochs", type=int, default=1)
+    parser.add_argument("--lr", type=float, default=1e-4)
+    parser.add_argument("--weight-decay", type=float, default=0.01)
+    parser.add_argument("--max-grad-norm", type=float, default=1.0)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument(
+        "--device", type=str, default="auto",
+        help="Device: cuda, mps, cpu, or auto",
+    )
+    parser.add_argument(
+        "--dtype", type=str, default="float32",
+        choices=["float32", "float16", "bfloat16"],
+    )
+    parser.add_argument(
+        "--init-cortex-weights", type=str, default=None,
+        help="Optional Cortex weights to resume from",
+    )
+    parser.add_argument(
+        "--output", type=str, default="cortex_tuned.pt",
+        help="Path to save tuned Cortex weights",
+    )
+    parser.add_argument("--log-every", type=int, default=4)
+    args = parser.parse_args()
+    random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    runner = BenchmarkRunner(
+        model_name=args.model,
+        device=args.device,
+        dtype=args.dtype,
+        cortex_weights=args.init_cortex_weights,
+    )
+    runner.inject_cortex()
+    model = runner.model
+    tokenizer = runner.tokenizer
+    surgeon = runner._surgeon
+    model.train()
+    examples = load_examples(args.tasks, args.n_train, args.seed)
+    if not examples:
+        raise RuntimeError("No training examples loaded")
+    trainable_params = list(surgeon.get_trainable_parameters())
+    optimizer = torch.optim.AdamW(
+        trainable_params,
+        lr=args.lr,
+        weight_decay=args.weight_decay,
+    )
+    print(f"Training on {len(examples)} examples for {args.epochs} epoch(s)")
+    start = time.time()
+    for epoch in range(args.epochs):
+        rng = random.Random(args.seed + epoch)
+        rng.shuffle(examples)
+        total_loss = 0.0
+        correct = 0
+        seen = 0
+        skipped = 0
+        for step, (task_name, example) in enumerate(examples, start=1):
+            optimizer.zero_grad(set_to_none=True)
+            loss, pred = multiple_choice_loss(model, tokenizer, example, runner.device)
+            if loss is None:
+                skipped += 1
+                continue
+            aux_loss = cortex_auxiliary_loss(model)
+            train_loss = loss + aux_loss
+            train_loss.backward()
+            if args.max_grad_norm > 0:
+                torch.nn.utils.clip_grad_norm_(trainable_params, args.max_grad_norm)
+            optimizer.step()
+            seen += 1
+            total_loss += float(train_loss.detach().cpu())
+            correct += int(pred == example["gold_idx"])
+            if step % args.log_every == 0 or step == len(examples):
+                avg_loss = total_loss / max(seen, 1)
+                acc = correct / max(seen, 1)
+                print(
+                    f"epoch={epoch + 1} step={step}/{len(examples)} "
+                    f"task={task_name} loss={avg_loss:.4f} acc={acc:.3f}"
+                )
+        avg_loss = total_loss / max(seen, 1)
+        acc = correct / max(seen, 1)
+        print(
+            f"Epoch {epoch + 1} done: loss={avg_loss:.4f} "
+            f"acc={acc:.3f} skipped={skipped}"
+        )
+    output_dir = os.path.dirname(args.output)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    surgeon.save_cortex_modules(args.output)
+    elapsed = time.time() - start
+    print(f"Saved Cortex weights to {args.output} [{elapsed:.1f}s]")
+if __name__ == "__main__":
+    main()

benchmark/tuning.py ADDED Viewed

	@@ -0,0 +1,97 @@

+"""
+Training utilities for supervised Cortex adapter tuning.
+These helpers keep the base model frozen and optimize only the modules managed by
+CortexSurgeon. They intentionally mirror benchmark log-likelihood scoring so a
+small tuning run optimizes the same multiple-choice objective being evaluated.
+"""
+from __future__ import annotations
+from typing import Dict, List, Optional, Tuple
+import torch
+import torch.nn.functional as F
+from benchmark.scoring import reset_cortex_state
+def continuation_log_likelihood(
+    model,
+    tokenizer,
+    context: str,
+    continuation: str,
+    device: str,
+) -> Optional[torch.Tensor]:
+    """Differentiable average continuation log-likelihood."""
+    ctx_ids = tokenizer.encode(context, add_special_tokens=False)
+    full_ids = tokenizer.encode(context + continuation, add_special_tokens=False)
+    cont_start = len(ctx_ids)
+    cont_length = len(full_ids) - cont_start
+    if cont_start <= 0 or cont_length <= 0:
+        return None
+    input_ids = torch.tensor([full_ids], device=device)
+    max_len = getattr(model.config, "max_position_embeddings", 2048)
+    if input_ids.shape[1] > max_len:
+        input_ids = input_ids[:, :max_len]
+        cont_length = min(cont_length, max_len - cont_start)
+        if cont_length <= 0:
+            return None
+    reset_cortex_state(model, batch_size=input_ids.shape[0])
+    outputs = model(input_ids)
+    logits = outputs.logits
+    shift_logits = logits[0, cont_start - 1 : cont_start + cont_length - 1, :]
+    shift_labels = input_ids[0, cont_start : cont_start + cont_length]
+    log_probs = F.log_softmax(shift_logits, dim=-1)
+    token_log_probs = log_probs.gather(1, shift_labels.unsqueeze(1)).squeeze(1)
+    return token_log_probs.mean()
+def multiple_choice_loss(
+    model,
+    tokenizer,
+    example: Dict,
+    device: str,
+) -> Tuple[Optional[torch.Tensor], Optional[int]]:
+    """
+    Cross-entropy over continuation log-likelihoods.
+    Returns:
+        (loss, prediction). If an example cannot be scored, both are None.
+    """
+    scores: List[torch.Tensor] = []
+    for continuation in example["continuations"]:
+        score = continuation_log_likelihood(
+            model, tokenizer, example["context"], continuation, device
+        )
+        if score is None:
+            return None, None
+        scores.append(score)
+    logits = torch.stack(scores).unsqueeze(0)
+    gold = torch.tensor([example["gold_idx"]], device=device)
+    loss = F.cross_entropy(logits, gold)
+    pred = int(logits.argmax(dim=-1).item())
+    return loss, pred
+def cortex_auxiliary_loss(model) -> torch.Tensor:
+    """Collect differentiable auxiliary losses exposed by Cortex modules."""
+    device = next(model.parameters()).device
+    surgeon = getattr(model, "_cortex_surgeon", None)
+    if surgeon is None:
+        return torch.tensor(0.0, device=device)
+    losses = []
+    for module in surgeon.modules.values():
+        get_budget_loss = getattr(module, "get_budget_loss", None)
+        if get_budget_loss is not None:
+            losses.append(get_budget_loss())
+    if not losses:
+        return torch.tensor(0.0, device=device)
+    return torch.stack([loss.to(device) for loss in losses]).sum()

cortex/adaptive_depth.py CHANGED Viewed

@@ -77,6 +77,10 @@ class AdaptiveDepth(CortexModule):
         # Initialize gate to be "open" (execute layer) by default
         nn.init.constant_(self.gate_net[-1].bias, 2.0)  # sigmoid(2) ≈ 0.88
         # Buffers for monitoring
         self.register_buffer("_pre_layer_hidden", None, persistent=False)
@@ -91,6 +95,7 @@ class AdaptiveDepth(CortexModule):
         self,
         hidden_states: torch.Tensor,
         layer_idx: int,
         **kwargs
     ) -> torch.Tensor:
         """
@@ -103,22 +108,29 @@ class AdaptiveDepth(CortexModule):
         """
         # Compute gate value per token
         gate_logit = self.gate_net(hidden_states) / self.temperature  # [B, T, 1]
-        gate = torch.sigmoid(gate_logit)
         # Straight-through estimator for hard gating
         if self.gate_type == "straight_through" and self.training:
-            hard_gate = (gate > 0.5).float()
-            gate = hard_gate - gate.detach() + gate  # STE
-        self._gate_values = gate.detach()
-        # Gate the output: scale by gate, preserve gradients
-        gated_output = gate * hidden_states + (1 - gate) * hidden_states.detach()
         # Budget regularization loss
-        avg_gate = gate.mean()
         budget_loss = self.budget_loss_weight * (avg_gate - self.target_budget).pow(2)
-        self._budget_loss = budget_loss.detach()
         return gated_output
@@ -139,4 +151,4 @@ class AdaptiveDepth(CortexModule):
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, target_budget={self.target_budget}, "
-                f"gate_type={self.gate_type}, {super().extra_repr()}")

         # Initialize gate to be "open" (execute layer) by default
         nn.init.constant_(self.gate_net[-1].bias, 2.0)  # sigmoid(2) ≈ 0.88
+        # Blend from identity toward the learned depth gate during training.
+        # Initial value gives an exact no-op, preserving pretrained behavior.
+        self.gate_residual_scale = nn.Parameter(torch.tensor(0.0))
         # Buffers for monitoring
         self.register_buffer("_pre_layer_hidden", None, persistent=False)
         self,
         hidden_states: torch.Tensor,
         layer_idx: int,
+        pre_layer_hidden: Optional[torch.Tensor] = None,
         **kwargs
     ) -> torch.Tensor:
         """
         """
         # Compute gate value per token
         gate_logit = self.gate_net(hidden_states) / self.temperature  # [B, T, 1]
+        learned_gate = torch.sigmoid(gate_logit)
         # Straight-through estimator for hard gating
         if self.gate_type == "straight_through" and self.training:
+            hard_gate = (learned_gate > 0.5).float()
+            learned_gate = hard_gate - learned_gate.detach() + learned_gate  # STE
+        # At initialization, gate_residual_scale = 0 and effective_gate = 1, so
+        # the injected module preserves the original model exactly.
+        effective_gate = 1.0 + self.gate_residual_scale * (learned_gate - 1.0)
+        self._gate_values = effective_gate.detach()
+        if pre_layer_hidden is not None and pre_layer_hidden.shape == hidden_states.shape:
+            residual_update = hidden_states - pre_layer_hidden
+            gated_output = pre_layer_hidden + effective_gate * residual_update
+        else:
+            gated_output = hidden_states
         # Budget regularization loss
+        avg_gate = learned_gate.mean()
         budget_loss = self.budget_loss_weight * (avg_gate - self.target_budget).pow(2)
+        self._budget_loss = budget_loss
         return gated_output
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, target_budget={self.target_budget}, "
+                f"gate_type={self.gate_type}, {super().extra_repr()}")

cortex/backtrack_head.py CHANGED Viewed

@@ -153,6 +153,11 @@ class BacktrackHead(CortexModule):
     def get_confidence_history(self) -> torch.Tensor:
         """Return the confidence scores across all layers from the last forward pass."""
         return self._confidence_history.clone()
     def was_triggered(self) -> bool:
         """Whether backtracking was triggered in the last forward pass."""
@@ -160,4 +165,4 @@ class BacktrackHead(CortexModule):
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, drop_threshold={self.drop_threshold}, "
-                f"{super().extra_repr()}")

     def get_confidence_history(self) -> torch.Tensor:
         """Return the confidence scores across all layers from the last forward pass."""
         return self._confidence_history.clone()
+    def reset_state(self, batch_size: int = 1):
+        """Clear confidence history between independent examples."""
+        self._confidence_history.zero_()
+        self._last_triggered = torch.tensor(False, device=self._last_triggered.device)
     def was_triggered(self) -> bool:
         """Whether backtracking was triggered in the last forward pass."""
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, drop_threshold={self.drop_threshold}, "
+                f"{super().extra_repr()}")

cortex/core.py CHANGED Viewed

@@ -85,6 +85,10 @@ class CortexModule(ABC, nn.Module):
         for hook in self._hooks:
             hook.remove()
         self._hooks.clear()
     def enable(self):
         self._active = True
@@ -352,6 +356,7 @@ class CortexSurgeon:
             logger.info(f"Injected '{name}' into layers {target_layer_idxs}")
         self._operated = True
         total_params = sum(p.numel() for p in self.model.parameters())
         cortex_params = sum(
@@ -371,11 +376,12 @@ class CortexSurgeon:
             def post_ffn_hook(mod, inp, output, _module=module, _layer_idx=layer_idx):
                 if not _module.is_active:
                     return output
                 if isinstance(output, tuple):
                     hidden = output[0]
-                    hidden = _module(hidden, layer_idx=_layer_idx)
                     return (hidden,) + output[1:]
-                return _module(output, layer_idx=_layer_idx)
             return layer.register_forward_hook(post_ffn_hook)
         elif point == InjectionPoint.PRE_ATTENTION:
@@ -455,6 +461,8 @@ class CortexSurgeon:
         self.modules.clear()
         self._operated = False
         logger.info("All Cortex modules removed, model restored")
     def get_trainable_parameters(self):
@@ -490,4 +498,4 @@ class CortexSurgeon:
         for name, module in self.modules.items():
             if name in state:
                 module.load_state_dict(state[name]["state_dict"])
-                logger.info(f"Loaded weights for '{name}'")

         for hook in self._hooks:
             hook.remove()
         self._hooks.clear()
+    def reset_state(self, batch_size: int = 1):
+        """Reset per-example runtime state, if the module keeps any."""
+        pass
     def enable(self):
         self._active = True
             logger.info(f"Injected '{name}' into layers {target_layer_idxs}")
         self._operated = True
+        setattr(self.model, "_cortex_surgeon", self)
         total_params = sum(p.numel() for p in self.model.parameters())
         cortex_params = sum(
             def post_ffn_hook(mod, inp, output, _module=module, _layer_idx=layer_idx):
                 if not _module.is_active:
                     return output
+                pre_hidden = inp[0] if isinstance(inp, tuple) and len(inp) > 0 else None
                 if isinstance(output, tuple):
                     hidden = output[0]
+                    hidden = _module(hidden, layer_idx=_layer_idx, pre_layer_hidden=pre_hidden)
                     return (hidden,) + output[1:]
+                return _module(output, layer_idx=_layer_idx, pre_layer_hidden=pre_hidden)
             return layer.register_forward_hook(post_ffn_hook)
         elif point == InjectionPoint.PRE_ATTENTION:
         self.modules.clear()
         self._operated = False
+        if getattr(self.model, "_cortex_surgeon", None) is self:
+            delattr(self.model, "_cortex_surgeon")
         logger.info("All Cortex modules removed, model restored")
     def get_trainable_parameters(self):
         for name, module in self.modules.items():
             if name in state:
                 module.load_state_dict(state[name]["state_dict"])
+                logger.info(f"Loaded weights for '{name}'")

cortex/hallucination_gate.py CHANGED Viewed

@@ -102,6 +102,10 @@ class HallucinationGate(CortexModule):
         # Learnable gate bias per dimension — allows the model to learn which
         # dimensions are safe to suppress vs which should always pass through
         self.dim_gate = nn.Parameter(torch.zeros(1, 1, hidden_dim))
         # Running confidence for monitoring/logging
         self.register_buffer("_last_confidence", torch.tensor(0.5), persistent=False)
@@ -111,6 +115,7 @@ class HallucinationGate(CortexModule):
         self,
         hidden_states: torch.Tensor,
         layer_idx: int,
         **kwargs
     ) -> torch.Tensor:
         """
@@ -131,13 +136,21 @@ class HallucinationGate(CortexModule):
         # Compute per-dimension gate
         dim_bias = torch.sigmoid(self.dim_gate)  # [1, 1, D], in (0,1)
-        # Effective gate: combines token-level confidence with dimension-level bias
         # High dim_bias = dimension always passes through
         # Low dim_bias = dimension is gated by confidence
-        gate = 1.0 - self.suppression_strength * (1.0 - confidence) * (1.0 - dim_bias)
-        # Apply gate
-        gated_output = hidden_states * gate
         return gated_output
@@ -148,4 +161,4 @@ class HallucinationGate(CortexModule):
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, "
                 f"suppression_strength={self.suppression_strength}, "
-                f"{super().extra_repr()}")

         # Learnable gate bias per dimension — allows the model to learn which
         # dimensions are safe to suppress vs which should always pass through
         self.dim_gate = nn.Parameter(torch.zeros(1, 1, hidden_dim))
+        # Start as an exact no-op. Once this scalar moves away from zero during
+        # training, the confidence probe and dimension gate control suppression.
+        self.suppression_scale = nn.Parameter(torch.tensor(0.0))
         # Running confidence for monitoring/logging
         self.register_buffer("_last_confidence", torch.tensor(0.5), persistent=False)
         self,
         hidden_states: torch.Tensor,
         layer_idx: int,
+        pre_layer_hidden: Optional[torch.Tensor] = None,
         **kwargs
     ) -> torch.Tensor:
         """
         # Compute per-dimension gate
         dim_bias = torch.sigmoid(self.dim_gate)  # [1, 1, D], in (0,1)
+        # Effective gate: combines token-level confidence with dimension-level bias.
+        # suppression_scale is zero at initialization, so this module is exactly
+        # transparent before Cortex-specific training.
         # High dim_bias = dimension always passes through
         # Low dim_bias = dimension is gated by confidence
+        effective_suppression = self.suppression_strength * self.suppression_scale
+        gate = 1.0 - effective_suppression * (1.0 - confidence) * (1.0 - dim_bias)
+        # Apply the gate to the layer update when the hook can provide the block
+        # input. Fallback to gating the stream itself for manually-called modules.
+        if pre_layer_hidden is not None and pre_layer_hidden.shape == hidden_states.shape:
+            residual_update = hidden_states - pre_layer_hidden
+            gated_output = pre_layer_hidden + gate * residual_update
+        else:
+            gated_output = hidden_states * gate
         return gated_output
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, "
                 f"suppression_strength={self.suppression_strength}, "
+                f"{super().extra_repr()}")

cortex/memory_bank.py CHANGED Viewed

@@ -96,6 +96,10 @@ class MemoryBank(CortexModule):
     def reset_memory(self, batch_size: int = 1):
         """Reset memory to initial state."""
         self._memory_state = self.memory_init.expand(batch_size, -1, -1).clone()
     def forward(
         self,
@@ -167,4 +171,4 @@ class MemoryBank(CortexModule):
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, num_slots={self.num_slots}, "
-                f"num_heads={self.num_heads}, {super().extra_repr()}")

     def reset_memory(self, batch_size: int = 1):
         """Reset memory to initial state."""
         self._memory_state = self.memory_init.expand(batch_size, -1, -1).clone()
+    def reset_state(self, batch_size: int = 1):
+        """Reset memory between independent benchmark examples."""
+        self.reset_memory(batch_size=batch_size)
     def forward(
         self,
     def extra_repr(self):
         return (f"hidden_dim={self.hidden_dim}, num_slots={self.num_slots}, "
+                f"num_heads={self.num_heads}, {super().extra_repr()}")

test_cortex.py CHANGED Viewed

@@ -4,7 +4,7 @@ Verify that:
 1. All modules inject without errors
 2. Forward pass works
 3. Gradients flow only through Cortex parameters
-4. Output changes when modules are enabled/disabled
 5. Each module's specific functionality works
 Usage:
@@ -114,4 +114,4 @@ def main():
     print(f"{'='*60}")
 if __name__ == "__main__":
-    main()

 1. All modules inject without errors
 2. Forward pass works
 3. Gradients flow only through Cortex parameters
+4. Freshly injected modules preserve base outputs
 5. Each module's specific functionality works
 Usage:
     print(f"{'='*60}")
 if __name__ == "__main__":
+    main()