Upload folder using huggingface_hub

0e849e2 verified about 2 months ago

33.4 kB

	"""
	backbone.py — Eyla V2 Custom Hybrid Backbone
	===============================================
	Llama-3.2-1B compatible architecture with custom zero-cost extensions.

	Architecture:
	- 24 transformer layers (Llama-compatible for weight transplant)
	- Grouped Query Attention (32 heads, 8 KV heads)
	- RoPE (Rotary Position Embedding)
	- RMSNorm + SiLU-gated MLP
	- SSM side-cars at layers 4, 8, 12, 16, 20 (HiPPO init)
	- Heuristic surprise gates (no learned params)
	- Heuristic early exit (confidence-based)
	- Heuristic complexity estimator (entropy-based)

	Zero-cost design:
	- Donor weights transplanted into all 24 layers → works on day 1
	- SSM side-cars start as no-ops (gate=0) → no interference
	- Heuristic gates need no training
	- Online learning gradually activates SSM contribution

	Naming convention matches LlamaForCausalLM for weight transplant:
	- token_embedding ← model.embed_tokens
	- layers.{i}.* ← model.layers.{i}.*
	- final_norm ← model.norm
	- lm_head ← lm_head
	"""

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from typing import Optional, Dict, Any, List, Tuple
	import math
	import logging

	from .ssm_block import SSMBlock
	from .heuristic_gates import HeuristicGates

	logger = logging.getLogger(__name__)

	# ── Default config: Llama 3.2 1B dimensions, extended to 24 layers ──────────

	EYLA_V2_CONFIG: Dict[str, Any] = {
	    # Core transformer dimensions.
	    "hidden_size": 2_048,
	    "intermediate_size": 8_192,
	    "num_attention_heads": 32,
	    "num_key_value_heads": 8,
	    "num_layers": 24,
	    "vocab_size": 128_256,
	    "rms_norm_eps": 1e-5,
	    # Rotary position embedding settings (consumed by RotaryEmbedding).
	    "rope_theta": 500_000.0,
	    "rope_scaling": {
	        "factor": 32.0,
	        "high_freq_factor": 4.0,
	        "low_freq_factor": 1.0,
	        "original_max_position_embeddings": 8_192,
	        "rope_type": "llama3",
	    },
	    "max_position_embeddings": 131_072,
	    # When True the backbone reuses token_embedding.weight as the output head.
	    "tie_word_embeddings": True,
	    # Eyla custom — SSM side-cars every 4 layers (BUILD_PLAN spec)
	    "ssm_layers": [4, 8, 12, 16, 20],
	    "ssm_state_dim": 64,
	    "ssm_dt": 0.01,
	    # Std used to initialise side-car weights (SSM blocks, memory modules).
	    "side_car_init_std": 1e-5,
	    # Thresholds handed to HeuristicGates.
	    "early_exit_confidence": 0.9,
	    "early_exit_min_layers": 8,
	    "surprise_threshold": 4.0,
	}


	# ── Building blocks ─────────────────────────────────────────────────────────

	class RMSNorm(nn.Module):
	    """Root Mean Square Layer Normalization (matches LlamaRMSNorm).

	    Scales each vector along the last dimension by the reciprocal of its
	    root-mean-square and then by a learned per-channel ``weight``.

	    Args:
	        hidden_size: size of the last (normalised) dimension.
	        eps: constant added to the mean square before the reciprocal sqrt.
	    """

	    def __init__(self, hidden_size: int, eps: float = 1e-5):
	        super().__init__()
	        self.weight = nn.Parameter(torch.ones(hidden_size))
	        self.eps = eps

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Normalise ``x`` over its last dimension.

	        The statistics are computed in at least float32: squaring fp16/bf16
	        activations directly overflows (fp16 saturates above ~65504, i.e. for
	        |x| > ~256), which turns the reciprocal RMS into 0 and silently
	        zeroes the output. float32/float64 inputs take the same arithmetic
	        path as before.
	        """
	        input_dtype = x.dtype
	        compute_dtype = torch.promote_types(input_dtype, torch.float32)
	        x_wide = x.to(compute_dtype)
	        inv_rms = torch.rsqrt(x_wide.pow(2).mean(-1, keepdim=True) + self.eps)
	        # Cast back before applying the weight so type promotion with
	        # ``self.weight`` behaves exactly as it did with the raw input.
	        return (x_wide * inv_rms).to(input_dtype) * self.weight


	class RotaryEmbedding(nn.Module):
	    """Rotary Position Embedding (RoPE) with optional Llama 3 ``rope_scaling``.

	    Inverse frequencies are computed once at construction. The cos/sin tables
	    are built lazily on first use and cached; the cache is rebuilt when a
	    longer sequence, a different device, or a different dtype is requested.

	    Args:
	        dim: rotary dimension (the per-head dimension).
	        theta: RoPE base.
	        rope_scaling: optional dict; only ``rope_type == "llama3"`` is applied.
	    """

	    def __init__(self, dim: int, theta: float = 500000.0, rope_scaling: Optional[Dict] = None):
	        super().__init__()
	        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

	        # Apply Llama 3 rope scaling if configured
	        if rope_scaling is not None and rope_scaling.get("rope_type") == "llama3":
	            inv_freq = self._apply_llama3_scaling(inv_freq, rope_scaling)

	        # Non-persistent: recomputed from config, never stored in checkpoints.
	        self.register_buffer("inv_freq", inv_freq, persistent=False)
	        self._max_cached = 0
	        self._cos_cached = None
	        self._sin_cached = None

	    @staticmethod
	    def _apply_llama3_scaling(inv_freq: torch.Tensor, scaling: Dict) -> torch.Tensor:
	        """Apply Llama 3 frequency scaling to ``inv_freq``.

	        Wavelengths shorter than ``original_max / high_freq_factor`` are kept,
	        wavelengths longer than ``original_max / low_freq_factor`` are divided
	        by ``factor``, and the band in between is linearly blended.

	        Args:
	            inv_freq: 1-D tensor of inverse frequencies.
	            scaling: dict with ``factor`` (required) and optional
	                ``low_freq_factor``, ``high_freq_factor``,
	                ``original_max_position_embeddings``.

	        Returns:
	            A new tensor with the same dtype and device as ``inv_freq``.
	        """
	        factor = scaling["factor"]
	        low_freq_factor = scaling.get("low_freq_factor", 1.0)
	        high_freq_factor = scaling.get("high_freq_factor", 4.0)
	        old_context_len = scaling.get("original_max_position_embeddings", 8192)

	        low_freq_wavelen = old_context_len / low_freq_factor
	        high_freq_wavelen = old_context_len / high_freq_factor

	        # Computed per element in Python floats (double precision), then
	        # rounded once to the buffer dtype.
	        new_freqs = []
	        for freq in inv_freq.tolist():
	            wavelen = 2 * math.pi / freq
	            if wavelen < high_freq_wavelen:
	                new_freqs.append(freq)
	            elif wavelen > low_freq_wavelen:
	                new_freqs.append(freq / factor)
	            else:
	                smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
	                new_freqs.append((1 - smooth) * freq / factor + smooth * freq)

	        return torch.tensor(new_freqs, dtype=inv_freq.dtype, device=inv_freq.device)

	    def _build_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype):
	        """Ensure cos/sin tables cover ``seq_len`` positions on ``device`` in ``dtype``."""
	        cached = self._cos_cached
	        if (
	            cached is not None
	            and seq_len <= self._max_cached
	            and cached.device == device
	            and cached.dtype == dtype
	        ):
	            return
	        # Never shrink the table when rebuilding only for a device/dtype change.
	        self._max_cached = max(seq_len, self._max_cached, 2048)
	        t = torch.arange(self._max_cached, device=device, dtype=torch.float32)
	        freqs = torch.outer(t, self.inv_freq.to(device=device, dtype=torch.float32))
	        emb = torch.cat([freqs, freqs], dim=-1)  # (positions, dim)
	        # Angles are evaluated in float32 and only then cast to the target dtype.
	        self._cos_cached = emb.cos().to(dtype)
	        self._sin_cached = emb.sin().to(dtype)

	    def forward(self, x: torch.Tensor, position_ids: torch.Tensor):
	        """
	        Look up cos/sin tables for the given positions.

	        Args:
	            x: tensor whose device and dtype the tables must match
	               (callers in this file pass the (B, n_heads, S, head_dim) queries)
	            position_ids: integer tensor, (B, S) or (1, S)
	        Returns:
	            cos, sin: shape ``position_ids.shape[0] x 1 x S x dim``; the
	            singleton axis broadcasts over heads
	        """
	        # max() is undefined on an empty tensor; an empty request needs no rows.
	        seq_len = int(position_ids.max()) + 1 if position_ids.numel() > 0 else 0
	        self._build_cache(seq_len, x.device, x.dtype)
	        # Gather by position
	        cos = self._cos_cached[position_ids].unsqueeze(1)
	        sin = self._sin_cached[position_ids].unsqueeze(1)
	        return cos, sin


	def rotate_half(x: torch.Tensor) -> torch.Tensor:
	    """Swap the two halves of the last dimension, negating the second half.

	    For a last dimension ``[a | b]`` the result is ``[-b | a]``; the split
	    point is ``x.shape[-1] // 2``.
	    """
	    split_at = x.shape[-1] // 2
	    front, back = x[..., :split_at], x[..., split_at:]
	    return torch.cat((-back, front), dim=-1)


	def apply_rotary_pos_emb(q, k, cos, sin):
	    """Rotate query and key tensors by the given cos/sin position tables.

	    Both tensors receive the same transform, ``t * cos + rotate_half(t) * sin``.

	    Returns:
	        (q_embed, k_embed)
	    """

	    def _rotate(t):
	        return (t * cos) + (rotate_half(t) * sin)

	    return _rotate(q), _rotate(k)


	# ── Attention ────────────────────────────────────────────────────────────────

	class Attention(nn.Module):
	    """
	    Grouped Query Attention (GQA) — matches LlamaAttention.

	    Query heads are split into ``num_attention_heads // num_key_value_heads``
	    groups that share one key/value head (32 query heads and 8 KV heads, a
	    4:1 ratio, in the default config). All projections are bias-free.
	    """

	    def __init__(self, config: Dict[str, Any]):
	        super().__init__()
	        self.hidden_size = config["hidden_size"]
	        self.num_heads = config["num_attention_heads"]
	        self.num_kv_heads = config["num_key_value_heads"]
	        self.head_dim = self.hidden_size // self.num_heads
	        self.num_kv_groups = self.num_heads // self.num_kv_heads

	        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
	        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
	        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
	        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

	        self.rotary_emb = RotaryEmbedding(
	            self.head_dim,
	            theta=config.get("rope_theta", 500000.0),
	            rope_scaling=config.get("rope_scaling"),
	        )

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        position_ids: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
	        use_cache: bool = False,
	    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
	        """
	        Causal self-attention over ``hidden_states``.

	        Args:
	            hidden_states: (B, S, hidden_size)
	            position_ids: (B, S) or (1, S) absolute positions of the S tokens
	            attention_mask: optional mask, 1=attend, 0=pad. It is indexed as
	                (B, L) and must broadcast against the key axis, so L must
	                equal the total key length (cached + new) or be 1.
	            past_key_value: optional cached (K, V), each
	                (B, num_kv_heads, past_len, head_dim)
	            use_cache: when True, also return the updated (K, V)

	        Returns:
	            (output, new_kv): output is (B, S, hidden_size); new_kv is the
	            concatenated (K, V) pair, or None when ``use_cache`` is False.
	        """
	        B, S, _ = hidden_states.shape

	        # Project Q, K, V and split into heads: (B, heads, S, head_dim)
	        q = self.q_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
	        k = self.k_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
	        v = self.v_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)

	        # Apply RoPE (to the new tokens only; cached keys are already rotated)
	        cos, sin = self.rotary_emb(q, position_ids)
	        q, k = apply_rotary_pos_emb(q, k, cos, sin)

	        # KV cache: concatenate with past keys/values along the sequence axis
	        if past_key_value is not None:
	            k = torch.cat([past_key_value[0], k], dim=2)
	            v = torch.cat([past_key_value[1], v], dim=2)

	        # The cache stores the un-expanded KV heads.
	        new_kv = (k, v) if use_cache else None

	        # Repeat KV heads so every query head has a matching key/value head
	        if self.num_kv_groups > 1:
	            k_expanded = k.repeat_interleave(self.num_kv_groups, dim=1)
	            v_expanded = v.repeat_interleave(self.num_kv_groups, dim=1)
	        else:
	            k_expanded, v_expanded = k, v

	        # Scaled dot-product scores: (B, heads, S, kv_len)
	        kv_len = k_expanded.shape[2]
	        scale = 1.0 / math.sqrt(self.head_dim)
	        attn_weights = torch.matmul(q, k_expanded.transpose(-2, -1)) * scale

	        # Causal mask: query row i sits at absolute position (kv_len - S + i)
	        # and must not see any key at a later position.
	        device = hidden_states.device
	        query_pos = torch.arange(S, device=device) + (kv_len - S)
	        key_pos = torch.arange(kv_len, device=device)
	        blocked = (key_pos[None, :] > query_pos[:, None])[None, None, :, :]

	        # Padding mask. The previous additive form, (1 - mask) * -inf,
	        # evaluates to 0 * -inf = NaN at every *attended* position and turned
	        # the whole attention output into NaN whenever a mask was supplied.
	        if attention_mask is not None:
	            blocked = blocked | (attention_mask[:, None, None, :] == 0)

	        # Fill with the dtype's most negative finite value rather than -inf so
	        # a row whose keys are all blocked (e.g. a left-padded query) yields a
	        # finite softmax instead of NaN. Rows with at least one visible key
	        # get the same probabilities as with -inf.
	        attn_weights = attn_weights.masked_fill(blocked, torch.finfo(attn_weights.dtype).min)

	        # Softmax in float32 for stability, then back to the working dtype.
	        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
	        attn_output = torch.matmul(attn_weights, v_expanded)

	        # Merge heads
	        attn_output = attn_output.transpose(1, 2).contiguous().view(B, S, self.hidden_size)
	        return self.o_proj(attn_output), new_kv


	# ── MLP ──────────────────────────────────────────────────────────────────────

	class MLP(nn.Module):
	    """Gated feed-forward block with SiLU activation — matches LlamaMLP.

	    Computes ``down_proj(silu(gate_proj(x)) * up_proj(x))`` with bias-free
	    linear layers sized by ``hidden_size`` and ``intermediate_size``.
	    """

	    def __init__(self, config: Dict[str, Any]):
	        super().__init__()
	        # Creation order (gate, up, down) is kept so random init is unchanged.
	        self.gate_proj = nn.Linear(config["hidden_size"], config["intermediate_size"], bias=False)
	        self.up_proj = nn.Linear(config["hidden_size"], config["intermediate_size"], bias=False)
	        self.down_proj = nn.Linear(config["intermediate_size"], config["hidden_size"], bias=False)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        activated_gate = F.silu(self.gate_proj(x))
	        gated = activated_gate * self.up_proj(x)
	        return self.down_proj(gated)


	# ── Transformer Layer ────────────────────────────────────────────────────────

	class TransformerLayer(nn.Module):
	    """
	    One pre-norm decoder layer — sub-module names follow LlamaDecoderLayer.

	    The names below must stay as they are for weight transplant:
	        self_attn.q_proj, self_attn.k_proj, self_attn.v_proj, self_attn.o_proj
	        mlp.gate_proj, mlp.up_proj, mlp.down_proj
	        input_layernorm, post_attention_layernorm

	    Layers with ``layer_idx >= config["donor_layers"]`` (default 16, i.e.
	    layers 16-23, duplicated from donor 8-15) carry a learnable scalar
	    ``layer_gate`` initialised to 0.0, so on day 1 they return their input
	    unchanged and cannot disturb the hidden-state distribution. Online
	    learning gradually opens the gate.

	    Layers listed in ``config["ssm_layers"]`` additionally own an SSM
	    side-car whose output is added to the layer output.
	    """

	    # PFC subdivision labels (layers 16-23 map to prefrontal cortex regions)
	    _PFC_REGIONS = {
	        16: "dlPFC (Working Memory)",
	        17: "dlPFC (Working Memory)",
	        18: "vmPFC (Value/Emotion)",
	        19: "vmPFC (Value/Emotion)",
	        20: "OFC (Outcome Prediction)",
	        21: "vlPFC (Response Inhibition)",
	        22: "vlPFC (Response Inhibition)",
	        23: "Anterior PFC (Metacognition)",
	    }

	    # SSM brain region labels (5 side-cars = 5 brain regions)
	    _SSM_BRAIN_REGIONS = {
	        4: "Secondary Sensory Cortex",
	        8: "Superior Temporal Sulcus",
	        12: "Temporal-Parietal Junction",
	        16: "Dorsolateral PFC",
	        20: "Anterior PFC / Frontal Pole",
	    }

	    def __init__(self, config: Dict[str, Any], layer_idx: int):
	        super().__init__()
	        self.layer_idx = layer_idx
	        depth = config.get("num_layers", 24)
	        first_duplicate = config.get("donor_layers", 16)
	        norm_eps = config.get("rms_norm_eps", 1e-5)

	        # Standard Llama components (registration order is part of the
	        # state-dict layout, so it is kept as-is).
	        self.self_attn = Attention(config)
	        self.mlp = MLP(config)
	        self.input_layernorm = RMSNorm(config["hidden_size"], norm_eps)
	        self.post_attention_layernorm = RMSNorm(config["hidden_size"], norm_eps)

	        # Deep init scaling (GPT-2 style) on the two residual-writing
	        # projections — prevents NaN with random weights. These weights are
	        # overwritten by the donor transplant anyway.
	        residual_std = 0.02 * (1.0 / math.sqrt(2 * depth))
	        for projection in (self.self_attn.o_proj, self.mlp.down_proj):
	            nn.init.normal_(projection.weight, std=residual_std)

	        # Duplicate layers start as pass-through; see forward().
	        self.is_duplicate = layer_idx >= first_duplicate
	        if self.is_duplicate:
	            self.layer_gate = nn.Parameter(torch.tensor(0.0))

	        # Descriptive label only; None for layers without a PFC mapping.
	        self.pfc_region = self._PFC_REGIONS.get(layer_idx, None)

	        # SSM side-car (only at the configured layers)
	        self.has_ssm = layer_idx in config.get("ssm_layers", [])
	        if self.has_ssm:
	            self.ssm = SSMBlock(
	                d_model=config["hidden_size"],
	                state_dim=config.get("ssm_state_dim", 64),
	                dt=config.get("ssm_dt", 0.01),
	                init_std=config.get("side_car_init_std", 1e-5),
	            )
	            self.ssm.brain_region = self._SSM_BRAIN_REGIONS.get(layer_idx, f"SSM@L{layer_idx}")

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        position_ids: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
	        use_cache: bool = False,
	    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
	        """Run attention and MLP sub-blocks, then the optional gate and SSM.

	        Returns:
	            (hidden_states, new_kv) where new_kv is whatever ``self_attn``
	            returned for the cache.
	        """
	        # Kept for both the attention residual and the duplicate-layer gate.
	        layer_input = hidden_states

	        # Pre-norm attention with residual connection
	        attn_out, new_kv = self.self_attn(
	            self.input_layernorm(layer_input), position_ids, attention_mask,
	            past_key_value=past_key_value, use_cache=use_cache,
	        )
	        after_attn = layer_input + attn_out

	        # Pre-norm MLP with residual connection
	        mlp_out = self.mlp(self.post_attention_layernorm(after_attn))
	        output = after_attn + mlp_out

	        # Duplicate layer gate:
	        #   output = input + gate * (layer_output - input)
	        # with gate = g * sigmoid(g). At g=0 the gate is 0 and the layer is
	        # skipped entirely; as g moves away from 0 the layer contributes.
	        if self.is_duplicate:
	            gate = self.layer_gate * torch.sigmoid(self.layer_gate)
	            output = layer_input + gate * (output - layer_input)

	        # SSM side-car (additive), applied after the gate
	        if self.has_ssm:
	            output = output + self.ssm(output)

	        return output, new_kv


	# ── Full Model ───────────────────────────────────────────────────────────────

	class EylaBackbone(nn.Module):
	    """
	    Eyla V2 Custom Hybrid Backbone.

	    Llama-3.2-1B compatible for weight transplant, with custom extensions:
	    - SSM side-cars (HiPPO init, zero-gated on day 1)
	    - Heuristic surprise gates
	    - Heuristic early exit
	    - Heuristic complexity estimator

	    The model works on day 1 after weight transplant with zero training.

	    Args:
	        config: model configuration dict; a copy of EYLA_V2_CONFIG is used
	            when it is None (or empty).
	    """

	    def __init__(self, config: Optional[Dict[str, Any]] = None):
	        super().__init__()
	        self.config = config or EYLA_V2_CONFIG.copy()

	        hidden_size = self.config["hidden_size"]
	        num_layers = self.config["num_layers"]
	        vocab_size = self.config["vocab_size"]

	        # Embeddings (matches Llama naming for transplant)
	        self.token_embedding = nn.Embedding(vocab_size, hidden_size)

	        # Transformer layers
	        self.layers = nn.ModuleList([
	            TransformerLayer(self.config, layer_idx=i)
	            for i in range(num_layers)
	        ])

	        # Final norm
	        self.final_norm = RMSNorm(hidden_size, self.config.get("rms_norm_eps", 1e-5))

	        # Output head
	        if self.config.get("tie_word_embeddings", True):
	            self.lm_head = None  # Use token_embedding.weight
	        else:
	            self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

	        # Memory compressor: a single linear map to 256-d, kept for
	        # compatibility with MemoryRetriever.
	        self.memory_compressor = nn.Linear(hidden_size, 256, bias=False)
	        nn.init.normal_(self.memory_compressor.weight, std=self.config.get("side_car_init_std", 1e-5))

	        # Memory agents at layers 7 and 15: predict expected hidden state.
	        # Comparison of predicted vs actual = surprise signal for online learning.
	        # Lazy-initialized via enable_memory_agents() to avoid OOM during model construction.
	        self.memory_agent_layers = [7, 15]
	        self.memory_agents = None
	        self._memory_agent_predictions = {}

	        # Heuristic gates (NOT nn.Module — no parameters)
	        self.gates = HeuristicGates(
	            surprise_threshold=self.config.get("surprise_threshold", 4.0),
	            exit_confidence=self.config.get("early_exit_confidence", 0.9),
	            exit_min_layers=self.config.get("early_exit_min_layers", 4),
	        )

	        # Brain orchestrator (disabled by default — call enable_brain() to activate)
	        self.brain = None

	    def get_lm_head_weight(self) -> torch.Tensor:
	        """Get the output projection weight (handles tied embeddings)."""
	        if self.lm_head is not None:
	            return self.lm_head.weight
	        return self.token_embedding.weight

	    def embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
	        """Raw token embeddings (before any transformer layers)."""
	        return self.token_embedding(input_ids)

	    def enable_brain(self, config: Optional[Dict[str, Any]] = None):
	        """
	        Activate the brain orchestrator (86 brain systems).

	        All gates start at 0 → day-1 identity preserved.
	        Brain params are trainable; donor params should be frozen separately.

	        Args:
	            config: optional configuration forwarded to BrainOrchestrator.
	        """
	        from .brain_orchestrator import BrainOrchestrator
	        self.brain = BrainOrchestrator(
	            d_model=self.config["hidden_size"],
	            state_dim=self.config.get("ssm_state_dim", 64),
	            config=config,
	        )
	        # forward() feeds the brain tensors that live on the model's device;
	        # a freshly built module sits on CPU, so follow the embeddings.
	        if isinstance(self.brain, nn.Module):
	            self.brain.to(self.token_embedding.weight.device)
	        brain_summary = self.brain.param_summary()
	        logger.info(
	            f"Brain enabled: {brain_summary['total_brain_params']:,} params "
	            f"(gates: {brain_summary['gate_params']}, "
	            f"nn_modules: {brain_summary['nn_module_params']:,})"
	        )

	    def enable_memory_agents(self):
	        """Initialize memory agents at layers 7 and 15 (call after model load to avoid OOM).

	        Each agent is a bias-free bottleneck MLP
	        (hidden -> 128 -> 128 -> hidden, SiLU in between) whose last layer is
	        initialised with ``side_car_init_std``. The agents stay in float32
	        (forward() feeds them ``hidden_states.float()``) and are placed on
	        the same device as the token embeddings.
	        """
	        hidden_size = self.config["hidden_size"]
	        bottleneck = 128
	        init_std = self.config.get("side_car_init_std", 1e-5)
	        self.memory_agents = nn.ModuleDict({
	            str(layer_idx): nn.Sequential(
	                nn.Linear(hidden_size, bottleneck, bias=False),
	                nn.SiLU(),
	                nn.Linear(bottleneck, bottleneck, bias=False),
	                nn.SiLU(),
	                nn.Linear(bottleneck, hidden_size, bias=False),
	            ) for layer_idx in self.memory_agent_layers
	        })
	        for key in self.memory_agents:
	            nn.init.normal_(self.memory_agents[key][-1].weight, std=init_std)
	        # This is documented to run after the model is loaded (and usually
	        # moved to an accelerator); without the move, forward() would mix
	        # CPU agent weights with device-resident activations.
	        self.memory_agents.to(self.token_embedding.weight.device)
	        total = sum(p.numel() for p in self.memory_agents.parameters())
	        logger.info(f"Memory agents enabled at layers {self.memory_agent_layers}: {total:,} params")

	    def decode_from_hidden(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        start_layer: int = 0,
	    ) -> torch.Tensor:
	        """
	        Run transformer layers from start_layer onward, then output logits.

	        Used by MemConsistencyLoss for teacher pass (memory-augmented decode).
	        No KV cache, brain hooks, or memory agents are involved here.

	        Args:
	            hidden_states: (B, S, d_model)
	            attention_mask: (B, S) — 1=attend, 0=pad
	            start_layer: skip layers before this index

	        Returns:
	            logits: (B, S, vocab_size)
	        """
	        B, S, _ = hidden_states.shape
	        position_ids = torch.arange(S, device=hidden_states.device).unsqueeze(0).expand(B, S)

	        for i, layer in enumerate(self.layers):
	            if i >= start_layer:
	                hidden_states, _ = layer(hidden_states, position_ids, attention_mask)

	        hidden_states = self.final_norm(hidden_states)
	        # nan_to_num: safety net for random-weight initialization;
	        # never triggers with real donor weights
	        hidden_states = torch.nan_to_num(hidden_states)
	        return hidden_states @ self.get_lm_head_weight().T

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        early_exit: bool = False,
	        return_hidden_states: bool = False,
	        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
	        use_cache: bool = False,
	    ) -> Dict[str, Any]:
	        """
	        Full forward pass.

	        Args:
	            input_ids: (B, S) input token IDs
	            attention_mask: (B, S) 1=attend, 0=pad
	            early_exit: enable heuristic early exit
	            return_hidden_states: return per-layer hidden states
	            past_key_values: list of (K, V) tuples per layer for KV cache;
	                None or an empty list means "no cache"
	            use_cache: if True, return new key_values for caching

	        Returns:
	            dict with:
	                logits: (B, S, vocab_size)
	                exit_layer: int — index of the last layer that ran
	                complexity: value returned by the complexity estimator
	                last_hidden_state: (B, S, d_model) after the final norm
	                brain_state: result of brain.post_forward (only when the
	                    brain is enabled and it returned something)
	                hidden_states: list of (B, S, d_model) per layer (if requested)
	                past_key_values: list of (K, V) tuples (if use_cache)

	        NOTE(review): when ``early_exit`` fires with ``use_cache=True`` the
	        returned cache only covers the layers that ran, so it cannot be fed
	        back into a later full-depth call — confirm callers never combine
	        the two.
	        """
	        B, S = input_ids.shape
	        device = input_ids.device

	        # Embeddings
	        hidden_states = self.token_embedding(input_ids)

	        # Position IDs — offset by past sequence length for KV cache.
	        # Truthiness (not "is not None") so an empty list behaves like None
	        # instead of raising IndexError on past_key_values[0].
	        has_past = bool(past_key_values)
	        past_len = past_key_values[0][0].shape[2] if has_past else 0
	        position_ids = torch.arange(past_len, past_len + S, device=device).unsqueeze(0).expand(B, -1)

	        # Estimate complexity from initial embeddings
	        complexity = self.gates.complexity.estimate(hidden_states)

	        # ── Brain hook 1: pre_layers ─────────────────────────────────────
	        # Brain modules are called in float32; results are cast back.
	        if self.brain is not None:
	            orig_dtype = hidden_states.dtype
	            hidden_states = self.brain.pre_layers(hidden_states.float()).to(orig_dtype)

	        # Process through layers
	        all_hidden_states = [] if return_hidden_states else None
	        new_key_values = [] if use_cache else None
	        exit_layer = len(self.layers) - 1
	        lm_head_weight = self.get_lm_head_weight()

	        # Reset per-call bookkeeping read by get_memory_agent_surprise().
	        self._memory_agent_predictions = {}

	        for i, layer in enumerate(self.layers):
	            runs_agent = self.memory_agents is not None and i in self.memory_agent_layers

	            # Memory agent: predict expected hidden state BEFORE this layer
	            if runs_agent:
	                pred = self.memory_agents[str(i)](hidden_states.float()).to(hidden_states.dtype)
	                self._memory_agent_predictions[i] = pred

	            past_kv = past_key_values[i] if has_past else None
	            hidden_states, layer_kv = layer(
	                hidden_states, position_ids, attention_mask,
	                past_key_value=past_kv, use_cache=use_cache,
	            )

	            # Memory agent: store actual hidden state AFTER this layer for surprise
	            if runs_agent:
	                self._memory_agent_predictions[f"{i}_actual"] = hidden_states.detach()

	            # ── Brain hook 2: after_layer ────────────────────────────────
	            if self.brain is not None:
	                ssm_hidden = None
	                if layer.has_ssm and hasattr(layer.ssm, 'last_hidden'):
	                    ssm_hidden = layer.ssm.last_hidden
	                orig_dtype = hidden_states.dtype
	                ssm_f = ssm_hidden.float() if ssm_hidden is not None else None
	                hidden_states = self.brain.after_layer(i, hidden_states.float(), ssm_f).to(orig_dtype)

	            if use_cache:
	                new_key_values.append(layer_kv)

	            if return_hidden_states:
	                all_hidden_states.append(hidden_states.detach())

	            # Early exit check (heuristic — no learned params); never on the
	            # last layer, where there is nothing left to skip.
	            if early_exit and i < len(self.layers) - 1:
	                should_exit, _confidence = self.gates.early_exit.should_exit(
	                    hidden_states, lm_head_weight, i
	                )
	                if should_exit:
	                    exit_layer = i
	                    break

	        # Final norm + output projection
	        hidden_states = self.final_norm(hidden_states)
	        # nan_to_num: safety for random-weight init; never triggers with donor weights
	        hidden_states = torch.nan_to_num(hidden_states)
	        logits = hidden_states @ lm_head_weight.T

	        # ── Brain hook 3: post_forward ───────────────────────────────────
	        brain_state = None
	        if self.brain is not None:
	            brain_state = self.brain.post_forward(logits.float(), hidden_states.float())

	        result = {
	            "logits": logits,
	            "exit_layer": exit_layer,
	            "complexity": complexity,
	            "last_hidden_state": hidden_states,
	        }

	        if brain_state is not None:
	            result["brain_state"] = brain_state

	        if return_hidden_states:
	            result["hidden_states"] = all_hidden_states

	        if use_cache:
	            result["past_key_values"] = new_key_values

	        return result

	    def get_memory_agent_surprise(self) -> Dict[int, float]:
	        """Get surprise values from last forward pass (predicted vs actual MSE per layer).

	        Layers without both a stored prediction and a stored actual state are
	        omitted, so the result is empty when memory agents are disabled.
	        """
	        surprises = {}
	        for layer_idx in self.memory_agent_layers:
	            pred = self._memory_agent_predictions.get(layer_idx)
	            actual = self._memory_agent_predictions.get(f"{layer_idx}_actual")
	            if pred is not None and actual is not None:
	                surprises[layer_idx] = F.mse_loss(pred.float(), actual.float()).item()
	        return surprises

	    def compress_memory(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """
	        Compress hidden states for memory storage.

	        Args:
	            hidden_states: (B, S, d_model) or (B, d_model)

	        Returns:
	            (B, 256) compressed memory vector
	        """
	        if hidden_states.dim() == 3:
	            # Use last token's hidden state
	            hidden_states = hidden_states[:, -1, :]
	        return self.memory_compressor(hidden_states)

	    @torch.no_grad()
	    def generate(
	        self,
	        input_ids: torch.Tensor,
	        max_new_tokens: int = 50,
	        temperature: float = 0.8,
	        top_p: float = 0.9,
	        repetition_penalty: float = 1.3,
	        eos_token_id: Optional[int] = 128001,
	    ) -> torch.Tensor:
	        """
	        Autoregressive generation with KV cache for fast inference.

	        Args:
	            input_ids: (B, S) starting tokens
	            max_new_tokens: how many tokens to generate at most
	            temperature: sampling temperature; 0 selects greedy decoding
	            top_p: nucleus sampling threshold
	            repetition_penalty: penalize repeated tokens (1.0 = off, >1.0 = penalize),
	                applied independently to every row of the batch
	            eos_token_id: token that ends a sequence (128001 for Llama 3.2);
	                None disables EOS handling

	        Returns:
	            (B, S + n) generated tokens with n <= max_new_tokens. Generation
	            stops early once every row has produced EOS; rows that finish
	            before the others are padded with ``eos_token_id``.
	        """
	        generated = input_ids.clone()
	        finished = torch.zeros(generated.shape[0], dtype=torch.bool, device=generated.device)

	        # Prefill: process entire prompt, cache KV states
	        outputs = self.forward(generated, use_cache=True)
	        past_key_values = outputs["past_key_values"]
	        # Sample in float32 regardless of the model's working dtype.
	        next_logits = outputs["logits"][:, -1, :].float()

	        for _ in range(max_new_tokens):
	            # Repetition penalty, before temperature. Every token already in
	            # a row has its score shrunk towards "less likely": positive
	            # scores are divided, non-positive ones multiplied. gather/scatter
	            # handles all rows at once (previously only row 0 was penalised).
	            if repetition_penalty != 1.0:
	                seen_scores = next_logits.gather(-1, generated)
	                seen_scores = torch.where(
	                    seen_scores > 0,
	                    seen_scores / repetition_penalty,
	                    seen_scores * repetition_penalty,
	                )
	                next_logits = next_logits.scatter(-1, generated, seen_scores)

	            if temperature == 0:
	                # Dividing by zero would produce inf/NaN probabilities;
	                # the zero-temperature limit is argmax.
	                next_token = next_logits.argmax(dim=-1, keepdim=True)
	            else:
	                # Apply temperature
	                next_logits = next_logits / temperature

	                # Top-p (nucleus) sampling: drop every token whose preceding
	                # cumulative mass already exceeds top_p (the best token is
	                # therefore always kept).
	                sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
	                sorted_probs = F.softmax(sorted_logits, dim=-1)
	                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
	                outside_nucleus = cumulative_probs - sorted_probs > top_p
	                sorted_logits = sorted_logits.masked_fill(outside_nucleus, float("-inf"))

	                probs = F.softmax(sorted_logits, dim=-1)
	                next_token_sorted = torch.multinomial(probs, num_samples=1)
	                next_token = sorted_indices.gather(-1, next_token_sorted)

	            if eos_token_id is not None:
	                # Rows that already ended keep emitting EOS as padding.
	                next_token = next_token.masked_fill(finished.unsqueeze(-1), eos_token_id)
	                finished = finished | (next_token.squeeze(-1) == eos_token_id)

	            generated = torch.cat([generated, next_token], dim=-1)

	            # Stop once every row has produced EOS
	            if eos_token_id is not None and bool(finished.all()):
	                break

	            # Decode step: only process the new token, reuse cached KV
	            outputs = self.forward(next_token, past_key_values=past_key_values, use_cache=True)
	            past_key_values = outputs["past_key_values"]
	            next_logits = outputs["logits"][:, -1, :].float()

	        return generated

	    def get_side_car_params(self) -> List[nn.Parameter]:
	        """Get all side-car parameters (for online learning), including brain params."""
	        params = []
	        for layer in self.layers:
	            if hasattr(layer, "ssm") and layer.has_ssm:
	                params.extend(layer.ssm.parameters())
	            # Layer gates for duplicate layers (16-23) must be trainable
	            if layer.is_duplicate and hasattr(layer, "layer_gate"):
	                params.append(layer.layer_gate)
	        params.extend(self.memory_compressor.parameters())
	        # Memory agent params (layers 7, 15) — when enabled
	        if self.memory_agents is not None:
	            params.extend(self.memory_agents.parameters())
	        # Brain orchestrator params (when enabled)
	        if self.brain is not None:
	            params.extend(self.brain.get_brain_params())
	        return params

	    def get_donor_params(self) -> List[nn.Parameter]:
	        """Get all donor (transplanted) parameters, i.e. everything that is not a side-car."""
	        # Compare by identity: tensor equality would be element-wise.
	        side_car_ids = {id(p) for p in self.get_side_car_params()}
	        return [p for p in self.parameters() if id(p) not in side_car_ids]

	    def freeze_donor(self):
	        """Freeze all donor parameters (requires_grad=False)."""
	        for p in self.get_donor_params():
	            p.requires_grad = False
	        logger.info("Frozen all donor parameters")

	    def unfreeze_side_cars(self):
	        """Ensure side-car parameters are trainable."""
	        for p in self.get_side_car_params():
	            p.requires_grad = True
	        logger.info("Side-car parameters set to trainable")

	    def param_summary(self) -> Dict[str, int]:
	        """Count parameters by category.

	        Returns:
	            dict with ``total``, ``trainable``, ``frozen``, ``donor``
	            (total minus side-car) and ``side_car`` element counts.
	        """
	        total = sum(p.numel() for p in self.parameters())
	        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
	        side_car = sum(p.numel() for p in self.get_side_car_params())
	        donor = total - side_car
	        return {
	            "total": total,
	            "trainable": trainable,
	            "frozen": total - trainable,
	            "donor": donor,
	            "side_car": side_car,
	        }


	def create_eyla_v2(config: Optional[Dict[str, Any]] = None) -> EylaBackbone:
	    """Build an EylaBackbone from ``config`` and log its parameter counts.

	    Args:
	        config: passed straight to ``EylaBackbone``; None selects its default.

	    Returns:
	        The newly constructed model.
	    """
	    backbone = EylaBackbone(config)
	    counts = backbone.param_summary()
	    logger.info(
	        f"Created Eyla V2: {counts['total']:,} params "
	        f"(donor: {counts['donor']:,}, side-car: {counts['side_car']:,})"
	    )
	    return backbone