OpenTransformer
/

AGILLM-3-large

Model card Files Files and versions

xet

Community

OpenTransformer committed 26 days ago

Commit

a1e7fdb

verified ·

1 Parent(s): 2db758d

Add experiments/n_ultra.py

Browse files

Files changed (1) hide show

experiments/n_ultra.py +715 -0

experiments/n_ultra.py ADDED Viewed

	@@ -0,0 +1,715 @@

+#!/usr/bin/env python3
+"""
+n_ultra.py — ULTRA Heavy Attention Experiments
+Mechanisms that are borderline impractical but theoretically interesting
+1. Neural Turing Machine (NTM) - Full differentiable computer
+2. Energy-Based Attention - Iterative energy minimization
+3. Cross-Layer Attention Lattice - Every layer attends to all others
+4. Full N-Body Dynamics - Physics-inspired message passing
+5. Hypernetwork Attention - Generate attention weights with another network
+6. Differentiable Sorting Attention - Soft permutation of tokens before attention
+(Continuous Depth / Neural ODE - infinite depth limit - is listed as an idea only;
+this file contains no implementation of it.)
+"""
+from __future__ import annotations
+import argparse, math, time
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch.backends.cuda.matmul.allow_tf32 = True
+try:
+    torch.set_float32_matmul_precision("high")
+except:
+    pass
+VOCAB = 128256
+def _alibi_slopes(n_heads: int):
+    def pow2slopes(n):
+        start = 2 ** (-2 ** -(math.log2(n) - 3))
+        return [start * (start ** i) for i in range(n)]
+    if n_heads > 0 and math.log2(n_heads).is_integer():
+        vals = pow2slopes(n_heads)
+    else:
+        closest = 2 ** math.floor(math.log2(max(1, n_heads)))
+        vals = pow2slopes(closest)
+        extra = pow2slopes(2 * closest)
+        vals += extra[0::2][:n_heads - closest]
+    return torch.tensor(vals, device=DEV).view(1, n_heads, 1, 1)
+def alibi_bias(n_heads: int, n_tokens: int):
+    i = torch.arange(n_tokens, device=DEV).view(1, 1, n_tokens, 1)
+    j = torch.arange(n_tokens, device=DEV).view(1, 1, 1, n_tokens)
+    dist = (j - i).clamp_min(0).float()
+    slopes = _alibi_slopes(n_heads)
+    return -slopes * dist
+def causal_mask(n):
+    return torch.triu(torch.full((1, 1, n, n), float("-inf"), device=DEV), 1)
+# ═══════════════════════════════════════════════════════════════
+# BASELINE
+# ═══════════════════════════════════════════════════════════════
+class StandardAttention(nn.Module):
+    def __init__(self, d: int, h: int):
+        super().__init__()
+        self.h, self.dk = h, d // h
+        self.qkv = nn.Linear(d, 3 * d, bias=False)
+        self.proj = nn.Linear(d, d, bias=False)
+    def forward(self, x, mask=None, **kwargs):
+        B, N, _ = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.h, self.dk).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        att = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
+        att = att + alibi_bias(self.h, N)
+        if mask is not None:
+            att = att + mask
+        z = (att.softmax(-1) @ v).transpose(1, 2).reshape(B, N, -1)
+        return self.proj(z)
+# ═══════════════════════════════════════════════════════════════
+# ULTRA 1: Neural Turing Machine (NTM)
+# Full differentiable computer with external memory + read/write heads
+# ═══════════════════════════════════════════════════════════════
+class NTMAttention(nn.Module):
+    """
+    Neural Turing Machine: external memory matrix with content + location addressing.
+    Each forward pass:
+    1. Read from memory using attention over memory slots
+    2. Process with self-attention augmented by memory
+    3. Write to memory using learned write weights
+    Memory operations are fully differentiable.
+    O(n² + n*M*read_heads + M*write_ops)
+    """
+    def __init__(self, d: int, h: int, mem_slots: int = 128, num_heads: int = 4):
+        super().__init__()
+        self.d = d
+        self.h, self.dk = h, d // h
+        self.mem_slots = mem_slots
+        self.num_read_heads = num_heads
+        # Memory (persistent across sequence, reset per batch)
+        self.mem_init = nn.Parameter(torch.randn(1, mem_slots, d) * 0.01)
+        # Read heads - content-based addressing
+        self.read_key = nn.Linear(d, d * num_heads)
+        self.read_beta = nn.Linear(d, num_heads)  # Sharpening
+        self.read_gate = nn.Linear(d, num_heads)  # Interpolation gate
+        self.read_shift = nn.Linear(d, num_heads * 3)  # Location shift (-1, 0, +1)
+        # Write head
+        self.write_key = nn.Linear(d, d)
+        self.write_beta = nn.Linear(d, 1)
+        self.erase_vec = nn.Linear(d, d)
+        self.add_vec = nn.Linear(d, d)
+        # Standard attention components
+        self.qkv = nn.Linear(d, 3 * d, bias=False)
+        self.proj = nn.Linear(d * 2, d, bias=False)  # Concat self-attn + read
+    def _content_addressing(self, memory, keys, betas):
+        """Compute attention weights based on content similarity"""
+        # memory: (B, M, D), keys: (B, N, H, D), betas: (B, N, H)
+        B, M, D = memory.shape
+        _, N, H, _ = keys.shape
+        # Cosine similarity
+        mem_norm = F.normalize(memory, dim=-1)  # (B, M, D)
+        key_norm = F.normalize(keys, dim=-1)  # (B, N, H, D)
+        # (B, N, H, D) @ (B, D, M) -> (B, N, H, M)
+        sim = torch.einsum('bnhd,bmd->bnhm', key_norm, mem_norm)
+        # Sharpen with beta
+        weights = F.softmax(betas.unsqueeze(-1) * sim, dim=-1)  # (B, N, H, M)
+        return weights
+    def _location_shift(self, weights, shift_logits):
+        """Convolutional shift for location-based addressing"""
+        B, N, H, M = weights.shape
+        shift = F.softmax(shift_logits.view(B, N, H, 3), dim=-1)  # (B, N, H, 3)
+        # Manual circular shift instead of padding
+        shifted = torch.zeros_like(weights)
+        shifted += shift[:, :, :, 0:1] * torch.roll(weights, 1, dims=-1)  # left
+        shifted += shift[:, :, :, 1:2] * weights  # center
+        shifted += shift[:, :, :, 2:3] * torch.roll(weights, -1, dims=-1)  # right
+        return shifted
+    def forward(self, x, mask=None, **kwargs):
+        B, N, D = x.shape
+        # Initialize memory for this batch
+        memory = self.mem_init.expand(B, -1, -1).clone()  # (B, M, D)
+        # === READ OPERATION ===
+        read_keys = self.read_key(x).view(B, N, self.num_read_heads, D)
+        read_betas = F.softplus(self.read_beta(x))  # (B, N, H)
+        read_gates = torch.sigmoid(self.read_gate(x))  # (B, N, H)
+        read_shifts = self.read_shift(x)  # (B, N, H*3)
+        # Content-based weights
+        content_weights = self._content_addressing(memory, read_keys, read_betas)
+        # Location-based shift
+        shifted_weights = self._location_shift(content_weights, read_shifts)
+        # Interpolate (simplified - just use content weights)
+        read_weights = content_weights  # (B, N, H, M)
+        # Read from memory
+        # (B, N, H, M) @ (B, M, D) -> (B, N, H, D)
+        read_vectors = torch.einsum('bnhm,bmd->bnhd', read_weights, memory)
+        read_out = read_vectors.mean(dim=2)  # Average across heads (B, N, D)
+        # === SELF-ATTENTION ===
+        qkv = self.qkv(x).reshape(B, N, 3, self.h, self.dk).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        att = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
+        att = att + alibi_bias(self.h, N)
+        if mask is not None:
+            att = att + mask
+        self_out = (att.softmax(-1) @ v).transpose(1, 2).reshape(B, N, -1)
+        # === WRITE OPERATION ===
+        write_key = self.write_key(x[:, -1:, :])  # Use last position (B, 1, D)
+        write_beta = F.softplus(self.write_beta(x[:, -1:, :]))
+        write_weights = self._content_addressing(
+            memory,
+            write_key.unsqueeze(2),  # (B, 1, 1, D)
+            write_beta.squeeze(-1).unsqueeze(-1)  # (B, 1, 1)
+        ).squeeze(2)  # (B, 1, M)
+        # Erase and add
+        erase = torch.sigmoid(self.erase_vec(x[:, -1:, :]))  # (B, 1, D)
+        add = self.add_vec(x[:, -1:, :])  # (B, 1, D)
+        # Memory update (for next call - not used in this forward)
+        # memory = memory * (1 - write_weights.transpose(-1,-2) @ erase)
+        # memory = memory + write_weights.transpose(-1,-2) @ add
+        # Combine self-attention and memory read
+        combined = torch.cat([self_out, read_out], dim=-1)
+        return self.proj(combined)
+# ═══════════════════════════════════════════════════════════════
+# ULTRA 2: Energy-Based Attention
+# Iterative energy minimization instead of single softmax
+# ═══════════════════════════════════════════════════════════════
+class EnergyAttention(nn.Module):
+    """
+    Energy-based model for attention: find attention weights that minimize energy.
+    E(a, q, k, v) = -sum(a_ij * sim(q_i, k_j)) + entropy(a) + prior
+    Iterate gradient descent on attention weights until convergence.
+    Much heavier than softmax but potentially more expressive.
+    O(iters * n²)
+    """
+    def __init__(self, d: int, h: int, num_iters: int = 10, step_size: float = 0.5):
+        super().__init__()
+        self.h, self.dk = h, d // h
+        self.num_iters = num_iters
+        self.step_size = step_size
+        self.qkv = nn.Linear(d, 3 * d, bias=False)
+        self.proj = nn.Linear(d, d, bias=False)
+        # Learnable energy function parameters
+        self.energy_scale = nn.Parameter(torch.ones(h))
+        self.temperature = nn.Parameter(torch.ones(h) * 0.1)
+    def _compute_energy(self, attn_logits, attn_weights, mask):
+        """
+        Energy = -similarity + temperature * entropy
+        Lower energy = better attention pattern
+        """
+        # Similarity term (want to maximize, so negate)
+        sim_energy = -attn_logits * attn_weights
+        # Entropy regularization (encourage sharpness)
+        entropy = -attn_weights * torch.log(attn_weights + 1e-10)
+        # Total energy per head
+        temp = self.temperature.view(1, -1, 1, 1)
+        energy = sim_energy.sum(dim=-1) + temp * entropy.sum(dim=-1)
+        return energy.mean()
+    def forward(self, x, mask=None, **kwargs):
+        B, N, _ = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.h, self.dk).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        # Initial attention logits
+        scale = self.energy_scale.view(1, -1, 1, 1)
+        attn_logits = scale * (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
+        attn_logits = attn_logits + alibi_bias(self.h, N)
+        if mask is not None:
+            attn_logits = attn_logits + mask
+        # Initialize attention weights with softmax
+        attn_weights = F.softmax(attn_logits, dim=-1)
+        # Iterative refinement via energy minimization
+        for _ in range(self.num_iters):
+            # Compute gradient of energy w.r.t. attention weights
+            # Simplified: use attention logits as gradient signal
+            # Energy gradient approximation
+            with torch.enable_grad():
+                attn_weights_param = attn_weights.detach().requires_grad_(True)
+                energy = self._compute_energy(attn_logits, attn_weights_param, mask)
+                grad = torch.autograd.grad(energy, attn_weights_param)[0]
+            # Gradient step in logit space
+            attn_logits_new = attn_logits - self.step_size * grad
+            # Project back to valid distribution
+            if mask is not None:
+                attn_logits_new = attn_logits_new + mask
+            attn_weights = F.softmax(attn_logits_new, dim=-1)
+        z = (attn_weights @ v).transpose(1, 2).reshape(B, N, -1)
+        return self.proj(z)
+# ═══════════════════════════════════════════════════════════════
+# ULTRA 3: Cross-Layer Attention Lattice
+# Every layer can attend to outputs of ALL other layers
+# ═══════════════════════════════════════════════════════════════
+class LatticeAttention(nn.Module):
+    """
+    Instead of sequential layers, create a lattice where each layer
+    can attend to all other layers' outputs.
+    Requires storing all layer outputs and recomputing.
+    O(L² * n²) where L = number of layers
+    This is implemented at the model level, not attention level.
+    """
+    def __init__(self, d: int, h: int, cross_layers: int = 4):
+        super().__init__()
+        self.h, self.dk = h, d // h
+        self.cross_layers = cross_layers
+        # Self-attention
+        self.qkv = nn.Linear(d, 3 * d, bias=False)
+        # Cross-layer attention (query current, key/value from other layers)
+        self.cross_q = nn.Linear(d, d, bias=False)
+        self.cross_kv = nn.Linear(d, 2 * d, bias=False)
+        # Combine self and cross
+        self.proj = nn.Linear(d * 2, d, bias=False)
+        # Store for lattice
+        self.layer_outputs = None
+    def forward(self, x, mask=None, layer_idx=0, all_layers=None, **kwargs):
+        B, N, _ = x.shape
+        # Self-attention
+        qkv = self.qkv(x).reshape(B, N, 3, self.h, self.dk).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        att = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
+        att = att + alibi_bias(self.h, N)
+        if mask is not None:
+            att = att + mask
+        self_out = (att.softmax(-1) @ v).transpose(1, 2).reshape(B, N, -1)
+        # Cross-layer attention (if we have other layer outputs)
+        if all_layers is not None and len(all_layers) > 0:
+            # Stack all previous layer outputs
+            stacked = torch.stack(all_layers, dim=2)  # (B, N, L, D)
+            B, N, L, D = stacked.shape
+            # Query from current, key/value from all layers
+            cross_q = self.cross_q(x).view(B, N, self.h, self.dk)  # (B, N, H, dk)
+            # Reshape for cross attention
+            stacked_flat = stacked.view(B, N * L, D)
+            cross_kv = self.cross_kv(stacked_flat).view(B, N * L, 2, self.h, self.dk)
+            cross_k, cross_v = cross_kv[:, :, 0], cross_kv[:, :, 1]
+            # Cross attention
+            cross_q = cross_q.transpose(1, 2)  # (B, H, N, dk)
+            cross_k = cross_k.view(B, N * L, self.h, self.dk).transpose(1, 2)
+            cross_v = cross_v.view(B, N * L, self.h, self.dk).transpose(1, 2)
+            cross_att = (cross_q @ cross_k.transpose(-1, -2)) / math.sqrt(self.dk)
+            cross_out = (cross_att.softmax(-1) @ cross_v).transpose(1, 2).reshape(B, N, -1)
+        else:
+            cross_out = torch.zeros_like(self_out)
+        combined = torch.cat([self_out, cross_out], dim=-1)
+        return self.proj(combined)
+# ═══════════════════════════════════════════════════════════════
+# ULTRA 4: N-Body Dynamics Attention
+# Treat tokens as particles with forces between them
+# ═══════════════════════════════════════════════════════════════
+class NBodyAttention(nn.Module):
+    """
+    Physics-inspired: tokens are particles with forces.
+    Simplified version that avoids shape complexity.
+    """
+    def __init__(self, d: int, h: int, num_steps: int = 5, dt: float = 0.1):
+        super().__init__()
+        self.d = d
+        self.num_steps = num_steps
+        self.dt = dt
+        self.to_pos = nn.Linear(d, d)
+        self.to_vel = nn.Linear(d, d)
+        # Simplified force: pairwise similarity drives attraction
+        self.force_scale = nn.Parameter(torch.ones(1) * 0.1)
+        self.out_proj = nn.Linear(d * 2, d)
+    def forward(self, x, mask=None, **kwargs):
+        B, N, D = x.shape
+        pos = self.to_pos(x)
+        vel = self.to_vel(x)
+        # Causal mask
+        causal = torch.triu(torch.ones(N, N, device=x.device), diagonal=1)
+        causal_mask = 1.0 - causal  # (N, N) lower triangular
+        for _ in range(self.num_steps):
+            # Pairwise distances
+            pos_diff = pos.unsqueeze(2) - pos.unsqueeze(1)  # (B, N, N, D)
+            dist_sq = (pos_diff ** 2).sum(-1, keepdim=True) + 1e-6  # (B, N, N, 1)
+            # Force proportional to 1/distance (like gravity)
+            force_dir = pos_diff / (dist_sq.sqrt() + 1e-6)  # (B, N, N, D)
+            force_mag = self.force_scale / dist_sq  # (B, N, N, 1)
+            forces = force_dir * force_mag  # (B, N, N, D)
+            # Apply causal mask
+            forces = forces * causal_mask.view(1, N, N, 1)
+            # Sum forces
+            total_force = forces.sum(dim=2)  # (B, N, D)
+            # Update
+            vel = vel + self.dt * total_force
+            pos = pos + self.dt * vel
+        out = torch.cat([pos, vel], dim=-1)
+        return self.out_proj(out)
+# ═══════════════════════════════════════════════════════════════
+# ULTRA 5: Hypernetwork Attention
+# A separate network generates the attention weights
+# ═══════════════════════════════════════════════════════════════
+class HyperAttention(nn.Module):
+    """
+    Instead of QK^T -> softmax, use a hypernetwork to generate attention.
+    The hypernetwork takes (query_token, key_token) and outputs attention weight.
+    Much more expressive but O(n² * hypernetwork_cost).
+    """
+    def __init__(self, d: int, h: int, hyper_hidden: int = 64):
+        super().__init__()
+        self.h, self.dk = h, d // h
+        self.to_q = nn.Linear(d, d, bias=False)
+        self.to_k = nn.Linear(d, d, bias=False)
+        self.to_v = nn.Linear(d, d, bias=False)
+        # Hypernetwork: generates attention weight from (q, k) pair
+        self.hypernet = nn.Sequential(
+            nn.Linear(self.dk * 2, hyper_hidden),
+            nn.SiLU(),
+            nn.Linear(hyper_hidden, hyper_hidden),
+            nn.SiLU(),
+            nn.Linear(hyper_hidden, 1)
+        )
+        self.proj = nn.Linear(d, d, bias=False)
+    def forward(self, x, mask=None, **kwargs):
+        B, N, _ = x.shape
+        q = self.to_q(x).view(B, N, self.h, self.dk)  # (B, N, H, dk)
+        k = self.to_k(x).view(B, N, self.h, self.dk)
+        v = self.to_v(x).view(B, N, self.h, self.dk)
+        # Compute attention via hypernetwork
+        # Need to process all (i, j) pairs
+        attn_logits = torch.zeros(B, self.h, N, N, device=x.device)
+        for head in range(self.h):
+            q_h = q[:, :, head, :]  # (B, N, dk)
+            k_h = k[:, :, head, :]
+            # Expand for pairwise
+            q_exp = q_h.unsqueeze(2).expand(-1, -1, N, -1)  # (B, N, N, dk)
+            k_exp = k_h.unsqueeze(1).expand(-1, N, -1, -1)  # (B, N, N, dk)
+            # Concatenate and run through hypernetwork
+            pair_input = torch.cat([q_exp, k_exp], dim=-1)  # (B, N, N, 2*dk)
+            attn_logits[:, head] = self.hypernet(pair_input).squeeze(-1)  # (B, N, N)
+        # Add ALiBi bias
+        attn_logits = attn_logits + alibi_bias(self.h, N)
+        if mask is not None:
+            attn_logits = attn_logits + mask
+        attn_weights = F.softmax(attn_logits, dim=-1)  # (B, H, N, N)
+        # Apply attention
+        v = v.transpose(1, 2)  # (B, H, N, dk)
+        out = (attn_weights @ v).transpose(1, 2).reshape(B, N, -1)
+        return self.proj(out)
+# ═══════════════════════════════════════════════════════════════
+# ULTRA 6: Differentiable Sorting Attention
+# Sort tokens by relevance, attend in sorted order
+# ═══════════════════════════════════════════════════════════════
+class SortingAttention(nn.Module):
+    """
+    Differentiable sorting: learn to reorder tokens by importance,
+    then apply attention in sorted space.
+    Uses Sinkhorn operator for soft permutation matrices.
+    O(sinkhorn_iters * n² + n²)
+    """
+    def __init__(self, d: int, h: int, sinkhorn_iters: int = 10, temp: float = 0.1):
+        super().__init__()
+        self.h, self.dk = h, d // h
+        self.sinkhorn_iters = sinkhorn_iters
+        self.temp = temp
+        # Scoring network for sorting
+        self.score = nn.Linear(d, 1)
+        # Standard attention
+        self.qkv = nn.Linear(d, 3 * d, bias=False)
+        self.proj = nn.Linear(d, d, bias=False)
+    def _sinkhorn(self, log_alpha, iters):
+        """Sinkhorn normalization for soft permutation"""
+        for _ in range(iters):
+            log_alpha = log_alpha - torch.logsumexp(log_alpha, dim=-1, keepdim=True)
+            log_alpha = log_alpha - torch.logsumexp(log_alpha, dim=-2, keepdim=True)
+        return torch.exp(log_alpha)
+    def forward(self, x, mask=None, **kwargs):
+        B, N, D = x.shape
+        # Compute sorting scores
+        scores = self.score(x).squeeze(-1)  # (B, N)
+        # Create soft permutation matrix via Sinkhorn
+        # log_alpha[i,j] = score[i] (want row i to go to position based on score)
+        log_alpha = scores.unsqueeze(-1) - scores.unsqueeze(-2)  # (B, N, N)
+        log_alpha = log_alpha / self.temp
+        perm = self._sinkhorn(log_alpha, self.sinkhorn_iters)  # (B, N, N)
+        # Apply permutation to get sorted tokens
+        x_sorted = torch.einsum('bnm,bmd->bnd', perm, x)  # (B, N, D)
+        # Standard attention on sorted tokens
+        qkv = self.qkv(x_sorted).reshape(B, N, 3, self.h, self.dk).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        att = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
+        att = att + alibi_bias(self.h, N)
+        if mask is not None:
+            att = att + mask
+        out_sorted = (att.softmax(-1) @ v).transpose(1, 2).reshape(B, N, -1)
+        # Inverse permutation to restore order
+        perm_inv = perm.transpose(-1, -2)
+        out = torch.einsum('bnm,bmd->bnd', perm_inv, out_sorted)
+        return self.proj(out)
+# ═══════════════════════════════════════════════════════════════
+# Block and Model
+# ═══════════════════════════════════════════════════════════════
+class Block(nn.Module):
+    def __init__(self, d: int, h: int, attn_type: str = "standard", **kwargs):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(d)
+        self.ln2 = nn.LayerNorm(d)
+        attn_map = {
+            "standard": StandardAttention,
+            "ntm": NTMAttention,
+            "energy": EnergyAttention,
+            "lattice": LatticeAttention,
+            "nbody": NBodyAttention,
+            "hyper": HyperAttention,
+            "sorting": SortingAttention,
+        }
+        if attn_type not in attn_map:
+            raise ValueError(f"Unknown: {attn_type}")
+        self.attn = attn_map[attn_type](d, h, **kwargs)
+        self.attn_type = attn_type
+        self.ff = nn.Sequential(
+            nn.Linear(d, 4 * d),
+            nn.GELU(),
+            nn.Linear(4 * d, d)
+        )
+    def forward(self, x, mask=None, **kwargs):
+        x = x + self.attn(self.ln1(x), mask, **kwargs)
+        x = x + self.ff(self.ln2(x))
+        return x
+class UltraModel(nn.Module):
+    def __init__(self, d: int, layers: int, h: int, attn_type: str = "standard", **kwargs):
+        super().__init__()
+        self.emb = nn.Embedding(VOCAB, d)
+        self.blocks = nn.ModuleList([Block(d, h, attn_type, **kwargs) for _ in range(layers)])
+        self.ln = nn.LayerNorm(d)
+        self.head = nn.Linear(d, VOCAB, bias=False)
+        self.head.weight = self.emb.weight
+        self.attn_type = attn_type
+    def forward(self, x, mask=None):
+        x = self.emb(x)
+        if self.attn_type == "lattice":
+            all_layers = []
+            for blk in self.blocks:
+                x = blk(x, mask, all_layers=all_layers)
+                all_layers.append(x.detach())
+        else:
+            for blk in self.blocks:
+                x = blk(x, mask)
+        return self.head(self.ln(x))
+    def count_params(self):
+        return sum(p.numel() for p in self.parameters())
+# ═══════════════════════════════════════════════════════════════
+# Experiment Runner
+# ═══════════════════════════════════════════════════════════════
+def run_experiment(attn_type, d, layers, heads, batch, seq, steps, **kwargs):
+    print(f"\n{'='*60}")
+    print(f"ULTRA ATTENTION: {attn_type.upper()}")
+    print(f"{'='*60}")
+    try:
+        model = UltraModel(d, layers, heads, attn_type, **kwargs).to(DEV)
+    except Exception as e:
+        print(f"Failed to create model: {e}")
+        return None
+    print(f"Parameters: {model.count_params():,}")
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+    mask = causal_mask(seq - 1)
+    losses, times = [], []
+    for step in range(steps):
+        ids = torch.randint(0, VOCAB, (batch, seq), device=DEV)
+        target = ids[:, 1:]
+        input_ids = ids[:, :-1]
+        start = time.time()
+        optimizer.zero_grad()
+        try:
+            logits = model(input_ids, mask)
+            loss = F.cross_entropy(logits.view(-1, VOCAB), target.reshape(-1))
+            loss.backward()
+            optimizer.step()
+        except RuntimeError as e:
+            print(f"Step {step} failed: {e}")
+            break
+        elapsed = time.time() - start
+        losses.append(loss.item())
+        times.append(elapsed)
+        tok_s = (batch * seq) / elapsed
+        if step % 10 == 0 or step == steps - 1:
+            print(f"Step {step:3d} | Loss: {loss.item():.4f} | {tok_s:.0f} tok/s | {elapsed*1000:.0f}ms")
+    if not losses:
+        return None
+    avg_loss = sum(losses[-20:]) / min(20, len(losses))
+    avg_time = sum(times[-20:]) / min(20, len(times))
+    avg_toks = (batch * seq) / avg_time
+    return {"type": attn_type, "loss": avg_loss, "tok_s": avg_toks, "params": model.count_params()}
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--d", type=int, default=256)
+    parser.add_argument("--layers", type=int, default=4)
+    parser.add_argument("--heads", type=int, default=8)
+    parser.add_argument("--batch", type=int, default=8)
+    parser.add_argument("--seq", type=int, default=64)  # Shorter for ultra-heavy
+    parser.add_argument("--steps", type=int, default=50)
+    parser.add_argument("--types", type=str, default="all")
+    args = parser.parse_args()
+    print(f"Device: {DEV}")
+    if torch.cuda.is_available():
+        print(f"GPU: {torch.cuda.get_device_name()}")
+    if args.types == "all":
+        types = ["standard", "ntm", "energy", "nbody", "hyper", "sorting"]
+    else:
+        types = [t.strip() for t in args.types.split(",")]
+    results = []
+    for t in types:
+        r = run_experiment(t, args.d, args.layers, args.heads,
+                          args.batch, args.seq, args.steps)
+        if r:
+            results.append(r)
+        torch.cuda.empty_cache()
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print(f"{'='*60}")
+    baseline = next((r for r in results if r['type'] == 'standard'), None)
+    for r in results:
+        rel = ""
+        if baseline and r['type'] != 'standard':
+            loss_diff = (baseline['loss'] - r['loss']) / baseline['loss'] * 100
+            speed_ratio = r['tok_s'] / baseline['tok_s']
+            rel = f" | vs std: {loss_diff:+.1f}% loss, {speed_ratio:.2f}x speed"
+        print(f"{r['type']:12s} | Loss: {r['loss']:.4f} | {r['tok_s']:6.0f} tok/s{rel}")
+if __name__ == "__main__":
+    main()