#!/usr/bin/env python3
"""
n_heavy2.py — Extended Heavy Attention Experiments
Testing mechanisms that use MORE compute than standard attention

Approaches:
1. Multi-Hop: Explicit k-step reasoning chains
2. Slot Attention: Competitive binding (from object-centric learning)
3. Edge-Compute: Full pairwise MLP, not just weighted sum
4. Memory-Aug: External memory bank with read/write
5. Recurrent Depth: Same block applied k times (Universal Transformer)
"""

from __future__ import annotations
import argparse, math, time
import torch
import torch.nn as nn
import torch.nn.functional as F

# Device used for every tensor and model created in this script.
DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Opt in to TF32 matmuls; the flag is harmless on hardware without TF32.
torch.backends.cuda.matmul.allow_tf32 = True
try:
    torch.set_float32_matmul_precision("high")
except AttributeError:
    # Older PyTorch releases do not expose set_float32_matmul_precision;
    # the allow_tf32 flag above is the only knob available there.
    pass

# Tokenizer constants. NOTE(review): values look like a Llama-3-style
# vocabulary — confirm against the tokenizer actually in use.
VOCAB = 128256
EOS = 128001

# ─────────────────────────── ALiBi ───────────────────────────
def _alibi_slopes(n_heads: int):
    """Return the per-head ALiBi slopes as a (1, n_heads, 1, 1) tensor on DEV.

    For a power-of-two head count the slopes form a geometric sequence.
    Otherwise the sequence for the largest power of two below ``n_heads`` is
    used, topped up with every other slope from the sequence of twice that
    size.
    """
    def geometric(count):
        base = 2 ** (-2 ** -(math.log2(count) - 3))
        return [base * (base ** idx) for idx in range(count)]

    exponent = math.log2(n_heads)
    if exponent.is_integer():
        slopes = geometric(n_heads)
    else:
        lower_pow2 = 2 ** math.floor(exponent)
        # Even-indexed entries of the doubled sequence fill the missing heads.
        filler = geometric(2 * lower_pow2)[0::2]
        slopes = geometric(lower_pow2) + filler[:n_heads - lower_pow2]

    return torch.tensor(slopes, device=DEV).view(1, n_heads, 1, 1)

def alibi_bias(n_heads: int, n_tokens: int):
    """Build the additive ALiBi attention bias.

    Returns a float tensor of shape (1, n_heads, n_tokens, n_tokens) where
    entry ``[0, h, i, j]`` is ``-slope_h * max(i - j, 0)``: a key ``j`` that
    lies ``i - j`` steps in the past of query ``i`` is penalised linearly in
    that distance. Keys at or after the query position get zero bias (future
    keys are expected to be removed by the causal mask).

    The distance was previously computed as ``j - i``, which is zero for every
    key a causal model is allowed to see, so the bias had no effect once the
    causal mask was applied.
    """
    slopes = _alibi_slopes(n_heads)
    query_pos = torch.arange(n_tokens, device=DEV).view(1, 1, n_tokens, 1)
    key_pos = torch.arange(n_tokens, device=DEV).view(1, 1, 1, n_tokens)
    # How far the key lies in the past of the query (0 for present/future).
    dist = (query_pos - key_pos).clamp_min(0).float()
    return -slopes * dist

def causal_mask(n):
    """Return an additive (1, 1, n, n) mask on DEV.

    Entries strictly above the main diagonal are ``-inf`` (future positions);
    everything else is zero.
    """
    blocked = torch.full((1, 1, n, n), float("-inf"), device=DEV)
    return torch.triu(blocked, diagonal=1)


# ═══════════════════════════════════════════════════════════════
# BASELINE: Standard Attention
# ═══════════════════════════════════════════════════════════════
class StandardAttention(nn.Module):
    """Baseline multi-head self-attention with an ALiBi bias.

    Args:
        d: model width; must be divisible by ``h``.
        h: number of attention heads.
    """

    def __init__(self, d: int, h: int):
        super().__init__()
        assert d % h == 0
        self.h, self.dk = h, d // h
        # Fused projection producing queries, keys and values in one matmul.
        self.qkv = nn.Linear(d, 3 * d, bias=False)
        self.proj = nn.Linear(d, d, bias=False)

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape (B, N, d); returns a tensor of the same shape.

        ``mask``, when given, is added to the attention logits before softmax.
        """
        batch, n_tok, _ = x.shape
        fused = self.qkv(x).reshape(batch, n_tok, 3, self.h, self.dk)
        # After the permute each of q/k/v is (B, H, N, dk).
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)

        scores = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
        scores = scores + alibi_bias(self.h, n_tok)
        if mask is not None:
            scores = scores + mask

        context = scores.softmax(-1) @ v
        merged = context.transpose(1, 2).reshape(batch, n_tok, -1)
        return self.proj(merged)


# ═══════════════════════════════════════════════════════════════
# HEAVY 1: Multi-Hop Attention
# Each hop re-queries the shared K/V (computed once from the input) using the state updated by the previous hop
# Simulates multi-step reasoning chains
# ═══════════════════════════════════════════════════════════════
class MultiHopAttention(nn.Module):
    """Attention with ``num_hops`` sequential query passes over shared keys/values.

    Keys and values are projected once from the input. Each hop has its own
    query projection, applied to a running state that starts as the input and
    accumulates every hop's attended output. The per-hop outputs (not the
    running state) are concatenated, mixed down by ``hop_gate`` and projected.

    Cost is O(num_hops * N^2).
    """

    def __init__(self, d: int, h: int, num_hops: int = 3):
        super().__init__()
        self.h, self.dk = h, d // h
        self.num_hops = num_hops

        # One query projection per hop; keys/values are shared by all hops.
        self.q_projs = nn.ModuleList([nn.Linear(d, d, bias=False) for _ in range(num_hops)])
        self.kv = nn.Linear(d, 2 * d, bias=False)
        self.proj = nn.Linear(d, d, bias=False)

        # Linear mix of the concatenated hop outputs back down to width d.
        self.hop_gate = nn.Linear(d * num_hops, d)

    def forward(self, x, mask=None):
        """Run all hops over ``x`` (B, N, d) and return a (B, N, d) tensor.

        ``mask``, when given, is added to the attention logits of every hop.
        """
        batch, n_tok, _ = x.shape

        shared = self.kv(x).reshape(batch, n_tok, 2, self.h, self.dk).permute(2, 0, 3, 1, 4)
        keys_t = shared[0].transpose(-1, -2)   # (B, H, dk, N)
        values = shared[1]                     # (B, H, N, dk)
        positional = alibi_bias(self.h, n_tok)

        running = x
        per_hop = []
        for hop_idx in range(self.num_hops):
            q = self.q_projs[hop_idx](running)
            q = q.reshape(batch, n_tok, self.h, self.dk).transpose(1, 2)

            scores = (q @ keys_t) / math.sqrt(self.dk)
            scores = scores + positional
            if mask is not None:
                scores = scores + mask

            attended = scores.softmax(-1) @ values
            attended = attended.transpose(1, 2).reshape(batch, n_tok, -1)
            per_hop.append(attended)

            # The next hop's queries see everything gathered so far.
            running = running + attended

        stacked = torch.cat(per_hop, dim=-1)
        return self.proj(self.hop_gate(stacked))


# ═══════════════════════════════════════════════════════════════
# HEAVY 2: Slot Attention
# From "Object-Centric Learning with Slot Attention"
# Slots compete to bind to input positions
# ═══════════════════════════════════════════════════════════════
class SlotAttention(nn.Module):
    """Slot-based attention: ``num_slots`` slots compete for the input positions.

    Slots are sampled around learned means, refined for ``num_iters`` rounds
    (attention read -> GRU update -> residual MLP), and finally read back by
    every position to produce a sequence-shaped output.

    Cost is O(num_iters * num_slots * N).

    Notes:
        * The ``mask`` argument of ``forward`` is accepted but never used, so
          every slot sees every position.
        * Slot initialisation draws fresh noise on every call, so outputs are
          stochastic unless the RNG is seeded.
        * Slot updates are a plain weighted sum over positions; the weights are
          not re-normalised along the position axis.
    """

    def __init__(self, d: int, num_slots: int = 8, num_iters: int = 3):
        super().__init__()
        self.num_slots = num_slots
        self.num_iters = num_iters
        self.d = d

        # Learned mean and scale of the slot initialisation distribution.
        self.slots_mu = nn.Parameter(torch.randn(1, num_slots, d) * 0.02)
        self.slots_sigma = nn.Parameter(torch.ones(1, num_slots, d) * 0.02)

        # Projections shared by the slot-refinement and read-back stages.
        self.to_q = nn.Linear(d, d, bias=False)
        self.to_k = nn.Linear(d, d, bias=False)
        self.to_v = nn.Linear(d, d, bias=False)

        # Recurrent slot update followed by a residual MLP.
        self.gru = nn.GRUCell(d, d)
        self.mlp = nn.Sequential(
            nn.Linear(d, d * 2),
            nn.ReLU(),
            nn.Linear(d * 2, d)
        )
        self.ln1 = nn.LayerNorm(d)
        self.ln2 = nn.LayerNorm(d)

        # Final projection applied to the per-position slot read-out.
        self.slot_to_seq = nn.Linear(d, d)

    def forward(self, x, mask=None):
        """Map ``x`` of shape (B, N, d) to a (B, N, d) tensor via the slots."""
        batch, _, dim = x.shape
        n_slots = self.num_slots
        scale = math.sqrt(dim)

        noise = torch.randn(batch, n_slots, dim, device=x.device)
        slots = self.slots_mu + self.slots_sigma * noise

        # Inputs are projected once; only the slot queries change per round.
        keys = self.to_k(x)      # (B, N, D)
        values = self.to_v(x)    # (B, N, D)

        for _ in range(self.num_iters):
            carried = slots
            queries = self.to_q(self.ln1(slots))  # (B, K, D)

            logits = torch.einsum('bkd,bnd->bkn', queries, keys) / scale
            # Normalise across slots so they compete for each position.
            assignment = F.softmax(logits, dim=1)
            updates = torch.einsum('bkn,bnd->bkd', assignment, values)

            # GRUCell works on 2-D input, so fold the slot axis into the batch.
            flat_new = self.gru(
                updates.reshape(batch * n_slots, dim),
                carried.reshape(batch * n_slots, dim)
            )
            slots = flat_new.reshape(batch, n_slots, dim)
            slots = slots + self.mlp(self.ln2(slots))

        # Read-back: each position attends over the refined slots.
        pos_queries = self.to_q(x)       # (B, N, D)
        slot_keys = self.to_k(slots)     # (B, K, D)
        read_logits = torch.einsum('bnd,bkd->bnk', pos_queries, slot_keys) / scale
        read_weights = F.softmax(read_logits, dim=-1)  # (B, N, K)

        gathered = torch.einsum('bnk,bkd->bnd', read_weights, slots)
        return self.slot_to_seq(gathered)


# ═══════════════════════════════════════════════════════════════
# HEAVY 3: Edge-Compute Attention
# Instead of weighted sum, compute MLP on each (query, key) pair
# ═══════════════════════════════════════════════════════════════
class EdgeComputeAttention(nn.Module):
    """Attention whose messages are computed by an MLP on every (query, key) pair.

    For each head and each pair (i, j):
      * ``edge_mlp([q_i; k_j; v_j])`` produces the message from j to i,
      * ``score_mlp([q_i; k_j])`` produces the scalar logit for that pair.
    Messages are combined with a softmax over j. Both MLPs are shared by all
    heads. Cost is O(N^2 * mlp_cost), so inputs longer than ``max_seq`` are
    routed to an ordinary dot-product attention path instead.

    Note: the pairwise path applies no ALiBi bias, whereas the fallback path
    does.
    """

    def __init__(self, d: int, h: int, max_seq: int = 128):
        super().__init__()
        self.h, self.dk = h, d // h
        self.max_seq = max_seq

        self.qkv = nn.Linear(d, 3 * d, bias=False)

        # Message function over the concatenated (q_i, k_j, v_j) triple.
        self.edge_mlp = nn.Sequential(
            nn.Linear(3 * self.dk, 2 * self.dk),
            nn.ReLU(),
            nn.Linear(2 * self.dk, self.dk)
        )

        # Scalar compatibility score over the concatenated (q_i, k_j) pair.
        self.score_mlp = nn.Sequential(
            nn.Linear(2 * self.dk, self.dk),
            nn.ReLU(),
            nn.Linear(self.dk, 1)
        )

        self.proj = nn.Linear(d, d, bias=False)

    def forward(self, x, mask=None):
        """Map ``x`` of shape (B, N, d) to a (B, N, d) tensor.

        ``mask`` is expected to have a singleton dimension at index 1 (it is
        squeezed there) and is added to the pairwise logits.
        """
        batch, n_tok, _ = x.shape

        # Too long for the quadratic-memory pairwise path.
        if n_tok > self.max_seq:
            return self._standard_forward(x, mask)

        fused = self.qkv(x).reshape(batch, n_tok, 3, self.h, self.dk)
        q_all, k_all, v_all = fused.unbind(2)  # each (B, N, H, dk)
        pair_mask = None if mask is None else mask.squeeze(1)

        per_head = []
        for q_h, k_h, v_h in zip(q_all.unbind(2), k_all.unbind(2), v_all.unbind(2)):
            # Broadcast to an (B, N_query, N_key, dk) grid without copying.
            q_grid = q_h.unsqueeze(2).expand(-1, -1, n_tok, -1)
            k_grid = k_h.unsqueeze(1).expand(-1, n_tok, -1, -1)
            v_grid = v_h.unsqueeze(1).expand(-1, n_tok, -1, -1)

            messages = self.edge_mlp(torch.cat([q_grid, k_grid, v_grid], dim=-1))  # (B, N, N, dk)
            scores = self.score_mlp(torch.cat([q_grid, k_grid], dim=-1)).squeeze(-1)  # (B, N, N)

            if pair_mask is not None:
                scores = scores + pair_mask

            weights = F.softmax(scores, dim=-1)
            per_head.append((weights.unsqueeze(-1) * messages).sum(dim=2))  # (B, N, dk)

        return self.proj(torch.cat(per_head, dim=-1))

    def _standard_forward(self, x, mask=None):
        """Dot-product attention with ALiBi, used when the input exceeds ``max_seq``."""
        batch, n_tok, _ = x.shape
        fused = self.qkv(x).reshape(batch, n_tok, 3, self.h, self.dk)
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)

        scores = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
        scores = scores + alibi_bias(self.h, n_tok)
        if mask is not None:
            scores = scores + mask

        context = (scores.softmax(-1) @ v).transpose(1, 2).reshape(batch, n_tok, -1)
        return self.proj(context)


# ═══════════════════════════════════════════════════════════════
# HEAVY 4: Memory-Augmented Attention
# External learned memory bank, read via attention (no write-back is implemented)
# ═══════════════════════════════════════════════════════════════
class MemoryAugmentedAttention(nn.Module):
    """Self-attention combined with a read from a learned memory bank.

    Each forward pass:
      1. every position attends (single head, unmasked) over ``mem_size``
         learned memory rows and reads a vector from them;
      2. ordinary multi-head self-attention with ALiBi runs over the sequence;
      3. the two results are concatenated, mixed by ``combine`` and projected.

    The memory is a plain ``nn.Parameter``: it changes only through gradient
    updates. ``write_gate`` is constructed but never called in ``forward``, so
    no write-back to memory takes place.

    Cost is O(N^2 + N * mem_size).
    """

    def __init__(self, d: int, h: int, mem_size: int = 64):
        super().__init__()
        self.h, self.dk = h, d // h
        self.mem_size = mem_size

        # Learned memory rows, shared by every sequence in the batch.
        self.memory = nn.Parameter(torch.randn(1, mem_size, d) * 0.02)

        # Sequence self-attention projections.
        self.qkv = nn.Linear(d, 3 * d, bias=False)
        self.proj = nn.Linear(d, d, bias=False)

        # Projections for the memory read.
        self.mem_q = nn.Linear(d, d, bias=False)
        self.mem_k = nn.Linear(d, d, bias=False)
        self.mem_v = nn.Linear(d, d, bias=False)

        # Currently unused by forward(); kept so parameter names and counts
        # stay unchanged.
        self.write_gate = nn.Sequential(
            nn.Linear(d * 2, d),
            nn.Sigmoid()
        )

        # Mixes [self-attention output ; memory read] back down to width d.
        self.combine = nn.Linear(d * 2, d)

    def forward(self, x, mask=None):
        """Map ``x`` of shape (B, N, d) to a (B, N, d) tensor.

        ``mask`` is added to the self-attention logits only; the memory read
        is not masked.
        """
        batch, n_tok, dim = x.shape

        # --- memory read -------------------------------------------------
        bank = self.memory.expand(batch, -1, -1)  # (B, mem_size, D)
        read_q = self.mem_q(x)
        bank_k = self.mem_k(bank)
        bank_v = self.mem_v(bank)

        read_logits = torch.einsum('bnd,bmd->bnm', read_q, bank_k) / math.sqrt(dim)
        read_weights = F.softmax(read_logits, dim=-1)
        mem_read = torch.einsum('bnm,bmd->bnd', read_weights, bank_v)  # (B, N, D)

        # --- sequence self-attention ---------------------------------------
        fused = self.qkv(x).reshape(batch, n_tok, 3, self.h, self.dk)
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)

        scores = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
        scores = scores + alibi_bias(self.h, n_tok)
        if mask is not None:
            scores = scores + mask
        seq_read = (scores.softmax(-1) @ v).transpose(1, 2).reshape(batch, n_tok, -1)

        # --- fuse ----------------------------------------------------------
        fused_read = self.combine(torch.cat([seq_read, mem_read], dim=-1))
        return self.proj(fused_read)


# ═══════════════════════════════════════════════════════════════
# HEAVY 5: Recurrent Depth (Universal Transformer)
# Same block applied k times with position-in-depth encoding
# ═══════════════════════════════════════════════════════════════
class RecurrentDepthAttention(nn.Module):
    """One attention + transition step applied ``num_recur`` times with shared weights.

    Each recurrence adds a learned depth embedding to the running state before
    computing attention, so the shared weights can tell the iterations apart.
    The running state is then updated with two residual steps: the attention
    output and a LayerNorm -> MLP transition.

    Cost is O(num_recur * N^2).

    Note: ``forward`` returns the updated state, which already contains the
    input via the internal residuals.
    """

    def __init__(self, d: int, h: int, num_recur: int = 4):
        super().__init__()
        self.h, self.dk = h, d // h
        self.num_recur = num_recur

        self.qkv = nn.Linear(d, 3 * d, bias=False)
        self.proj = nn.Linear(d, d, bias=False)

        # One learned vector per recurrence step.
        self.depth_emb = nn.Embedding(num_recur, d)

        # Position-wise transition applied after attention in every step.
        self.transition = nn.Sequential(
            nn.LayerNorm(d),
            nn.Linear(d, d * 2),
            nn.GELU(),
            nn.Linear(d * 2, d)
        )

    def forward(self, x, mask=None):
        """Refine ``x`` of shape (B, N, d) for ``num_recur`` steps; returns (B, N, d).

        ``mask``, when given, is added to the attention logits of every step.
        """
        B, N, _ = x.shape
        # Positional bias and scale are the same for every recurrence.
        bias = alibi_bias(self.h, N)
        scale = math.sqrt(self.dk)

        for r in range(self.num_recur):
            # (d,) broadcasts over batch and sequence; the depth embedding
            # only influences the attention input, not the residual stream.
            x_r = x + self.depth_emb.weight[r]

            qkv = self.qkv(x_r).reshape(B, N, 3, self.h, self.dk).permute(2, 0, 3, 1, 4)
            q, k, v = qkv[0], qkv[1], qkv[2]

            att = (q @ k.transpose(-1, -2)) / scale
            att = att + bias
            if mask is not None:
                att = att + mask

            attn_out = (att.softmax(-1) @ v).transpose(1, 2).reshape(B, N, -1)

            # Residual attention update followed by residual transition.
            x = x + self.proj(attn_out)
            x = x + self.transition(x)

        # The former `x - x.detach() + x.detach()` was an identity in both
        # value and gradient (and mapped +/-inf to NaN), so x is returned as is.
        return x


# ═══════════════════════════════════════════════════════════════
# Block and Model wrappers
# ═══════════════════════════════════════════════════════════════
class Block(nn.Module):
    """Pre-LayerNorm transformer block with a selectable attention module.

    Args:
        d: model width.
        h: number of heads (not used by the ``"slot"`` variant).
        attn_type: one of ``"standard"``, ``"multihop"``, ``"slot"``,
            ``"edge"``, ``"memory"``, ``"recurrent"``.
        **kwargs: optional per-variant settings; unknown keys are ignored.
            ``num_hops`` (multihop, default 3), ``num_slots`` / ``num_iters``
            (slot, defaults 8 / 3), ``max_seq`` (edge, default 128),
            ``mem_size`` (memory, default 64), ``num_recur`` (recurrent,
            default 4).

    Raises:
        ValueError: if ``attn_type`` is not one of the names above.
    """

    def __init__(self, d: int, h: int, attn_type: str = "standard", **kwargs):
        super().__init__()
        self.ln1 = nn.LayerNorm(d)
        self.ln2 = nn.LayerNorm(d)

        if attn_type == "standard":
            self.attn = StandardAttention(d, h)
        elif attn_type == "multihop":
            self.attn = MultiHopAttention(d, h, num_hops=kwargs.get('num_hops', 3))
        elif attn_type == "slot":
            # num_iters is now forwarded too; the default matches SlotAttention's.
            self.attn = SlotAttention(
                d,
                num_slots=kwargs.get('num_slots', 8),
                num_iters=kwargs.get('num_iters', 3),
            )
        elif attn_type == "edge":
            # max_seq is now forwarded too; the default matches EdgeComputeAttention's.
            self.attn = EdgeComputeAttention(d, h, max_seq=kwargs.get('max_seq', 128))
        elif attn_type == "memory":
            self.attn = MemoryAugmentedAttention(d, h, mem_size=kwargs.get('mem_size', 64))
        elif attn_type == "recurrent":
            self.attn = RecurrentDepthAttention(d, h, num_recur=kwargs.get('num_recur', 4))
        else:
            raise ValueError(f"Unknown attn_type: {attn_type}")

        # Position-wise feed-forward network with 4x expansion.
        self.ff = nn.Sequential(
            nn.Linear(d, 4 * d),
            nn.GELU(),
            nn.Linear(4 * d, d)
        )

    def forward(self, x, mask=None):
        """Apply attention and feed-forward sublayers, each with a residual connection."""
        x = x + self.attn(self.ln1(x), mask)
        x = x + self.ff(self.ln2(x))
        return x


class HeavyModel(nn.Module):
    """Token-level language model: embedding -> ``layers`` Blocks -> LayerNorm -> logits.

    The output projection shares its weight matrix with the input embedding.
    Extra keyword arguments are passed through to every ``Block``.
    """

    def __init__(self, d: int, layers: int, h: int, attn_type: str = "standard", **kwargs):
        super().__init__()
        self.emb = nn.Embedding(VOCAB, d)
        self.blocks = nn.ModuleList([Block(d, h, attn_type, **kwargs) for _ in range(layers)])
        self.ln = nn.LayerNorm(d)
        self.head = nn.Linear(d, VOCAB, bias=False)
        # Weight tying: the head reuses the embedding matrix.
        self.head.weight = self.emb.weight

    def forward(self, x, mask=None):
        """Return logits over the vocabulary for token ids ``x``; ``mask`` goes to every block."""
        hidden = self.emb(x)
        for block in self.blocks:
            hidden = block(hidden, mask)
        normed = self.ln(hidden)
        return self.head(normed)

    def count_params(self):
        """Total number of elements across ``self.parameters()``."""
        total = 0
        for param in self.parameters():
            total += param.numel()
        return total


# ═══════════════════════════════════════════════════════════════
# Experiment Runner
# ═══════════════════════════════════════════════════════════════
def run_experiment(attn_type: str, d: int, layers: int, heads: int,
                   batch: int, seq: int, steps: int, **kwargs):
    """Train a fresh ``HeavyModel`` on random tokens and report loss and throughput.

    Every step draws a new batch of uniformly random token ids, trains
    next-token prediction with AdamW (lr 1e-4) under a causal mask, and times
    the forward + backward + optimizer step.

    Args:
        attn_type: attention variant name understood by ``Block``.
        d, layers, heads: model width, depth and head count.
        batch, seq: batch size and raw sequence length (the model sees
            ``seq - 1`` positions after the input/target shift).
        steps: number of optimisation steps; must be at least 1.
        **kwargs: forwarded to ``HeavyModel``.

    Returns:
        dict with keys ``"type"``, ``"loss"`` and ``"tok_s"`` (both averaged
        over the last up-to-20 steps) and ``"params"``.

    Raises:
        ValueError: if ``steps < 1`` or ``seq < 2``.
    """
    # Fail before building the model: steps == 0 would otherwise end in a
    # ZeroDivisionError, and seq < 2 leaves no target tokens.
    if steps < 1:
        raise ValueError(f"steps must be >= 1, got {steps}")
    if seq < 2:
        raise ValueError(f"seq must be >= 2, got {seq}")

    print(f"\n{'='*60}")
    print(f"ATTENTION TYPE: {attn_type.upper()}")
    print(f"Config: d={d}, layers={layers}, heads={heads}")
    print(f"{'='*60}")

    model = HeavyModel(d, layers, heads, attn_type, **kwargs).to(DEV)
    n_params = model.count_params()
    print(f"Parameters: {n_params:,}")

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    mask = causal_mask(seq - 1)
    on_cuda = DEV.type == "cuda"

    losses, times = [], []

    for step in range(steps):
        ids = torch.randint(0, VOCAB, (batch, seq), device=DEV)
        target = ids[:, 1:]
        input_ids = ids[:, :-1]

        # CUDA kernels run asynchronously: synchronise on both sides of the
        # timed region so the clock measures execution, not kernel launch.
        if on_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()
        optimizer.zero_grad()
        logits = model(input_ids, mask)
        loss = F.cross_entropy(logits.view(-1, VOCAB), target.reshape(-1))
        loss.backward()
        optimizer.step()
        if on_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

        # Read the scalar once; each .item() is a device-to-host transfer.
        loss_val = loss.item()
        losses.append(loss_val)
        times.append(elapsed)
        tok_s = (batch * seq) / elapsed

        if step % 10 == 0 or step == steps - 1:
            print(f"Step {step:3d} | Loss: {loss_val:.4f} | {tok_s:.0f} tok/s | {elapsed*1000:.0f}ms")

    # Average over the tail of the run (all steps if fewer than 20 were run).
    window = min(20, len(losses))
    avg_loss = sum(losses[-20:]) / window
    avg_time = sum(times[-20:]) / window
    avg_toks = (batch * seq) / avg_time

    return {
        "type": attn_type,
        "loss": avg_loss,
        "tok_s": avg_toks,
        "params": n_params
    }


def main():
    """Parse CLI options, run each requested attention variant, and print a summary.

    A failure in one variant is reported with its traceback and does not stop
    the remaining variants. When the ``standard`` variant ran successfully, the
    other rows of the summary also show their loss and speed relative to it.
    """
    cli = argparse.ArgumentParser()
    int_options = (
        ("--d", 256),
        ("--layers", 4),
        ("--heads", 8),
        ("--batch", 16),
        ("--seq", 128),
        ("--steps", 100),
    )
    for flag, default in int_options:
        cli.add_argument(flag, type=int, default=default)
    cli.add_argument("--types", type=str, default="all",
                     help="Comma-separated: standard,multihop,slot,edge,memory,recurrent")
    args = cli.parse_args()

    print(f"Device: {DEV}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    if args.types != "all":
        types = [name.strip() for name in args.types.split(",")]
    else:
        types = ["standard", "multihop", "slot", "edge", "memory", "recurrent"]

    results = []
    for variant in types:
        try:
            outcome = run_experiment(variant, args.d, args.layers, args.heads,
                                     args.batch, args.seq, args.steps)
        except Exception as e:
            # Keep going so one broken variant does not hide the others.
            print(f"ERROR in {variant}: {e}")
            import traceback
            traceback.print_exc()
        else:
            results.append(outcome)

    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")

    baseline = None
    for candidate in results:
        if candidate['type'] == 'standard':
            baseline = candidate
            break

    for r in results:
        if baseline and r['type'] != 'standard':
            # Positive loss_diff means this variant's loss is lower than the baseline's.
            loss_diff = (baseline['loss'] - r['loss']) / baseline['loss'] * 100
            speed_ratio = r['tok_s'] / baseline['tok_s']
            rel = f" | vs baseline: {loss_diff:+.1f}% loss, {speed_ratio:.2f}x speed"
        else:
            rel = ""
        print(f"{r['type']:12s} | Loss: {r['loss']:.4f} | {r['tok_s']:6.0f} tok/s | {r['params']:,} params{rel}")


# Run the experiments only when executed as a script, not when imported.
if __name__ == "__main__":
    main()