#!/usr/bin/env python3
"""
K-Simplex Language Model - Inference Script

Loads a trained k-simplex LLM checkpoint and generates text using
geometrically-validated autoregressive sampling.

Usage:
    python inference.py --checkpoint checkpoint_epoch_008.pt --prompt "ROMEO: "
    python inference.py --repo AbstractPhil/ksimplex-llm-prototype --prompt "To be or not"
"""

import argparse
import json
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
from pathlib import Path
from huggingface_hub import hf_hub_download


# =============================================================================
# GEOMETRIC CORE
# =============================================================================

def factorial(n: int) -> int:
    return math.factorial(n)


def cayley_menger_volume_squared(vertices: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute squared volume via Cayley-Menger determinant.
    
    Args:
        vertices: [*, nv, edim] vertex coordinates
        
    Returns:
        d2: [*, n_pairs] squared distances
        vol2: [*] squared volume
    """
    nv = vertices.shape[-2]
    k = nv - 1  # simplex dimension
    
    # Pairwise squared distances
    diff = vertices.unsqueeze(-2) - vertices.unsqueeze(-3)  # [*, nv, nv, edim]
    d2_matrix = (diff ** 2).sum(-1)  # [*, nv, nv]
    
    # Extract upper triangle (pairs)
    idx = torch.triu_indices(nv, nv, offset=1)
    d2 = d2_matrix[..., idx[0], idx[1]]  # [*, n_pairs]
    
    # Build Cayley-Menger matrix
    batch_shape = vertices.shape[:-2]
    size = nv + 1
    cm = torch.zeros(*batch_shape, size, size, device=vertices.device, dtype=vertices.dtype)
    
    # First row/col: [0, 1, 1, ..., 1]
    cm[..., 0, 1:] = 1.0
    cm[..., 1:, 0] = 1.0
    
    # Fill distance submatrix
    cm[..., 1:, 1:] = d2_matrix
    
    # Diagonal of distance submatrix is 0 (already set)
    
    # Determinant
    det = torch.linalg.det(cm)
    
    # Volume formula: Vol² = (-1)^(k+1) * det(CM) / (2^k * (k!)²)
    sign = (-1) ** (k + 1)
    denom = (2 ** k) * (factorial(k) ** 2)
    vol2 = sign * det / denom
    
    return d2, vol2


# =============================================================================
# MODEL COMPONENTS
# =============================================================================

class SimplexTemplate(nn.Module):
    """Generates regular simplex template vertices."""
    
    def __init__(self, k: int, edim: int, scale: float = 1.0):
        super().__init__()
        self.k = k
        self.nv = k + 1
        self.edim = edim
        
        # Regular simplex vertices (equilateral)
        vertices = torch.zeros(self.nv, edim)
        for i in range(self.nv):
            angle = 2 * math.pi * i / self.nv
            vertices[i, 0] = scale * math.cos(angle)
            if edim > 1:
                vertices[i, 1] = scale * math.sin(angle)
            if edim > 2:
                vertices[i, 2] = scale * 0.3 * math.cos(angle * 2)
            for d in range(3, edim):
                vertices[i, d] = scale * 0.1 * math.sin(angle * (d + 1))
        
        self.register_buffer('template', vertices)
    
    def forward(self) -> torch.Tensor:
        return self.template


class KSimplexChannel(nn.Module):
    """Single k-simplex channel with geometric validation."""
    
    def __init__(self, k: int, edim: int, hidden: int, feat_dim: int, base_deform: float = 0.05):
        super().__init__()
        self.k = k
        self.nv = k + 1
        self.edim = edim
        self.feat_dim = feat_dim
        self.base_deform = base_deform
        
        # Template
        self.template = SimplexTemplate(k, edim)
        
        # Projections
        self._to_coords = nn.Linear(hidden, self.nv * edim)
        self._to_feats = nn.Linear(hidden, self.nv * feat_dim)
        
        # Geometry dimension: n_pairs + 1 (vol²)
        n_pairs = (self.nv * (self.nv - 1)) // 2
        self.geo_dim = n_pairs + 1
        
        # Geometric gate
        self._geo_gate = nn.Sequential(
            nn.Linear(self.geo_dim, feat_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            x: [*, hidden]
            
        Returns:
            out: [*, feat_dim + geo_dim] gated features + geometry
            vol2: [*] squared volume for validity loss
            mean_d2: [*] mean squared distance
        """
        # Vertex coordinates
        coords = self._to_coords(x).unflatten(-1, (self.nv, self.edim))
        verts = self.template() + self.base_deform * coords
        
        # Vertex features
        vert_feats = self._to_feats(x).unflatten(-1, (self.nv, self.feat_dim))
        
        # Cayley-Menger
        d2, vol2 = cayley_menger_volume_squared(verts)
        
        # Geometry vector
        geo = torch.cat([d2, vol2.unsqueeze(-1)], dim=-1)
        
        # Gate features by geometry
        gate = self._geo_gate(geo)
        validity = torch.sigmoid(vol2 * 1e6).unsqueeze(-1)
        
        # Aggregate vertex features
        feat_agg = vert_feats.mean(dim=-2) * gate * validity
        
        # Output
        out = torch.cat([feat_agg, geo], dim=-1)
        
        return out, vol2, d2.mean(dim=-1)


class TokenToKChannels(nn.Module):
    """Project token embeddings to k-simplex channels."""
    
    def __init__(self, embed_dim: int, hidden: int, depth: int, edim: int, feat_dim: int):
        super().__init__()
        self.depth = depth
        
        self._proj = nn.Linear(embed_dim, hidden)
        self._channels = nn.ModuleList([
            KSimplexChannel(k=k+1, edim=edim, hidden=hidden, feat_dim=feat_dim)
            for k in range(depth)
        ])
        
        # Compute output dimension (max across k-levels, then pad)
        self.out_dims = [ch.feat_dim + ch.geo_dim for ch in self._channels]
        self.max_dim = max(self.out_dims)
        
        # Padding projections to equalize dimensions
        self._pads = nn.ModuleList([
            nn.Linear(d, self.max_dim) if d != self.max_dim else nn.Identity()
            for d in self.out_dims
        ])
    
    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, list[torch.Tensor], list[torch.Tensor]]:
        """
        Args:
            x: [B, T, embed_dim]
            
        Returns:
            out: [B, T, K, max_dim]
            vol2_list: list of [B, T] per k
            d2_list: list of [B, T] per k
        """
        h = self._proj(x)  # [B, T, hidden]
        
        outputs = []
        vol2_list = []
        d2_list = []
        
        for ch, pad in zip(self._channels, self._pads):
            out, vol2, d2 = ch(h)
            outputs.append(pad(out))
            vol2_list.append(vol2)
            d2_list.append(d2)
        
        # Stack: [B, T, K, max_dim]
        out = torch.stack(outputs, dim=-2)
        
        return out, vol2_list, d2_list


class KChannelCrossAttention(nn.Module):
    """Cross-attention between k-levels at each position."""
    
    def __init__(self, dim: int, num_heads: int = 4, dropout: float = 0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, dropout=dropout, batch_first=True)
        self.norm = nn.LayerNorm(dim)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [B, T, K, D]
        Returns:
            [B, T, K, D]
        """
        B, T, K, D = x.shape
        
        # Reshape to [B*T, K, D] - attention across K dimension
        x_flat = x.view(B * T, K, D)
        
        # Self-attention across k-levels
        attn_out, _ = self.attn(x_flat, x_flat, x_flat)
        
        # Residual + norm
        out = self.norm(x_flat + attn_out)
        
        return out.view(B, T, K, D)


class CausalSequenceAttention(nn.Module):
    """Causal attention across sequence positions."""
    
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, dropout: float = 0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, dropout=dropout, batch_first=True)
        self.norm = nn.LayerNorm(dim)
        
        # Causal mask
        mask = torch.tril(torch.ones(max_seq_len, max_seq_len)).bool()
        self.register_buffer('_causal_mask', mask)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [B, T, K, D]
        Returns:
            [B, T, K, D]
        """
        B, T, K, D = x.shape
        
        # Flatten K into D: [B, T, K*D]
        x_flat = x.view(B, T, K * D)
        
        # Causal mask
        mask = self._causal_mask[:T, :T]
        attn_mask = ~mask  # True = masked
        
        # Self-attention across sequence
        attn_out, _ = self.attn(
            x_flat, x_flat, x_flat,
            attn_mask=attn_mask.float().masked_fill(attn_mask, float('-inf'))
        )
        
        # Residual + norm
        out = self.norm(x_flat + attn_out)
        
        return out.view(B, T, K, D)


class GeoBlock(nn.Module):
    """Geometric block: k-channel attention + causal sequence attention + MLP."""
    
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, depth: int, dropout: float = 0.1):
        super().__init__()
        self.k_attn = KChannelCrossAttention(dim, num_heads=4, dropout=dropout)
        self.seq_attn = CausalSequenceAttention(dim, num_heads, max_seq_len, dropout)
        
        self.mlp = nn.Sequential(
            nn.Linear(dim * depth, dim * depth * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim * depth * 4, dim * depth),
            nn.Dropout(dropout),
        )
        self.mlp_norm = nn.LayerNorm(dim * depth)
        self.depth = depth
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [B, T, K, D]
        Returns:
            [B, T, K, D]
        """
        # K-channel attention
        x = self.k_attn(x)
        
        # Sequence attention
        x = self.seq_attn(x)
        
        # MLP on flattened k-channels
        B, T, K, D = x.shape
        x_flat = x.view(B, T, K * D)
        x_flat = self.mlp_norm(x_flat + self.mlp(x_flat))
        
        return x_flat.view(B, T, K, D)


class KSimplexLM(nn.Module):
    """K-Simplex Language Model."""
    
    def __init__(
        self,
        vocab_size: int = 50257,
        max_seq_len: int = 256,
        embed_dim: int = 384,
        depth: int = 4,
        edim: int = 16,
        feat_dim: int = 96,
        hidden: int = 384,
        num_heads: int = 8,
        num_blocks: int = 8,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.depth = depth
        
        # Token embedding
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_seq_len, embed_dim)
        self.embed_drop = nn.Dropout(dropout)
        
        # Token to k-channels
        self.to_k_channels = TokenToKChannels(embed_dim, hidden, depth, edim, feat_dim)
        
        # Geometric blocks
        k_dim = self.to_k_channels.max_dim
        self.blocks = nn.ModuleList([
            GeoBlock(k_dim, num_heads, max_seq_len, depth, dropout)
            for _ in range(num_blocks)
        ])
        
        # LM head
        self.ln_f = nn.LayerNorm(k_dim * depth)
        self.lm_head = nn.Linear(k_dim * depth, vocab_size, bias=False)
        
        # Weight tying
        # self.lm_head.weight = self.embed.weight  # Optional
        
        self._init_weights()
    
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=0.02)
    
    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, dict]:
        """
        Args:
            x: [B, T] token indices
            
        Returns:
            logits: [B, T, vocab_size]
            geo_info: dict with vol2, d2 per k-level
        """
        B, T = x.shape
        
        # Embeddings
        pos = torch.arange(T, device=x.device).unsqueeze(0)
        h = self.embed(x) + self.pos_embed(pos)
        h = self.embed_drop(h)
        
        # To k-channels
        h, vol2_list, d2_list = self.to_k_channels(h)
        
        # Geo blocks
        for block in self.blocks:
            h = block(h)
        
        # LM head
        h_flat = h.view(B, T, -1)
        h_flat = self.ln_f(h_flat)
        logits = self.lm_head(h_flat)
        
        geo_info = {
            'vol2': vol2_list,
            'd2': d2_list,
        }
        
        return logits, geo_info


# =============================================================================
# INFERENCE UTILITIES
# =============================================================================

def load_model(
    checkpoint_path: str = None,
    repo_id: str = None,
    device: str = None,
) -> tuple[KSimplexLM, tiktoken.Encoding]:
    """
    Load model from checkpoint or HuggingFace Hub.
    
    Args:
        checkpoint_path: Local path to checkpoint
        repo_id: HuggingFace repo ID (e.g., "AbstractPhil/ksimplex-llm-prototype")
        device: Device to load to
        
    Returns:
        model: KSimplexLM
        tokenizer: tiktoken encoding
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # Load checkpoint
    if repo_id:
        checkpoint_path = hf_hub_download(repo_id, "checkpoint_latest.pt")
        config_path = hf_hub_download(repo_id, "config.json")
        with open(config_path) as f:
            config = json.load(f)
    elif checkpoint_path:
        checkpoint = torch.load(checkpoint_path, map_location=device)
        config = checkpoint.get('config', {}).get('model', {})
    else:
        raise ValueError("Must provide checkpoint_path or repo_id")
    
    # Build model
    model = KSimplexLM(
        vocab_size=config.get('vocab_size', 50257),
        max_seq_len=config.get('max_seq_len', 256),
        embed_dim=config.get('embed_dim', 384),
        depth=config.get('depth', 4),
        edim=config.get('edim', 16),
        feat_dim=config.get('feat_dim', 96),
        hidden=config.get('hidden', 384),
        num_heads=config.get('num_heads', 8),
        num_blocks=config.get('num_blocks', 8),
        dropout=0.0,  # No dropout at inference
    )
    
    # Load weights
    if repo_id:
        checkpoint = torch.load(checkpoint_path, map_location=device)
    state_dict = checkpoint.get('model_state_dict', checkpoint)
    model.load_state_dict(state_dict)
    
    model.to(device)
    model.eval()
    
    # Tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")
    
    return model, tokenizer


@torch.no_grad()
def generate(
    model: KSimplexLM,
    tokenizer: tiktoken.Encoding,
    prompt: str,
    max_tokens: int = 100,
    temperature: float = 0.8,
    top_k: int = 50,
    top_p: float = 0.9,
    device: str = None,
) -> str:
    """
    Generate text from prompt.
    
    Args:
        model: KSimplexLM model
        tokenizer: tiktoken encoding
        prompt: Input text prompt
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        top_k: Top-k sampling
        top_p: Nucleus sampling threshold
        device: Device
        
    Returns:
        Generated text including prompt
    """
    if device is None:
        device = next(model.parameters()).device
    
    # Encode prompt
    tokens = tokenizer.encode(prompt)
    tokens = torch.tensor([tokens], dtype=torch.long, device=device)
    
    # Generate
    for _ in range(max_tokens):
        # Truncate to max_seq_len
        if tokens.shape[1] > model.max_seq_len:
            tokens = tokens[:, -model.max_seq_len:]
        
        # Forward
        logits, geo_info = model(tokens)
        logits = logits[:, -1, :]  # Last position
        
        # Temperature
        logits = logits / temperature
        
        # Top-k
        if top_k > 0:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = float('-inf')
        
        # Top-p (nucleus)
        if top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            
            # Remove tokens with cumulative probability above threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            
            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
            logits[indices_to_remove] = float('-inf')
        
        # Sample
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        
        # Append
        tokens = torch.cat([tokens, next_token], dim=1)
        
        # Stop on EOS (optional)
        if next_token.item() == tokenizer.eot_token:
            break
    
    # Decode
    return tokenizer.decode(tokens[0].tolist())


@torch.no_grad()
def analyze_geometry(
    model: KSimplexLM,
    tokenizer: tiktoken.Encoding,
    text: str,
    device: str = None,
) -> dict:
    """
    Analyze geometric properties of text encoding.
    
    Args:
        model: KSimplexLM model
        tokenizer: tiktoken encoding
        text: Input text
        device: Device
        
    Returns:
        Dictionary with geometric statistics
    """
    if device is None:
        device = next(model.parameters()).device
    
    tokens = tokenizer.encode(text)
    tokens = torch.tensor([tokens], dtype=torch.long, device=device)
    
    _, geo_info = model(tokens)
    
    stats = {}
    for k, (vol2, d2) in enumerate(zip(geo_info['vol2'], geo_info['d2']), 1):
        vol2_np = vol2.cpu().numpy()
        d2_np = d2.cpu().numpy()
        
        stats[f'k{k}'] = {
            'vol2_mean': float(vol2_np.mean()),
            'vol2_std': float(vol2_np.std()),
            'vol2_min': float(vol2_np.min()),
            'vol2_max': float(vol2_np.max()),
            'validity_rate': float((vol2_np > 0).mean()),
            'd2_mean': float(d2_np.mean()),
        }
    
    return stats


# =============================================================================
# CLI
# =============================================================================

def main():
    parser = argparse.ArgumentParser(description='K-Simplex LLM Inference')
    parser.add_argument('--checkpoint', type=str, help='Path to checkpoint file')
    parser.add_argument('--repo', type=str, default='AbstractPhil/ksimplex-llm-prototype',
                        help='HuggingFace repo ID')
    parser.add_argument('--prompt', type=str, default='ROMEO: ',
                        help='Text prompt')
    parser.add_argument('--max_tokens', type=int, default=100,
                        help='Maximum tokens to generate')
    parser.add_argument('--temperature', type=float, default=0.8,
                        help='Sampling temperature')
    parser.add_argument('--top_k', type=int, default=50,
                        help='Top-k sampling')
    parser.add_argument('--top_p', type=float, default=0.9,
                        help='Nucleus sampling threshold')
    parser.add_argument('--analyze', action='store_true',
                        help='Analyze geometric properties instead of generating')
    
    args = parser.parse_args()
    
    print("Loading model...")
    model, tokenizer = load_model(
        checkpoint_path=args.checkpoint,
        repo_id=args.repo if not args.checkpoint else None,
    )
    print(f"Model loaded on {next(model.parameters()).device}")
    
    if args.analyze:
        print(f"\nAnalyzing: {args.prompt}")
        stats = analyze_geometry(model, tokenizer, args.prompt)
        for k, kstats in stats.items():
            print(f"\n{k}:")
            for name, value in kstats.items():
                print(f"  {name}: {value:.6f}")
    else:
        print(f"\nGenerating from: {args.prompt}")
        text = generate(
            model, tokenizer, args.prompt,
            max_tokens=args.max_tokens,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
        )
        print("\n" + "=" * 60)
        print(text)
        print("=" * 60)


if __name__ == '__main__':
    main()