"""
Head-to-head comparison: H4 attention vs softmax vs linear attention.
Same model size, same data, same training budget.

Usage:
    python compare_baselines.py                    # Shakespeare (default)
    python compare_baselines.py --dataset tinystories
    python compare_baselines.py --time-budget 60   # Faster runs
"""

import os
import sys
import math
import time
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from prepare_data import load_and_prepare
from baselines import BaselineLanguageModel
from h4_language_model import H4LanguageModel

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# All values below are module-level defaults shared by every model in the
# comparison, so that differences in results come from the attention
# mechanism rather than from size or optimisation settings.

# Model architecture (same for all models)
D_MODEL = 128
N_HEADS = 8
N_LAYERS = 4
# presumably the per-head value width (D_MODEL / N_HEADS == 16) — confirm
# against the model classes, which receive it as `d_value`.
D_VALUE = 16
D_FFN = 512
# Length of each sampled training/eval sequence. The models themselves are
# built with max_seq_len = 2 * MAX_SEQ_LEN (see create_model).
MAX_SEQ_LEN = 128
DROPOUT = 0.0

# Training
BATCH_SIZE = 8
# Peak learning rate for AdamW; scaled by the warmup/cosine schedule.
LR = 5e-3
WEIGHT_DECAY = 0.01
# Number of optimizer steps over which the LR ramps up linearly from 0.
WARMUP_STEPS = 50
# Max gradient norm passed to clip_grad_norm_; a value <= 0 disables clipping.
GRAD_CLIP = 1.0
# Default for the --time-budget CLI flag.
TIME_BUDGET = 120  # seconds per model

# Eval
# Run a validation pass every EVAL_INTERVAL optimizer steps.
EVAL_INTERVAL = 25
# Batches averaged per periodic validation pass (the final pass uses 4x this).
EVAL_BATCHES = 5

# Models to compare
# 'name' is the display label and the string matched by --models;
# 'attention' == 'h4' selects H4LanguageModel, any other value is forwarded
# to BaselineLanguageModel as attention_type; 'bitlinear' is forwarded as
# use_bitlinear.
CONFIGS = [
    {'name': 'H4 Float',  'attention': 'h4',      'bitlinear': False},
    {'name': 'H4 Ternary', 'attention': 'h4',     'bitlinear': True},
    {'name': 'Softmax',    'attention': 'softmax', 'bitlinear': False},
    {'name': 'Linear',     'attention': 'linear',  'bitlinear': False},
]


def get_batch(data, batch_size, seq_len):
    """Sample a random batch of next-token prediction pairs.

    Args:
        data: 1-D tensor of token ids to sample windows from.
        batch_size: Number of windows to draw (with replacement).
        seq_len: Length of each input window.

    Returns:
        A tuple ``(x, y)`` of stacked tensors, each with ``batch_size`` rows
        of length ``seq_len``, where ``y`` is ``x`` shifted one position
        forward in ``data``.

    Raises:
        ValueError: If ``data`` holds fewer than ``seq_len + 1`` tokens, so
            no complete (input, target) window exists.
    """
    # A window starting at i reads data[i : i + seq_len + 1], so the valid
    # starts are 0 .. len(data) - seq_len - 1 inclusive. torch.randint's
    # upper bound is exclusive, hence the count below is the bound itself.
    n_starts = len(data) - seq_len
    if n_starts <= 0:
        raise ValueError(
            f"need at least seq_len + 1 = {seq_len + 1} tokens to sample a "
            f"batch, got {len(data)}"
        )
    starts = torch.randint(0, n_starts, (batch_size,)).tolist()
    x = torch.stack([data[i:i + seq_len] for i in starts])
    y = torch.stack([data[i + 1:i + seq_len + 1] for i in starts])
    return x, y


def create_model(config, vocab_size):
    """Build the language model described by one entry of ``CONFIGS``.

    ``config['attention'] == 'h4'`` yields an ``H4LanguageModel``; any other
    value yields a ``BaselineLanguageModel`` with that value as its
    ``attention_type``. Both receive the same module-level hyperparameters.
    """
    kind = config['attention']
    bitlinear = config['bitlinear']

    # Hyperparameters common to both model families.
    shared = dict(
        vocab_size=vocab_size,
        d_model=D_MODEL,
        n_heads=N_HEADS,
        n_layers=N_LAYERS,
        d_value=D_VALUE,
        d_ffn=D_FFN,
        max_seq_len=MAX_SEQ_LEN * 2,
        dropout=DROPOUT,
        use_bitlinear=bitlinear,
    )

    if kind != 'h4':
        return BaselineLanguageModel(attention_type=kind, **shared)
    return H4LanguageModel(top_k=16, **shared)


def _forward_logits(model, tokens, is_h4):
    """Run one forward pass; H4 models are called with ``use_tree=False``."""
    if is_h4:
        return model(tokens, use_tree=False)
    return model(tokens)


def _mean_val_loss(model, val_data, vocab_size, is_h4, n_batches):
    """Average cross-entropy over ``n_batches`` random validation batches.

    Runs under ``torch.no_grad()``. The caller is responsible for switching
    the model to eval mode beforehand (and back to train mode afterwards).
    """
    losses = []
    with torch.no_grad():
        for _ in range(n_batches):
            xv, yv = get_batch(val_data, BATCH_SIZE, MAX_SEQ_LEN)
            vlogits = _forward_logits(model, xv, is_h4)
            losses.append(
                F.cross_entropy(vlogits.view(-1, vocab_size), yv.view(-1)).item()
            )
    return sum(losses) / len(losses)


def train_and_evaluate(config, train_data, val_data, vocab_size, itos, time_budget):
    """Train one model for a fixed time budget and return its metrics.

    Args:
        config: One entry of ``CONFIGS`` (keys ``name``, ``attention``,
            ``bitlinear``).
        train_data: Token data that training batches are sampled from.
        val_data: Token data that validation batches are sampled from.
        vocab_size: Number of token classes; used to flatten the logits.
        itos: Mapping from token id to string (needs ``.get``), used to
            decode the generated sample.
        time_budget: Seconds of accumulated training-step time after which
            training stops. Evaluation time is not counted, and the first
            three steps are excluded from the accounting as warm-up.

    Returns:
        A dict with keys ``name``, ``attention``, ``bitlinear``, ``params``,
        ``steps``, ``val_loss``, ``best_val_loss``, ``val_bpb``,
        ``perplexity``, ``wall_time``, ``train_time`` and ``sample``.
    """
    name = config['name']
    print(f"\n{'='*60}")
    print(f"Training: {name}")
    print(f"{'='*60}")

    # Same seed for every model so initialisation and batch sampling start
    # from an identical RNG state.
    torch.manual_seed(42)
    np.random.seed(42)

    model = create_model(config, vocab_size)
    param_info = model.count_params()
    print(f"  Parameters: {param_info['trainable']:,} trainable")

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=LR,
        weight_decay=WEIGHT_DECAY,
        betas=(0.9, 0.95),
    )

    # Step at which the cosine decay reaches its floor of 0.1 * LR.
    decay_end_step = 500

    def lr_schedule(step):
        # Linear warmup from 0, then cosine decay from 1.0 down to 0.1.
        if step < WARMUP_STEPS:
            return step / max(WARMUP_STEPS, 1)
        progress = (step - WARMUP_STEPS) / max(1, decay_end_step - WARMUP_STEPS)
        return 0.1 + 0.9 * 0.5 * (1 + math.cos(math.pi * min(progress, 1.0)))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_schedule)

    # H4 models use full attention (no tree) for short sequences
    is_h4 = config['attention'] == 'h4'

    step = 0
    total_training_time = 0.0
    best_val_loss = float('inf')
    model.train()

    # perf_counter is monotonic, so measured durations cannot go negative
    # if the system clock is adjusted mid-run.
    t_start = time.perf_counter()

    while True:
        t0 = time.perf_counter()

        x, y = get_batch(train_data, BATCH_SIZE, MAX_SEQ_LEN)

        logits = _forward_logits(model, x, is_h4)
        loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        if GRAD_CLIP > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        scheduler.step()

        dt = time.perf_counter() - t0
        # Skip the first few steps: they include one-off startup cost.
        if step > 2:
            total_training_time += dt

        # Periodic eval
        if step % EVAL_INTERVAL == 0:
            model.eval()
            val_loss = _mean_val_loss(model, val_data, vocab_size, is_h4, EVAL_BATCHES)
            best_val_loss = min(best_val_loss, val_loss)

            # A non-positive budget is already exhausted; avoid dividing by 0.
            if time_budget > 0:
                progress = min(total_training_time / time_budget, 1.0)
            else:
                progress = 1.0
            print(f"  step {step:5d} | loss {loss.item():.4f} | val_loss {val_loss:.4f} | {progress:.0%}")
            model.train()

        step += 1
        if step > 2 and total_training_time >= time_budget:
            break

    # Final evaluation (more batches for stable estimate)
    model.eval()
    final_val_loss = _mean_val_loss(model, val_data, vocab_size, is_h4, EVAL_BATCHES * 4)

    # Cross-entropy is in nats; dividing by ln(2) converts it to bits.
    val_bpb = final_val_loss / math.log(2)
    perplexity = math.exp(final_val_loss)

    # Generate sample. The prompt is token ids 0..3, clamped so that a
    # vocabulary smaller than four tokens does not index out of range.
    seed_ids = torch.arange(min(4, vocab_size), dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        gen = model.generate(seed_ids, max_new_tokens=60, temperature=0.8, top_k_sample=10)
    gen_text = ''.join([itos.get(i.item(), '?') for i in gen[0]])

    wall_time = time.perf_counter() - t_start

    results = {
        'name': name,
        'attention': config['attention'],
        'bitlinear': config['bitlinear'],
        'params': param_info['trainable'],
        'steps': step,
        'val_loss': final_val_loss,
        'best_val_loss': best_val_loss,
        'val_bpb': val_bpb,
        'perplexity': perplexity,
        'wall_time': wall_time,
        'train_time': total_training_time,
        'sample': gen_text[:100],
    }

    print(f"  Final: val_loss={final_val_loss:.4f}, bpb={val_bpb:.4f}, "
          f"ppl={perplexity:.1f}, steps={step}, time={wall_time:.0f}s")

    return results


def print_comparison_table(all_results, dataset_name, time_budget=TIME_BUDGET):
    """Print a formatted comparison table for a set of training runs.

    Args:
        all_results: List of result dicts as returned by
            ``train_and_evaluate``. May be empty.
        dataset_name: Dataset label shown in the header.
        time_budget: Per-model training budget in seconds, shown in the
            header.

    Rows are ordered by ascending ``val_loss``; runs whose ``val_loss`` is
    NaN are listed last so a diverged model is never reported as the best
    while a finite result exists.
    """
    print(f"\n{'='*80}")
    print(f"COMPARISON RESULTS — Dataset: {dataset_name}")
    print(f"Config: d_model={D_MODEL}, n_layers={N_LAYERS}, n_heads={N_HEADS}, "
          f"seq_len={MAX_SEQ_LEN}, budget={time_budget}s")
    print(f"{'='*80}")

    if not all_results:
        # Nothing to rank; the "best model" lookup below would fail.
        print("No results to compare.")
        return

    # Header
    print(f"{'Model':<16} {'Params':>8} {'Steps':>6} {'Val Loss':>9} "
          f"{'BPB':>7} {'PPL':>8} {'Time':>6}")
    print(f"{'-'*16} {'-'*8} {'-'*6} {'-'*9} {'-'*7} {'-'*8} {'-'*6}")

    # Sort by val_loss. NaN compares False against everything, which would
    # leave the order arbitrary, so sort on (is_nan, value) instead.
    sorted_results = sorted(
        all_results,
        key=lambda r: (math.isnan(r['val_loss']), r['val_loss']),
    )

    for r in sorted_results:
        params_str = f"{r['params'] // 1000}K" if r['params'] >= 1000 else str(r['params'])
        print(f"{r['name']:<16} {params_str:>8} {r['steps']:>6} {r['val_loss']:>9.4f} "
              f"{r['val_bpb']:>7.4f} {r['perplexity']:>8.1f} {r['wall_time']:>5.0f}s")

    # Best model
    best = sorted_results[0]
    print(f"\nBest: {best['name']} (val_loss={best['val_loss']:.4f}, ppl={best['perplexity']:.1f})")

    # H4 vs Softmax comparison
    h4_float = next((r for r in all_results if r['attention'] == 'h4' and not r['bitlinear']), None)
    softmax = next((r for r in all_results if r['attention'] == 'softmax'), None)
    if h4_float is not None and softmax is not None:
        delta = softmax['val_loss'] - h4_float['val_loss']
        if not math.isfinite(delta):
            # A diverged run must not be reported as a win for either side.
            print("H4 Float vs Softmax: not comparable (non-finite val_loss)")
        elif delta == 0:
            print("H4 Float vs Softmax: tie")
        else:
            pct = (delta / softmax['val_loss']) * 100
            if delta > 0:
                print(f"H4 Float vs Softmax: H4 wins by {delta:.4f} nats ({pct:.1f}% better)")
            else:
                print(f"H4 Float vs Softmax: Softmax wins by {-delta:.4f} nats ({-pct:.1f}% better)")

    # Sample text from each model
    print(f"\n{'='*80}")
    print("GENERATED SAMPLES:")
    print(f"{'='*80}")
    for r in sorted_results:
        print(f"\n[{r['name']}]")
        print(f"  {r['sample']}")


def main():
    """Parse CLI arguments, train each selected model and print the comparison.

    ``--models`` entries are matched case-insensitively as substrings of the
    ``name`` field in ``CONFIGS``. A failure while training one model is
    reported with its traceback and does not stop the remaining models.
    """
    parser = argparse.ArgumentParser(description='Compare H4 vs baseline attention mechanisms')
    parser.add_argument('--dataset', default='shakespeare',
                        choices=['synthetic', 'shakespeare', 'tinystories'],
                        help='Dataset to use (default: shakespeare)')
    parser.add_argument('--time-budget', type=int, default=TIME_BUDGET,
                        help=f'Training time per model in seconds (default: {TIME_BUDGET})')
    parser.add_argument('--models', nargs='+', default=None,
                        help='Subset of models to run (e.g., "h4 softmax")')
    args = parser.parse_args()

    time_budget = args.time_budget

    print("H4 Polytopic Attention — Baseline Comparison")
    print(f"Dataset: {args.dataset}, Time budget: {time_budget}s per model")

    # Filter configs if requested. Done before loading data so that a typo in
    # --models fails fast, and so the time estimate reflects what will run.
    configs = CONFIGS
    if args.models:
        wanted = [m.lower() for m in args.models]
        configs = [c for c in CONFIGS if any(m in c['name'].lower() for m in wanted)]
        if not configs:
            print(f"No matching models for {args.models}. Available: {[c['name'] for c in CONFIGS]}")
            return

    print(f"Expected total time: ~{len(configs) * time_budget // 60} minutes")

    # Load data
    train_data, val_data, vocab_size, stoi, itos = load_and_prepare(args.dataset)
    print(f"Vocab: {vocab_size}, Train: {len(train_data):,}, Val: {len(val_data):,}")

    # Run comparisons
    all_results = []
    for config in configs:
        try:
            results = train_and_evaluate(
                config, train_data, val_data, vocab_size, itos, time_budget
            )
            all_results.append(results)
        except Exception as e:
            # Top-level boundary: report the failure and keep going so one
            # broken model does not discard the other models' results.
            print(f"\n  ERROR training {config['name']}: {e}")
            import traceback
            traceback.print_exc()

    if all_results:
        print_comparison_table(all_results, args.dataset, time_budget)


# Run the comparison only when executed as a script, not when imported.
if __name__ == '__main__':
    main()