"""
Benchmark Generation — Prefill + Decode across Goliath configs
==============================================================

Measures:
  - Prefill throughput (tok/s)
  - Decode throughput (tok/s) and per-token latency (ms)
  - VRAM usage (GB)

Configs tested:
  - Goliath FP4   (goliath_bits=4)
  - Goliath FP8   (goliath_bits=8)
  - Goliath Auto   (goliath_bits='auto')
  - Legacy path    (use_goliath=False)

Context lengths: 512, 2048, 8192

Usage:
    python3 benchmark_generation.py
"""

import gc
import sys
import time
import torch

# Ensure the kernel directory is importable
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from fireecho_kernel import FireEchoConfig, FireEchoEngine, _GOLIATH_AVAILABLE

if _GOLIATH_AVAILABLE:
    from goliath_kernel import _can_use_goliath_dot_scaled
else:
    _can_use_goliath_dot_scaled = None


# ============================================================================
# Engine Factory
# ============================================================================

def create_bench_engine(goliath_bits=4, use_goliath=True, num_layers=8):
    """Build the benchmark engine: 7B-scale shapes with a reduced layer count.

    Args:
        goliath_bits: Forwarded to ``FireEchoConfig(goliath_bits=...)``. This
            script passes 4, 8 or ``'auto'``.
        use_goliath: Forwarded to ``FireEchoConfig(use_goliath=...)``. This
            script passes ``False`` for the configuration it labels
            "Legacy path".
        num_layers: Forwarded to ``FireEchoConfig(num_layers=...)``.

    Returns:
        A ``FireEchoEngine`` that has been moved to CUDA and switched to
        eval mode.
    """
    settings = {
        # Fixed model shape shared by every benchmarked configuration.
        'dim': 4096,
        'num_heads': 32,
        'num_kv_heads': 8,
        'num_layers': num_layers,
        'vocab_size': 32000,
        'intermediate_size': 11008,
        'max_seq_len': 16384,
        'max_kv_blocks': 1024,
        # Quantization switches; the Goliath ones are what the benchmark varies.
        'use_nvfp4': True,
        'quantize_weights': True,
        'goliath_bits': goliath_bits,
        'use_goliath': use_goliath,
        # Optional subsystems are switched off for this benchmark.
        'use_hebbian': False,
        'use_vision': False,
        'use_audio': False,
    }
    bench_engine = FireEchoEngine(FireEchoConfig(**settings))
    bench_engine = bench_engine.cuda()
    bench_engine.eval()
    return bench_engine


# ============================================================================
# Benchmark Helpers
# ============================================================================

def bench_prefill(engine, seq_len, warmup=3, iters=5):
    """Benchmark prefill (one forward pass over a full ``seq_len``-token prompt).

    Args:
        engine: Model under test. It is called as
            ``engine(input_ids, use_cache=False)`` and must provide
            ``reset_cache()``.
        seq_len: Number of prompt tokens in the single-sequence batch.
        warmup: Number of untimed passes run before measuring.
        iters: Number of timed passes that are averaged; must be >= 1.

    Returns:
        dict with ``ms`` (mean time per pass), ``tok_s`` (``seq_len`` divided
        by that time) and ``vram_gb`` (``torch.cuda.memory_allocated()`` after
        the run, in units of 1e9 bytes).

    Raises:
        ValueError: If ``iters`` is smaller than 1.
    """
    # Fail fast: without this check a zero iteration count only surfaced as a
    # ZeroDivisionError after the warmup passes had already run on the GPU,
    # and a negative count silently produced negative timings.
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    # 32000 is the vocab_size that create_bench_engine configures.
    input_ids = torch.randint(0, 32000, (1, seq_len), device='cuda')

    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)

    # One no_grad scope around everything keeps the context-manager
    # enter/exit cost out of the timed loop.
    with torch.no_grad():
        # Warmup
        for _ in range(warmup):
            engine.reset_cache()
            engine(input_ids, use_cache=False)
        torch.cuda.synchronize()

        # Benchmark
        start_evt.record()
        for _ in range(iters):
            engine.reset_cache()
            last_out = engine(input_ids, use_cache=False)
        end_evt.record()
        # elapsed_time() is only valid once both events have completed.
        torch.cuda.synchronize()

    ms = start_evt.elapsed_time(end_evt) / iters
    # Same zero guard bench_decode applies to its throughput figure.
    tok_s = seq_len / (ms / 1000.0) if ms > 0 else 0.0
    # The final output is still referenced here, so any GPU memory it holds
    # is part of this snapshot.
    vram_gb = torch.cuda.memory_allocated() / 1e9
    del last_out

    return {'ms': ms, 'tok_s': tok_s, 'vram_gb': vram_gb}


def bench_decode(engine, prompt_len, num_decode_tokens=50, warmup=2):
    """Time a single ``engine.generate`` call on a random prompt.

    Args:
        engine: Model under test; must provide ``reset_cache()`` and
            ``generate(prompt, max_new_tokens=..., use_cache=...)``.
        prompt_len: Number of tokens in the random single-sequence prompt.
        num_decode_tokens: Value passed as ``max_new_tokens`` for the timed
            call.
        warmup: Number of untimed 5-token generations run beforehand.

    Returns:
        dict with ``total_ms``, ``per_token_ms``, ``tok_s``, ``vram_gb`` and
        ``gen_tokens``. ``gen_tokens`` is the output's second dimension minus
        ``prompt_len``.

    NOTE(review): the CUDA events bracket the whole ``generate()`` call, so
    any prompt processing it performs is presumably included in the reported
    decode figures. Confirm against the engine implementation.
    """
    # 32000 is the vocab_size that create_bench_engine configures.
    prompt = torch.randint(0, 32000, (1, prompt_len), device='cuda')

    # Untimed warmup generations.
    for _ in range(warmup):
        engine.reset_cache()
        with torch.no_grad():
            engine.generate(prompt, max_new_tokens=5, use_cache=False)
    torch.cuda.synchronize()

    # Drop warmup garbage before the measured run.
    gc.collect()
    torch.cuda.empty_cache()

    timer_begin = torch.cuda.Event(enable_timing=True)
    timer_end = torch.cuda.Event(enable_timing=True)

    # Measured run.
    engine.reset_cache()
    timer_begin.record()
    with torch.no_grad():
        generated = engine.generate(
            prompt,
            max_new_tokens=num_decode_tokens,
            use_cache=False,
        )
    timer_end.record()
    torch.cuda.synchronize()

    elapsed_ms = timer_begin.elapsed_time(timer_end)
    produced = generated.shape[1] - prompt_len

    if elapsed_ms > 0:
        throughput = produced / (elapsed_ms / 1000.0)
    else:
        throughput = 0.0

    stats = {}
    stats['total_ms'] = elapsed_ms
    # Clamp the divisor so an empty generation cannot divide by zero.
    stats['per_token_ms'] = elapsed_ms / max(produced, 1)
    stats['tok_s'] = throughput
    stats['vram_gb'] = torch.cuda.memory_allocated() / 1e9
    stats['gen_tokens'] = produced
    return stats


# ============================================================================
# Main Benchmark
# ============================================================================

def main():
    if not torch.cuda.is_available():
        print("CUDA not available.")
        return

    props = torch.cuda.get_device_properties(0)
    print("=" * 85)
    print("GENERATION BENCHMARK — Goliath FP4/FP8 Configs")
    print("=" * 85)
    print(f"GPU: {props.name}")
    print(f"VRAM: {props.total_memory / 1e9:.1f} GB")
    print(f"Goliath available: {_GOLIATH_AVAILABLE}")
    if _can_use_goliath_dot_scaled is not None:
        print(f"Goliath dot_scaled (native FP4 TCs): {_can_use_goliath_dot_scaled()}")
    print()

    configs = [
        ('Goliath FP4',  dict(goliath_bits=4,      use_goliath=True)),
        ('Goliath FP8',  dict(goliath_bits=8,      use_goliath=True)),
        ('Goliath Auto', dict(goliath_bits='auto', use_goliath=True)),
        ('Legacy path',  dict(goliath_bits=4,      use_goliath=False)),
    ]

    context_lengths = [512, 2048, 8192]

    # --- Prefill benchmark ---
    print("-" * 85)
    print("PREFILL BENCHMARK")
    print("-" * 85)
    header = f"{'Config':<16} | {'Ctx':>5} | {'Prefill ms':>11} | {'Prefill tok/s':>14} | {'VRAM GB':>8}"
    print(header)
    print("-" * len(header))

    for cfg_name, cfg_kwargs in configs:
        try:
            engine = create_bench_engine(**cfg_kwargs)
        except Exception as e:
            print(f"{cfg_name:<16} | {'ERROR':>5} | {str(e)[:40]}")
            continue

        for ctx in context_lengths:
            try:
                r = bench_prefill(engine, ctx)
                print(f"{cfg_name:<16} | {ctx:>5} | {r['ms']:>9.2f}ms | {r['tok_s']:>12,.0f} | {r['vram_gb']:>7.2f}")
            except Exception as e:
                print(f"{cfg_name:<16} | {ctx:>5} | ERROR: {str(e)[:30]}")

        del engine
        gc.collect()
        torch.cuda.empty_cache()

    # --- Decode benchmark ---
    print()
    print("-" * 85)
    print("DECODE BENCHMARK (50 tokens)")
    print("-" * 85)
    header = f"{'Config':<16} | {'Ctx':>5} | {'Decode tok/s':>13} | {'ms/token':>9} | {'VRAM GB':>8}"
    print(header)
    print("-" * len(header))

    for cfg_name, cfg_kwargs in configs:
        try:
            engine = create_bench_engine(**cfg_kwargs)
        except Exception as e:
            print(f"{cfg_name:<16} | {'ERROR':>5} | {str(e)[:40]}")
            continue

        for ctx in context_lengths:
            try:
                r = bench_decode(engine, ctx)
                print(f"{cfg_name:<16} | {ctx:>5} | {r['tok_s']:>11,.1f} | {r['per_token_ms']:>7.2f}ms | {r['vram_gb']:>7.2f}")
            except Exception as e:
                print(f"{cfg_name:<16} | {ctx:>5} | ERROR: {str(e)[:30]}")

        del engine
        gc.collect()
        torch.cuda.empty_cache()

    print()
    print("=" * 85)
    print("BENCHMARK COMPLETE")
    print("=" * 85)


# Run the benchmark only when this file is executed as a script, so that
# importing it has no side effects beyond the definitions above.
if __name__ == "__main__":
    main()