#!/usr/bin/env python3
"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                                                                              ║
║   📊 Paris MoE - Comprehensive Benchmarking Utility 📊                       ║
║                                                                              ║
║   Measures performance across precision modes, batch sizes, and configs.     ║
║   Outputs results as both terminal display and Markdown file.                ║
║                                                                              ║
╚══════════════════════════════════════════════════════════════════════════════╝

Usage:
    python benchmark.py                          # Run all benchmarks
    python benchmark.py --quick                  # Quick benchmark (fewer configs)
    python benchmark.py --precision bf16         # Benchmark specific precision
    python benchmark.py --output results.md      # Save results to file
"""

import argparse
import sys
import os
import time
import gc
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
from typing import List, Dict, Optional

SCRIPT_DIR = Path(__file__).parent.absolute()
SRC_DIR = SCRIPT_DIR / "src"
sys.path.insert(0, str(SRC_DIR))

import torch

# ═══════════════════════════════════════════════════════════════════════════════
#                              DATA STRUCTURES
# ═══════════════════════════════════════════════════════════════════════════════

@dataclass
class BenchmarkResult:
    """Single benchmark result."""
    precision: str
    num_samples: int
    num_steps: int
    topk: int
    offload: int
    
    load_time: float  # Model loading time (seconds)
    gen_time: float   # Generation time (seconds)
    decode_time: float  # VAE decoding time (seconds)
    
    peak_memory_gb: float  # Peak GPU memory usage
    
    @property
    def total_time(self) -> float:
        return self.gen_time + self.decode_time
    
    @property
    def throughput(self) -> float:
        """Images per second (generation only)."""
        return self.num_samples / self.gen_time if self.gen_time > 0 else 0
    
    @property
    def time_per_step(self) -> float:
        """Seconds per sampling step."""
        return self.gen_time / self.num_steps if self.num_steps > 0 else 0
    
    @property
    def time_per_image(self) -> float:
        """Seconds per image (generation only)."""
        return self.gen_time / self.num_samples if self.num_samples > 0 else 0


# ═══════════════════════════════════════════════════════════════════════════════
#                              BENCHMARK RUNNER
# ═══════════════════════════════════════════════════════════════════════════════

def get_gpu_memory_gb() -> float:
    """Get current GPU memory usage in GB."""
    if torch.cuda.is_available():
        return torch.cuda.max_memory_allocated() / (1024 ** 3)
    return 0.0


def reset_gpu_memory():
    """Reset GPU memory tracking."""
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()
        gc.collect()


def run_single_benchmark(precision: str, num_samples: int, num_steps: int,
                          topk: int, offload: int, device: str = 'cuda') -> BenchmarkResult:
    """Run a single benchmark configuration."""
    from generate import load_sampler
    
    reset_gpu_memory()
    
    # Load model
    start_load = time.time()
    sampler = load_sampler(precision=precision, device=device, offload=offload)
    load_time = time.time() - start_load
    
    # Set seed for reproducibility
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    
    # Warmup run
    _ = sampler.sample(
        num_samples=1,
        text_prompts=["warmup"],
        cfg_scale=7.5,
        num_steps=2,
        use_bf16=(precision == 'bf16'),
        topk=topk
    )
    
    reset_gpu_memory()
    torch.cuda.synchronize()
    
    # Timed generation
    start_gen = time.time()
    latents = sampler.sample(
        num_samples=num_samples,
        text_prompts=["a cute cat"],
        cfg_scale=7.5,
        num_steps=num_steps,
        use_bf16=(precision == 'bf16'),
        topk=topk
    )
    torch.cuda.synchronize()
    gen_time = time.time() - start_gen
    
    # Timed decoding
    start_decode = time.time()
    images = sampler.vae_manager.decode(latents)
    torch.cuda.synchronize()
    decode_time = time.time() - start_decode
    
    peak_memory = get_gpu_memory_gb()
    
    # Cleanup
    del sampler, latents, images
    gc.collect()
    torch.cuda.empty_cache()
    
    return BenchmarkResult(
        precision=precision,
        num_samples=num_samples,
        num_steps=num_steps,
        topk=topk,
        offload=offload,
        load_time=load_time,
        gen_time=gen_time,
        decode_time=decode_time,
        peak_memory_gb=peak_memory
    )


# ═══════════════════════════════════════════════════════════════════════════════
#                              OUTPUT FORMATTERS
# ═══════════════════════════════════════════════════════════════════════════════

def format_terminal_results(results: List[BenchmarkResult], gpu_name: str) -> str:
    """Format results for terminal display."""
    lines = []
    
    lines.append("""
╔══════════════════════════════════════════════════════════════════════════════╗
║                     📊 PARIS MoE BENCHMARK RESULTS 📊                        ║
╚══════════════════════════════════════════════════════════════════════════════╝
    """)
    
    lines.append(f"  GPU: {gpu_name}")
    lines.append(f"  Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")
    
    # Group by precision
    precisions = sorted(set(r.precision for r in results))
    
    for precision in precisions:
        prec_results = [r for r in results if r.precision == precision]
        
        lines.append(f"┌{'─'*78}┐")
        lines.append(f"│  {precision.upper()} Precision{' '*65}│")
        lines.append(f"├{'─'*78}┤")
        lines.append(f"│ {'Samples':>8} │ {'Steps':>6} │ {'TopK':>5} │ {'Offload':>7} │ "
                    f"{'Gen(s)':>8} │ {'Img/s':>6} │ {'s/step':>6} │ {'Mem(GB)':>8} │")
        lines.append(f"├{'─'*78}┤")
        
        for r in prec_results:
            lines.append(
                f"│ {r.num_samples:>8} │ {r.num_steps:>6} │ {r.topk:>5} │ {r.offload:>7} │ "
                f"{r.gen_time:>8.2f} │ {r.throughput:>6.2f} │ {r.time_per_step:>6.3f} │ "
                f"{r.peak_memory_gb:>8.2f} │"
            )
        
        lines.append(f"└{'─'*78}┘")
        lines.append("")
    
    # Summary
    if results:
        fastest = min(results, key=lambda r: r.time_per_image)
        most_efficient = min(results, key=lambda r: r.peak_memory_gb)
        
        lines.append("┌─────────────────────────────────────────────────────────────────┐")
        lines.append("│                          📈 SUMMARY                             │")
        lines.append("├─────────────────────────────────────────────────────────────────┤")
        lines.append(f"│  🏆 Fastest:         {fastest.precision.upper():>6} @ {fastest.throughput:.2f} img/s              │")
        lines.append(f"│  💾 Most Efficient:  {most_efficient.precision.upper():>6} @ {most_efficient.peak_memory_gb:.1f} GB peak          │")
        lines.append("└─────────────────────────────────────────────────────────────────┘")
    
    return "\n".join(lines)


def format_markdown_results(results: List[BenchmarkResult], gpu_name: str) -> str:
    """Format results as Markdown."""
    lines = []
    
    lines.append("# 📊 Paris MoE Benchmark Results")
    lines.append("")
    lines.append(f"**GPU:** {gpu_name}")
    lines.append(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")
    
    lines.append("## 🏗️ Model Architecture")
    lines.append("")
    lines.append("| Component | Details |")
    lines.append("|-----------|---------|")
    lines.append("| Experts | 8× DiT-XL/2 (606M params each) |")
    lines.append("| Router | DiT-B/2 (129M params) |")
    lines.append("| Total | ~5 Billion parameters |")
    lines.append("| VAE | SD-VAE (stabilityai/sd-vae-ft-mse) |")
    lines.append("| Text Encoder | CLIP ViT-L/14 |")
    lines.append("")
    
    # Group by precision
    precisions = sorted(set(r.precision for r in results))
    
    for precision in precisions:
        prec_results = [r for r in results if r.precision == precision]
        
        lines.append(f"## {precision.upper()} Precision")
        lines.append("")
        lines.append("| Samples | Steps | TopK | Offload | Gen Time (s) | Throughput (img/s) | Time/Step (s) | Peak Memory (GB) |")
        lines.append("|---------|-------|------|---------|--------------|-------------------|---------------|------------------|")
        
        for r in prec_results:
            lines.append(
                f"| {r.num_samples} | {r.num_steps} | {r.topk} | {r.offload} | "
                f"{r.gen_time:.2f} | {r.throughput:.2f} | {r.time_per_step:.3f} | {r.peak_memory_gb:.2f} |"
            )
        
        lines.append("")
    
    # Summary
    if results:
        lines.append("## 📈 Summary")
        lines.append("")
        
        fastest = min(results, key=lambda r: r.time_per_image)
        most_efficient = min(results, key=lambda r: r.peak_memory_gb)
        
        lines.append(f"- **🏆 Fastest Configuration:** {fastest.precision.upper()}, "
                    f"{fastest.num_samples} samples @ {fastest.throughput:.2f} img/s")
        lines.append(f"- **💾 Most Memory Efficient:** {most_efficient.precision.upper()} "
                    f"with offload={most_efficient.offload} @ {most_efficient.peak_memory_gb:.1f} GB peak")
        lines.append("")
        
        # Recommendations
        lines.append("## 🎯 Recommendations")
        lines.append("")
        lines.append("| Use Case | Precision | Offload | Expected Performance |")
        lines.append("|----------|-----------|---------|---------------------|")
        
        bf16_results = [r for r in results if r.precision == 'bf16' and r.offload == 0]
        if bf16_results:
            r = bf16_results[0]
            lines.append(f"| **Production (Quality)** | BF16 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
        
        int8_results = [r for r in results if r.precision == 'int8' and r.offload == 0]
        if int8_results:
            r = int8_results[0]
            lines.append(f"| **Balanced** | INT8 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
        
        offload_results = [r for r in results if r.offload > 0]
        if offload_results:
            r = min(offload_results, key=lambda x: x.peak_memory_gb)
            lines.append(f"| **Low VRAM** | {r.precision.upper()} | {r.offload} | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
    
    lines.append("")
    lines.append("---")
    lines.append("*Generated by Paris MoE Benchmark Utility*")
    
    return "\n".join(lines)


# ═══════════════════════════════════════════════════════════════════════════════
#                              MAIN
# ═══════════════════════════════════════════════════════════════════════════════

def parse_args():
    parser = argparse.ArgumentParser(
        description="📊 Paris MoE - Benchmark Utility",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python benchmark.py                      # Full benchmark suite
  python benchmark.py --quick              # Quick benchmark
  python benchmark.py --precision bf16     # BF16 only
  python benchmark.py --output results.md  # Save to file
        """
    )
    
    parser.add_argument("--quick", action="store_true",
                        help="Run quick benchmark with fewer configurations")
    parser.add_argument("--precision", type=str, default=None,
                        choices=["bf16", "int8", "mixed"],
                        help="Benchmark specific precision only")
    parser.add_argument("--output", "-o", type=str, default=None,
                        help="Output Markdown file path")
    parser.add_argument("--samples", type=int, default=None,
                        help="Override number of samples")
    parser.add_argument("--steps", type=int, default=None,
                        help="Override number of steps")
    
    return parser.parse_args()


def get_benchmark_configs(args) -> List[Dict]:
    """Get list of benchmark configurations to run."""
    configs = []
    
    if args.quick:
        # Quick benchmark: minimal configs
        precisions = [args.precision] if args.precision else ['bf16', 'int8']
        samples = args.samples or 4
        steps = args.steps or 10
        
        for precision in precisions:
            configs.append({
                'precision': precision,
                'num_samples': samples,
                'num_steps': steps,
                'topk': 1,
                'offload': 0
            })
    else:
        # Full benchmark suite
        precisions = [args.precision] if args.precision else ['bf16', 'int8']
        samples_list = [args.samples] if args.samples else [4, 16]
        steps_list = [args.steps] if args.steps else [20, 30]
        topk_list = [1, 2]
        offload_list = [0, 4]
        
        for precision in precisions:
            for samples in samples_list:
                for steps in steps_list:
                    for topk in topk_list:
                        for offload in offload_list:
                            configs.append({
                                'precision': precision,
                                'num_samples': samples,
                                'num_steps': steps,
                                'topk': topk,
                                'offload': offload
                            })
    
    return configs


def main():
    args = parse_args()
    
    print("""
╔══════════════════════════════════════════════════════════════════════════════╗
║                                                                              ║
║   📊 Paris MoE - Comprehensive Benchmarking Utility 📊                       ║
║                                                                              ║
║   Measuring performance across precision modes, batch sizes, and configs.    ║
║                                                                              ║
╚══════════════════════════════════════════════════════════════════════════════╝
    """)
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device != "cuda":
        print("⚠️  Warning: Running on CPU. Benchmarks will be slow.")
    
    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
    print(f"🖥️  Device: {gpu_name}")
    
    configs = get_benchmark_configs(args)
    print(f"📋 Running {len(configs)} benchmark configurations...\n")
    
    results = []
    
    for i, config in enumerate(configs):
        print(f"[{i+1}/{len(configs)}] {config['precision'].upper()} | "
              f"{config['num_samples']} samples | {config['num_steps']} steps | "
              f"Top-{config['topk']} | Offload {config['offload']}")
        
        try:
            result = run_single_benchmark(
                precision=config['precision'],
                num_samples=config['num_samples'],
                num_steps=config['num_steps'],
                topk=config['topk'],
                offload=config['offload'],
                device=device
            )
            results.append(result)
            print(f"    ✅ {result.gen_time:.2f}s, {result.throughput:.2f} img/s, "
                  f"{result.peak_memory_gb:.1f} GB peak")
        except Exception as e:
            print(f"    ❌ Failed: {e}")
        
        print()
    
    if not results:
        print("❌ No successful benchmarks!")
        return 1
    
    # Print terminal results
    terminal_output = format_terminal_results(results, gpu_name)
    print(terminal_output)
    
    # Save Markdown if requested
    if args.output:
        md_output = format_markdown_results(results, gpu_name)
        with open(args.output, 'w') as f:
            f.write(md_output)
        print(f"\n✅ Results saved to: {args.output}")
    
    return 0


if __name__ == "__main__":
    exit(main())