#!/usr/bin/env python3 """ ╔══════════════════════════════════════════════════════════════════════════════╗ ║ ║ ║ 📊 Paris MoE - Comprehensive Benchmarking Utility 📊 ║ ║ ║ ║ Measures performance across precision modes, batch sizes, and configs. ║ ║ Outputs results as both terminal display and Markdown file. ║ ║ ║ ╚══════════════════════════════════════════════════════════════════════════════╝ Usage: python benchmark.py # Run all benchmarks python benchmark.py --quick # Quick benchmark (fewer configs) python benchmark.py --precision bf16 # Benchmark specific precision python benchmark.py --output results.md # Save results to file """ import argparse import sys import os import time import gc from pathlib import Path from datetime import datetime from dataclasses import dataclass from typing import List, Dict, Optional SCRIPT_DIR = Path(__file__).parent.absolute() SRC_DIR = SCRIPT_DIR / "src" sys.path.insert(0, str(SRC_DIR)) import torch # ═══════════════════════════════════════════════════════════════════════════════ # DATA STRUCTURES # ═══════════════════════════════════════════════════════════════════════════════ @dataclass class BenchmarkResult: """Single benchmark result.""" precision: str num_samples: int num_steps: int topk: int offload: int load_time: float # Model loading time (seconds) gen_time: float # Generation time (seconds) decode_time: float # VAE decoding time (seconds) peak_memory_gb: float # Peak GPU memory usage @property def total_time(self) -> float: return self.gen_time + self.decode_time @property def throughput(self) -> float: """Images per second (generation only).""" return self.num_samples / self.gen_time if self.gen_time > 0 else 0 @property def time_per_step(self) -> float: """Seconds per sampling step.""" return self.gen_time / self.num_steps if self.num_steps > 0 else 0 @property def time_per_image(self) -> float: """Seconds per image (generation only).""" return self.gen_time / self.num_samples if self.num_samples > 0 else 0 # ═══════════════════════════════════════════════════════════════════════════════ # BENCHMARK RUNNER # ═══════════════════════════════════════════════════════════════════════════════ def get_gpu_memory_gb() -> float: """Get current GPU memory usage in GB.""" if torch.cuda.is_available(): return torch.cuda.max_memory_allocated() / (1024 ** 3) return 0.0 def reset_gpu_memory(): """Reset GPU memory tracking.""" if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats() torch.cuda.empty_cache() gc.collect() def run_single_benchmark(precision: str, num_samples: int, num_steps: int, topk: int, offload: int, device: str = 'cuda') -> BenchmarkResult: """Run a single benchmark configuration.""" from generate import load_sampler reset_gpu_memory() # Load model start_load = time.time() sampler = load_sampler(precision=precision, device=device, offload=offload) load_time = time.time() - start_load # Set seed for reproducibility torch.manual_seed(42) if torch.cuda.is_available(): torch.cuda.manual_seed(42) # Warmup run _ = sampler.sample( num_samples=1, text_prompts=["warmup"], cfg_scale=7.5, num_steps=2, use_bf16=(precision == 'bf16'), topk=topk ) reset_gpu_memory() torch.cuda.synchronize() # Timed generation start_gen = time.time() latents = sampler.sample( num_samples=num_samples, text_prompts=["a cute cat"], cfg_scale=7.5, num_steps=num_steps, use_bf16=(precision == 'bf16'), topk=topk ) torch.cuda.synchronize() gen_time = time.time() - start_gen # Timed decoding start_decode = time.time() images = sampler.vae_manager.decode(latents) torch.cuda.synchronize() decode_time = time.time() - start_decode peak_memory = get_gpu_memory_gb() # Cleanup del sampler, latents, images gc.collect() torch.cuda.empty_cache() return BenchmarkResult( precision=precision, num_samples=num_samples, num_steps=num_steps, topk=topk, offload=offload, load_time=load_time, gen_time=gen_time, decode_time=decode_time, peak_memory_gb=peak_memory ) # ═══════════════════════════════════════════════════════════════════════════════ # OUTPUT FORMATTERS # ═══════════════════════════════════════════════════════════════════════════════ def format_terminal_results(results: List[BenchmarkResult], gpu_name: str) -> str: """Format results for terminal display.""" lines = [] lines.append(""" ╔══════════════════════════════════════════════════════════════════════════════╗ ║ 📊 PARIS MoE BENCHMARK RESULTS 📊 ║ ╚══════════════════════════════════════════════════════════════════════════════╝ """) lines.append(f" GPU: {gpu_name}") lines.append(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") lines.append("") # Group by precision precisions = sorted(set(r.precision for r in results)) for precision in precisions: prec_results = [r for r in results if r.precision == precision] lines.append(f"┌{'─'*78}┐") lines.append(f"│ {precision.upper()} Precision{' '*65}│") lines.append(f"├{'─'*78}┤") lines.append(f"│ {'Samples':>8} │ {'Steps':>6} │ {'TopK':>5} │ {'Offload':>7} │ " f"{'Gen(s)':>8} │ {'Img/s':>6} │ {'s/step':>6} │ {'Mem(GB)':>8} │") lines.append(f"├{'─'*78}┤") for r in prec_results: lines.append( f"│ {r.num_samples:>8} │ {r.num_steps:>6} │ {r.topk:>5} │ {r.offload:>7} │ " f"{r.gen_time:>8.2f} │ {r.throughput:>6.2f} │ {r.time_per_step:>6.3f} │ " f"{r.peak_memory_gb:>8.2f} │" ) lines.append(f"└{'─'*78}┘") lines.append("") # Summary if results: fastest = min(results, key=lambda r: r.time_per_image) most_efficient = min(results, key=lambda r: r.peak_memory_gb) lines.append("┌─────────────────────────────────────────────────────────────────┐") lines.append("│ 📈 SUMMARY │") lines.append("├─────────────────────────────────────────────────────────────────┤") lines.append(f"│ 🏆 Fastest: {fastest.precision.upper():>6} @ {fastest.throughput:.2f} img/s │") lines.append(f"│ 💾 Most Efficient: {most_efficient.precision.upper():>6} @ {most_efficient.peak_memory_gb:.1f} GB peak │") lines.append("└─────────────────────────────────────────────────────────────────┘") return "\n".join(lines) def format_markdown_results(results: List[BenchmarkResult], gpu_name: str) -> str: """Format results as Markdown.""" lines = [] lines.append("# 📊 Paris MoE Benchmark Results") lines.append("") lines.append(f"**GPU:** {gpu_name}") lines.append(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") lines.append("") lines.append("## 🏗️ Model Architecture") lines.append("") lines.append("| Component | Details |") lines.append("|-----------|---------|") lines.append("| Experts | 8× DiT-XL/2 (606M params each) |") lines.append("| Router | DiT-B/2 (129M params) |") lines.append("| Total | ~5 Billion parameters |") lines.append("| VAE | SD-VAE (stabilityai/sd-vae-ft-mse) |") lines.append("| Text Encoder | CLIP ViT-L/14 |") lines.append("") # Group by precision precisions = sorted(set(r.precision for r in results)) for precision in precisions: prec_results = [r for r in results if r.precision == precision] lines.append(f"## {precision.upper()} Precision") lines.append("") lines.append("| Samples | Steps | TopK | Offload | Gen Time (s) | Throughput (img/s) | Time/Step (s) | Peak Memory (GB) |") lines.append("|---------|-------|------|---------|--------------|-------------------|---------------|------------------|") for r in prec_results: lines.append( f"| {r.num_samples} | {r.num_steps} | {r.topk} | {r.offload} | " f"{r.gen_time:.2f} | {r.throughput:.2f} | {r.time_per_step:.3f} | {r.peak_memory_gb:.2f} |" ) lines.append("") # Summary if results: lines.append("## 📈 Summary") lines.append("") fastest = min(results, key=lambda r: r.time_per_image) most_efficient = min(results, key=lambda r: r.peak_memory_gb) lines.append(f"- **🏆 Fastest Configuration:** {fastest.precision.upper()}, " f"{fastest.num_samples} samples @ {fastest.throughput:.2f} img/s") lines.append(f"- **💾 Most Memory Efficient:** {most_efficient.precision.upper()} " f"with offload={most_efficient.offload} @ {most_efficient.peak_memory_gb:.1f} GB peak") lines.append("") # Recommendations lines.append("## 🎯 Recommendations") lines.append("") lines.append("| Use Case | Precision | Offload | Expected Performance |") lines.append("|----------|-----------|---------|---------------------|") bf16_results = [r for r in results if r.precision == 'bf16' and r.offload == 0] if bf16_results: r = bf16_results[0] lines.append(f"| **Production (Quality)** | BF16 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |") int8_results = [r for r in results if r.precision == 'int8' and r.offload == 0] if int8_results: r = int8_results[0] lines.append(f"| **Balanced** | INT8 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |") offload_results = [r for r in results if r.offload > 0] if offload_results: r = min(offload_results, key=lambda x: x.peak_memory_gb) lines.append(f"| **Low VRAM** | {r.precision.upper()} | {r.offload} | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |") lines.append("") lines.append("---") lines.append("*Generated by Paris MoE Benchmark Utility*") return "\n".join(lines) # ═══════════════════════════════════════════════════════════════════════════════ # MAIN # ═══════════════════════════════════════════════════════════════════════════════ def parse_args(): parser = argparse.ArgumentParser( description="📊 Paris MoE - Benchmark Utility", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python benchmark.py # Full benchmark suite python benchmark.py --quick # Quick benchmark python benchmark.py --precision bf16 # BF16 only python benchmark.py --output results.md # Save to file """ ) parser.add_argument("--quick", action="store_true", help="Run quick benchmark with fewer configurations") parser.add_argument("--precision", type=str, default=None, choices=["bf16", "int8", "mixed"], help="Benchmark specific precision only") parser.add_argument("--output", "-o", type=str, default=None, help="Output Markdown file path") parser.add_argument("--samples", type=int, default=None, help="Override number of samples") parser.add_argument("--steps", type=int, default=None, help="Override number of steps") return parser.parse_args() def get_benchmark_configs(args) -> List[Dict]: """Get list of benchmark configurations to run.""" configs = [] if args.quick: # Quick benchmark: minimal configs precisions = [args.precision] if args.precision else ['bf16', 'int8'] samples = args.samples or 4 steps = args.steps or 10 for precision in precisions: configs.append({ 'precision': precision, 'num_samples': samples, 'num_steps': steps, 'topk': 1, 'offload': 0 }) else: # Full benchmark suite precisions = [args.precision] if args.precision else ['bf16', 'int8'] samples_list = [args.samples] if args.samples else [4, 16] steps_list = [args.steps] if args.steps else [20, 30] topk_list = [1, 2] offload_list = [0, 4] for precision in precisions: for samples in samples_list: for steps in steps_list: for topk in topk_list: for offload in offload_list: configs.append({ 'precision': precision, 'num_samples': samples, 'num_steps': steps, 'topk': topk, 'offload': offload }) return configs def main(): args = parse_args() print(""" ╔══════════════════════════════════════════════════════════════════════════════╗ ║ ║ ║ 📊 Paris MoE - Comprehensive Benchmarking Utility 📊 ║ ║ ║ ║ Measuring performance across precision modes, batch sizes, and configs. ║ ║ ║ ╚══════════════════════════════════════════════════════════════════════════════╝ """) device = "cuda" if torch.cuda.is_available() else "cpu" if device != "cuda": print("⚠️ Warning: Running on CPU. Benchmarks will be slow.") gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU" print(f"🖥️ Device: {gpu_name}") configs = get_benchmark_configs(args) print(f"📋 Running {len(configs)} benchmark configurations...\n") results = [] for i, config in enumerate(configs): print(f"[{i+1}/{len(configs)}] {config['precision'].upper()} | " f"{config['num_samples']} samples | {config['num_steps']} steps | " f"Top-{config['topk']} | Offload {config['offload']}") try: result = run_single_benchmark( precision=config['precision'], num_samples=config['num_samples'], num_steps=config['num_steps'], topk=config['topk'], offload=config['offload'], device=device ) results.append(result) print(f" ✅ {result.gen_time:.2f}s, {result.throughput:.2f} img/s, " f"{result.peak_memory_gb:.1f} GB peak") except Exception as e: print(f" ❌ Failed: {e}") print() if not results: print("❌ No successful benchmarks!") return 1 # Print terminal results terminal_output = format_terminal_results(results, gpu_name) print(terminal_output) # Save Markdown if requested if args.output: md_output = format_markdown_results(results, gpu_name) with open(args.output, 'w') as f: f.write(md_output) print(f"\n✅ Results saved to: {args.output}") return 0 if __name__ == "__main__": exit(main())