#!/usr/bin/env python3
"""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ β•‘
β•‘ πŸ“Š Paris MoE - Comprehensive Benchmarking Utility πŸ“Š β•‘
β•‘ β•‘
β•‘ Measures performance across precision modes, batch sizes, and configs. β•‘
β•‘ Outputs results as both terminal display and Markdown file. β•‘
β•‘ β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
Usage:
python benchmark.py # Run all benchmarks
python benchmark.py --quick # Quick benchmark (fewer configs)
python benchmark.py --precision bf16 # Benchmark specific precision
python benchmark.py --output results.md # Save results to file
"""
import argparse
import sys
import time
import gc
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
from typing import List, Dict
SCRIPT_DIR = Path(__file__).parent.absolute()
SRC_DIR = SCRIPT_DIR / "src"
sys.path.insert(0, str(SRC_DIR))
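# The bundled src/ directory provides the `generate` module whose
# load_sampler() is imported lazily inside run_single_benchmark() below.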
import torch
# ═══════════════════════════════════════════════════════════════════════════════
# DATA STRUCTURES
# ═══════════════════════════════════════════════════════════════════════════════
@dataclass
class BenchmarkResult:
"""Single benchmark result."""
precision: str
num_samples: int
num_steps: int
topk: int
offload: int
load_time: float # Model loading time (seconds)
gen_time: float # Generation time (seconds)
decode_time: float # VAE decoding time (seconds)
peak_memory_gb: float # Peak GPU memory usage
@property
def total_time(self) -> float:
return self.gen_time + self.decode_time
@property
def throughput(self) -> float:
"""Images per second (generation only)."""
return self.num_samples / self.gen_time if self.gen_time > 0 else 0
@property
def time_per_step(self) -> float:
"""Seconds per sampling step."""
return self.gen_time / self.num_steps if self.num_steps > 0 else 0
@property
def time_per_image(self) -> float:
"""Seconds per image (generation only)."""
return self.gen_time / self.num_samples if self.num_samples > 0 else 0
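
# Illustrative use of the derived metrics (the numbers below are made up,
# not measured results):
#
#   r = BenchmarkResult(precision='bf16', num_samples=4, num_steps=20, topk=1,
#                       offload=0, load_time=12.0, gen_time=8.0,
#                       decode_time=1.0, peak_memory_gb=18.5)
#   r.throughput      # 4 / 8.0  -> 0.50 img/s
#   r.time_per_step   # 8.0 / 20 -> 0.40 s/step
#   r.time_per_image  # 8.0 / 4  -> 2.00 s/image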
# ═══════════════════════════════════════════════════════════════════════════════
# BENCHMARK RUNNER
# ═══════════════════════════════════════════════════════════════════════════════
def get_gpu_memory_gb() -> float:
"""Get current GPU memory usage in GB."""
if torch.cuda.is_available():
return torch.cuda.max_memory_allocated() / (1024 ** 3)
return 0.0
def reset_gpu_memory():
"""Reset GPU memory tracking."""
if torch.cuda.is_available():
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
gc.collect()
def run_single_benchmark(precision: str, num_samples: int, num_steps: int,
topk: int, offload: int, device: str = 'cuda') -> BenchmarkResult:
"""Run a single benchmark configuration."""
from generate import load_sampler
reset_gpu_memory()
# Load model
start_load = time.time()
sampler = load_sampler(precision=precision, device=device, offload=offload)
load_time = time.time() - start_load
# Set seed for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
torch.cuda.manual_seed(42)
# Warmup run
_ = sampler.sample(
num_samples=1,
text_prompts=["warmup"],
cfg_scale=7.5,
num_steps=2,
use_bf16=(precision == 'bf16'),
topk=topk
)
reset_gpu_memory()
    # Guard: torch.cuda.synchronize() raises on CPU-only runs.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
# Timed generation
start_gen = time.time()
latents = sampler.sample(
num_samples=num_samples,
text_prompts=["a cute cat"],
cfg_scale=7.5,
num_steps=num_steps,
use_bf16=(precision == 'bf16'),
topk=topk
)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
gen_time = time.time() - start_gen
# Timed decoding
start_decode = time.time()
images = sampler.vae_manager.decode(latents)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
decode_time = time.time() - start_decode
peak_memory = get_gpu_memory_gb()
# Cleanup
del sampler, latents, images
gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
return BenchmarkResult(
precision=precision,
num_samples=num_samples,
num_steps=num_steps,
topk=topk,
offload=offload,
load_time=load_time,
gen_time=gen_time,
decode_time=decode_time,
peak_memory_gb=peak_memory
)
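
# Example of a single run (assumes a CUDA device and the repo's
# `generate.load_sampler` API; these are the same defaults --quick uses):
#
#   result = run_single_benchmark('bf16', num_samples=4, num_steps=10,
#                                 topk=1, offload=0)
#   print(f"{result.throughput:.2f} img/s, {result.peak_memory_gb:.1f} GB peak")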
# ═══════════════════════════════════════════════════════════════════════════════
# OUTPUT FORMATTERS
# ═══════════════════════════════════════════════════════════════════════════════
def format_terminal_results(results: List[BenchmarkResult], gpu_name: str) -> str:
"""Format results for terminal display."""
lines = []
lines.append("""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ πŸ“Š PARIS MoE BENCHMARK RESULTS πŸ“Š β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
""")
lines.append(f" GPU: {gpu_name}")
lines.append(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
lines.append("")
# Group by precision
precisions = sorted(set(r.precision for r in results))
for precision in precisions:
prec_results = [r for r in results if r.precision == precision]
lines.append(f"β”Œ{'─'*78}┐")
lines.append(f"β”‚ {precision.upper()} Precision{' '*65}β”‚")
lines.append(f"β”œ{'─'*78}─")
lines.append(f"β”‚ {'Samples':>8} β”‚ {'Steps':>6} β”‚ {'TopK':>5} β”‚ {'Offload':>7} β”‚ "
f"{'Gen(s)':>8} β”‚ {'Img/s':>6} β”‚ {'s/step':>6} β”‚ {'Mem(GB)':>8} β”‚")
lines.append(f"β”œ{'─'*78}─")
for r in prec_results:
lines.append(
f"β”‚ {r.num_samples:>8} β”‚ {r.num_steps:>6} β”‚ {r.topk:>5} β”‚ {r.offload:>7} β”‚ "
f"{r.gen_time:>8.2f} β”‚ {r.throughput:>6.2f} β”‚ {r.time_per_step:>6.3f} β”‚ "
f"{r.peak_memory_gb:>8.2f} β”‚"
)
lines.append(f"β””{'─'*78}β”˜")
lines.append("")
# Summary
if results:
fastest = min(results, key=lambda r: r.time_per_image)
most_efficient = min(results, key=lambda r: r.peak_memory_gb)
lines.append("β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”")
lines.append("β”‚ πŸ“ˆ SUMMARY β”‚")
lines.append("β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€")
lines.append(f"β”‚ πŸ† Fastest: {fastest.precision.upper():>6} @ {fastest.throughput:.2f} img/s β”‚")
lines.append(f"β”‚ πŸ’Ύ Most Efficient: {most_efficient.precision.upper():>6} @ {most_efficient.peak_memory_gb:.1f} GB peak β”‚")
lines.append("β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")
return "\n".join(lines)
def format_markdown_results(results: List[BenchmarkResult], gpu_name: str) -> str:
"""Format results as Markdown."""
lines = []
lines.append("# πŸ“Š Paris MoE Benchmark Results")
lines.append("")
lines.append(f"**GPU:** {gpu_name}")
lines.append(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
lines.append("")
lines.append("## πŸ—οΈ Model Architecture")
lines.append("")
lines.append("| Component | Details |")
lines.append("|-----------|---------|")
lines.append("| Experts | 8Γ— DiT-XL/2 (606M params each) |")
lines.append("| Router | DiT-B/2 (129M params) |")
lines.append("| Total | ~5 Billion parameters |")
lines.append("| VAE | SD-VAE (stabilityai/sd-vae-ft-mse) |")
lines.append("| Text Encoder | CLIP ViT-L/14 |")
lines.append("")
# Group by precision
precisions = sorted(set(r.precision for r in results))
for precision in precisions:
prec_results = [r for r in results if r.precision == precision]
lines.append(f"## {precision.upper()} Precision")
lines.append("")
lines.append("| Samples | Steps | TopK | Offload | Gen Time (s) | Throughput (img/s) | Time/Step (s) | Peak Memory (GB) |")
lines.append("|---------|-------|------|---------|--------------|-------------------|---------------|------------------|")
for r in prec_results:
lines.append(
f"| {r.num_samples} | {r.num_steps} | {r.topk} | {r.offload} | "
f"{r.gen_time:.2f} | {r.throughput:.2f} | {r.time_per_step:.3f} | {r.peak_memory_gb:.2f} |"
)
lines.append("")
# Summary
if results:
lines.append("## πŸ“ˆ Summary")
lines.append("")
fastest = min(results, key=lambda r: r.time_per_image)
most_efficient = min(results, key=lambda r: r.peak_memory_gb)
lines.append(f"- **πŸ† Fastest Configuration:** {fastest.precision.upper()}, "
f"{fastest.num_samples} samples @ {fastest.throughput:.2f} img/s")
lines.append(f"- **πŸ’Ύ Most Memory Efficient:** {most_efficient.precision.upper()} "
f"with offload={most_efficient.offload} @ {most_efficient.peak_memory_gb:.1f} GB peak")
lines.append("")
# Recommendations
lines.append("## 🎯 Recommendations")
lines.append("")
lines.append("| Use Case | Precision | Offload | Expected Performance |")
lines.append("|----------|-----------|---------|---------------------|")
bf16_results = [r for r in results if r.precision == 'bf16' and r.offload == 0]
if bf16_results:
r = bf16_results[0]
lines.append(f"| **Production (Quality)** | BF16 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
int8_results = [r for r in results if r.precision == 'int8' and r.offload == 0]
if int8_results:
r = int8_results[0]
lines.append(f"| **Balanced** | INT8 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
offload_results = [r for r in results if r.offload > 0]
if offload_results:
r = min(offload_results, key=lambda x: x.peak_memory_gb)
lines.append(f"| **Low VRAM** | {r.precision.upper()} | {r.offload} | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
lines.append("")
lines.append("---")
lines.append("*Generated by Paris MoE Benchmark Utility*")
return "\n".join(lines)
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════════════
def parse_args():
parser = argparse.ArgumentParser(
description="πŸ“Š Paris MoE - Benchmark Utility",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python benchmark.py # Full benchmark suite
python benchmark.py --quick # Quick benchmark
python benchmark.py --precision bf16 # BF16 only
python benchmark.py --output results.md # Save to file
"""
)
parser.add_argument("--quick", action="store_true",
help="Run quick benchmark with fewer configurations")
parser.add_argument("--precision", type=str, default=None,
choices=["bf16", "int8", "mixed"],
help="Benchmark specific precision only")
parser.add_argument("--output", "-o", type=str, default=None,
help="Output Markdown file path")
parser.add_argument("--samples", type=int, default=None,
help="Override number of samples")
parser.add_argument("--steps", type=int, default=None,
help="Override number of steps")
return parser.parse_args()
def get_benchmark_configs(args) -> List[Dict]:
"""Get list of benchmark configurations to run."""
configs = []
if args.quick:
# Quick benchmark: minimal configs
precisions = [args.precision] if args.precision else ['bf16', 'int8']
samples = args.samples or 4
steps = args.steps or 10
for precision in precisions:
configs.append({
'precision': precision,
'num_samples': samples,
'num_steps': steps,
'topk': 1,
'offload': 0
})
else:
# Full benchmark suite
precisions = [args.precision] if args.precision else ['bf16', 'int8']
samples_list = [args.samples] if args.samples else [4, 16]
steps_list = [args.steps] if args.steps else [20, 30]
topk_list = [1, 2]
offload_list = [0, 4]
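        # Full sweep: 2 precisions x 2 sample counts x 2 step counts
        # x 2 top-k values x 2 offload settings = 32 runs (fewer if overridden).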
for precision in precisions:
for samples in samples_list:
for steps in steps_list:
for topk in topk_list:
for offload in offload_list:
configs.append({
'precision': precision,
'num_samples': samples,
'num_steps': steps,
'topk': topk,
'offload': offload
})
return configs
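
# With --quick and no overrides, get_benchmark_configs() yields just:
#   [{'precision': 'bf16', 'num_samples': 4, 'num_steps': 10, 'topk': 1, 'offload': 0},
#    {'precision': 'int8', 'num_samples': 4, 'num_steps': 10, 'topk': 1, 'offload': 0}]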
def main():
args = parse_args()
print("""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ β•‘
β•‘ πŸ“Š Paris MoE - Comprehensive Benchmarking Utility πŸ“Š β•‘
β•‘ β•‘
β•‘ Measuring performance across precision modes, batch sizes, and configs. β•‘
β•‘ β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
""")
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
print("⚠️ Warning: Running on CPU. Benchmarks will be slow.")
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
print(f"πŸ–₯️ Device: {gpu_name}")
configs = get_benchmark_configs(args)
print(f"πŸ“‹ Running {len(configs)} benchmark configurations...\n")
results = []
for i, config in enumerate(configs):
print(f"[{i+1}/{len(configs)}] {config['precision'].upper()} | "
f"{config['num_samples']} samples | {config['num_steps']} steps | "
f"Top-{config['topk']} | Offload {config['offload']}")
try:
result = run_single_benchmark(
precision=config['precision'],
num_samples=config['num_samples'],
num_steps=config['num_steps'],
topk=config['topk'],
offload=config['offload'],
device=device
)
results.append(result)
print(f" βœ… {result.gen_time:.2f}s, {result.throughput:.2f} img/s, "
f"{result.peak_memory_gb:.1f} GB peak")
        except Exception as e:
            print(f"   ❌ Failed: {e}")
            reset_gpu_memory()  # drop any partially loaded state before the next config
print()
if not results:
print("❌ No successful benchmarks!")
return 1
# Print terminal results
terminal_output = format_terminal_results(results, gpu_name)
print(terminal_output)
# Save Markdown if requested
if args.output:
md_output = format_markdown_results(results, gpu_name)
        # Write as UTF-8 explicitly: the report contains emoji, and the platform
        # default encoding (e.g. cp1252 on Windows) would raise on them.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(md_output)
        print(f"\n✅ Results saved to: {args.output}")
return 0
if __name__ == "__main__":
    sys.exit(main())