#!/usr/bin/env python3
"""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ β•‘
β•‘ πŸ“Š Paris MoE - Comprehensive Benchmarking Utility πŸ“Š β•‘
β•‘ β•‘
β•‘ Measures performance across precision modes, batch sizes, and configs. β•‘
β•‘ Outputs results as both terminal display and Markdown file. β•‘
β•‘ β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
Usage:
python benchmark.py # Run all benchmarks
python benchmark.py --quick # Quick benchmark (fewer configs)
python benchmark.py --precision bf16 # Benchmark specific precision
python benchmark.py --output results.md # Save results to file
"""
import argparse
import sys
import time
import gc
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
from typing import List, Dict
SCRIPT_DIR = Path(__file__).parent.absolute()
SRC_DIR = SCRIPT_DIR / "src"
sys.path.insert(0, str(SRC_DIR))
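# The bundled src/ directory provides the `generate` module whose
# load_sampler() is imported lazily inside run_single_benchmark() below.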
import torch
# ═══════════════════════════════════════════════════════════════════════════════
# DATA STRUCTURES
# ═══════════════════════════════════════════════════════════════════════════════
@dataclass
class BenchmarkResult:
"""Single benchmark result."""
precision: str
num_samples: int
num_steps: int
topk: int
offload: int
load_time: float # Model loading time (seconds)
gen_time: float # Generation time (seconds)
decode_time: float # VAE decoding time (seconds)
peak_memory_gb: float # Peak GPU memory usage
@property
def total_time(self) -> float:
return self.gen_time + self.decode_time
@property
def throughput(self) -> float:
"""Images per second (generation only)."""
return self.num_samples / self.gen_time if self.gen_time > 0 else 0
@property
def time_per_step(self) -> float:
"""Seconds per sampling step."""
return self.gen_time / self.num_steps if self.num_steps > 0 else 0
@property
def time_per_image(self) -> float:
"""Seconds per image (generation only)."""
return self.gen_time / self.num_samples if self.num_samples > 0 else 0
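
# Illustrative use of the derived metrics (the numbers below are made up,
# not measured results):
#
#   r = BenchmarkResult(precision='bf16', num_samples=4, num_steps=20, topk=1,
#                       offload=0, load_time=12.0, gen_time=8.0,
#                       decode_time=1.0, peak_memory_gb=18.5)
#   r.throughput      # 4 / 8.0  -> 0.50 img/s
#   r.time_per_step   # 8.0 / 20 -> 0.40 s/step
#   r.time_per_image  # 8.0 / 4  -> 2.00 s/image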
# ═══════════════════════════════════════════════════════════════════════════════
# BENCHMARK RUNNER
# ═══════════════════════════════════════════════════════════════════════════════
def get_gpu_memory_gb() -> float:
"""Get current GPU memory usage in GB."""
if torch.cuda.is_available():
return torch.cuda.max_memory_allocated() / (1024 ** 3)
return 0.0
def reset_gpu_memory():
"""Reset GPU memory tracking."""
if torch.cuda.is_available():
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
gc.collect()
def run_single_benchmark(precision: str, num_samples: int, num_steps: int,
topk: int, offload: int, device: str = 'cuda') -> BenchmarkResult:
"""Run a single benchmark configuration."""
from generate import load_sampler
reset_gpu_memory()
# Load model
start_load = time.time()
sampler = load_sampler(precision=precision, device=device, offload=offload)
load_time = time.time() - start_load
# Set seed for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
torch.cuda.manual_seed(42)
# Warmup run
_ = sampler.sample(
num_samples=1,
text_prompts=["warmup"],
cfg_scale=7.5,
num_steps=2,
use_bf16=(precision == 'bf16'),
topk=topk
)
reset_gpu_memory()
    # Guard: torch.cuda.synchronize() raises on CPU-only runs.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
# Timed generation
start_gen = time.time()
latents = sampler.sample(
num_samples=num_samples,
text_prompts=["a cute cat"],
cfg_scale=7.5,
num_steps=num_steps,
use_bf16=(precision == 'bf16'),
topk=topk
)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
gen_time = time.time() - start_gen
# Timed decoding
start_decode = time.time()
images = sampler.vae_manager.decode(latents)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
decode_time = time.time() - start_decode
peak_memory = get_gpu_memory_gb()
# Cleanup
del sampler, latents, images
gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
return BenchmarkResult(
precision=precision,
num_samples=num_samples,
num_steps=num_steps,
topk=topk,
offload=offload,
load_time=load_time,
gen_time=gen_time,
decode_time=decode_time,
peak_memory_gb=peak_memory
)
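
# Example of a single run (assumes a CUDA device and the repo's
# `generate.load_sampler` API; these are the same defaults --quick uses):
#
#   result = run_single_benchmark('bf16', num_samples=4, num_steps=10,
#                                 topk=1, offload=0)
#   print(f"{result.throughput:.2f} img/s, {result.peak_memory_gb:.1f} GB peak")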
# ═══════════════════════════════════════════════════════════════════════════════
# OUTPUT FORMATTERS
# ═══════════════════════════════════════════════════════════════════════════════
def format_terminal_results(results: List[BenchmarkResult], gpu_name: str) -> str:
"""Format results for terminal display."""
lines = []
lines.append("""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ πŸ“Š PARIS MoE BENCHMARK RESULTS πŸ“Š β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
""")
lines.append(f" GPU: {gpu_name}")
lines.append(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
lines.append("")
# Group by precision
precisions = sorted(set(r.precision for r in results))
for precision in precisions:
prec_results = [r for r in results if r.precision == precision]
lines.append(f"β”Œ{'─'*78}┐")
lines.append(f"β”‚ {precision.upper()} Precision{' '*65}β”‚")
lines.append(f"β”œ{'─'*78}─")
lines.append(f"β”‚ {'Samples':>8} β”‚ {'Steps':>6} β”‚ {'TopK':>5} β”‚ {'Offload':>7} β”‚ "
f"{'Gen(s)':>8} β”‚ {'Img/s':>6} β”‚ {'s/step':>6} β”‚ {'Mem(GB)':>8} β”‚")
lines.append(f"β”œ{'─'*78}─")
for r in prec_results:
lines.append(
f"β”‚ {r.num_samples:>8} β”‚ {r.num_steps:>6} β”‚ {r.topk:>5} β”‚ {r.offload:>7} β”‚ "
f"{r.gen_time:>8.2f} β”‚ {r.throughput:>6.2f} β”‚ {r.time_per_step:>6.3f} β”‚ "
f"{r.peak_memory_gb:>8.2f} β”‚"
)
lines.append(f"β””{'─'*78}β”˜")
lines.append("")
# Summary
if results:
fastest = min(results, key=lambda r: r.time_per_image)
most_efficient = min(results, key=lambda r: r.peak_memory_gb)
lines.append("β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”")
lines.append("β”‚ πŸ“ˆ SUMMARY β”‚")
lines.append("β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€")
lines.append(f"β”‚ πŸ† Fastest: {fastest.precision.upper():>6} @ {fastest.throughput:.2f} img/s β”‚")
lines.append(f"β”‚ πŸ’Ύ Most Efficient: {most_efficient.precision.upper():>6} @ {most_efficient.peak_memory_gb:.1f} GB peak β”‚")
lines.append("β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")
return "\n".join(lines)
def format_markdown_results(results: List[BenchmarkResult], gpu_name: str) -> str:
"""Format results as Markdown."""
lines = []
lines.append("# πŸ“Š Paris MoE Benchmark Results")
lines.append("")
lines.append(f"**GPU:** {gpu_name}")
lines.append(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
lines.append("")
lines.append("## πŸ—οΈ Model Architecture")
lines.append("")
lines.append("| Component | Details |")
lines.append("|-----------|---------|")
lines.append("| Experts | 8Γ— DiT-XL/2 (606M params each) |")
lines.append("| Router | DiT-B/2 (129M params) |")
lines.append("| Total | ~5 Billion parameters |")
lines.append("| VAE | SD-VAE (stabilityai/sd-vae-ft-mse) |")
lines.append("| Text Encoder | CLIP ViT-L/14 |")
lines.append("")
# Group by precision
precisions = sorted(set(r.precision for r in results))
for precision in precisions:
prec_results = [r for r in results if r.precision == precision]
lines.append(f"## {precision.upper()} Precision")
lines.append("")
lines.append("| Samples | Steps | TopK | Offload | Gen Time (s) | Throughput (img/s) | Time/Step (s) | Peak Memory (GB) |")
lines.append("|---------|-------|------|---------|--------------|-------------------|---------------|------------------|")
for r in prec_results:
lines.append(
f"| {r.num_samples} | {r.num_steps} | {r.topk} | {r.offload} | "
f"{r.gen_time:.2f} | {r.throughput:.2f} | {r.time_per_step:.3f} | {r.peak_memory_gb:.2f} |"
)
lines.append("")
# Summary
if results:
lines.append("## πŸ“ˆ Summary")
lines.append("")
fastest = min(results, key=lambda r: r.time_per_image)
most_efficient = min(results, key=lambda r: r.peak_memory_gb)
lines.append(f"- **πŸ† Fastest Configuration:** {fastest.precision.upper()}, "
f"{fastest.num_samples} samples @ {fastest.throughput:.2f} img/s")
lines.append(f"- **πŸ’Ύ Most Memory Efficient:** {most_efficient.precision.upper()} "
f"with offload={most_efficient.offload} @ {most_efficient.peak_memory_gb:.1f} GB peak")
lines.append("")
# Recommendations
lines.append("## 🎯 Recommendations")
lines.append("")
lines.append("| Use Case | Precision | Offload | Expected Performance |")
lines.append("|----------|-----------|---------|---------------------|")
bf16_results = [r for r in results if r.precision == 'bf16' and r.offload == 0]
if bf16_results:
r = bf16_results[0]
lines.append(f"| **Production (Quality)** | BF16 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
int8_results = [r for r in results if r.precision == 'int8' and r.offload == 0]
if int8_results:
r = int8_results[0]
lines.append(f"| **Balanced** | INT8 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
offload_results = [r for r in results if r.offload > 0]
if offload_results:
r = min(offload_results, key=lambda x: x.peak_memory_gb)
lines.append(f"| **Low VRAM** | {r.precision.upper()} | {r.offload} | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")
lines.append("")
lines.append("---")
lines.append("*Generated by Paris MoE Benchmark Utility*")
return "\n".join(lines)
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════════════
def parse_args():
parser = argparse.ArgumentParser(
description="πŸ“Š Paris MoE - Benchmark Utility",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python benchmark.py # Full benchmark suite
python benchmark.py --quick # Quick benchmark
python benchmark.py --precision bf16 # BF16 only
python benchmark.py --output results.md # Save to file
"""
)
parser.add_argument("--quick", action="store_true",
help="Run quick benchmark with fewer configurations")
parser.add_argument("--precision", type=str, default=None,
choices=["bf16", "int8", "mixed"],
help="Benchmark specific precision only")
parser.add_argument("--output", "-o", type=str, default=None,
help="Output Markdown file path")
parser.add_argument("--samples", type=int, default=None,
help="Override number of samples")
parser.add_argument("--steps", type=int, default=None,
help="Override number of steps")
return parser.parse_args()
def get_benchmark_configs(args) -> List[Dict]:
"""Get list of benchmark configurations to run."""
configs = []
if args.quick:
# Quick benchmark: minimal configs
precisions = [args.precision] if args.precision else ['bf16', 'int8']
samples = args.samples or 4
steps = args.steps or 10
for precision in precisions:
configs.append({
'precision': precision,
'num_samples': samples,
'num_steps': steps,
'topk': 1,
'offload': 0
})
else:
# Full benchmark suite
precisions = [args.precision] if args.precision else ['bf16', 'int8']
samples_list = [args.samples] if args.samples else [4, 16]
steps_list = [args.steps] if args.steps else [20, 30]
topk_list = [1, 2]
offload_list = [0, 4]
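        # Full sweep: 2 precisions x 2 sample counts x 2 step counts
        # x 2 top-k values x 2 offload settings = 32 runs (fewer if overridden).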
for precision in precisions:
for samples in samples_list:
for steps in steps_list:
for topk in topk_list:
for offload in offload_list:
configs.append({
'precision': precision,
'num_samples': samples,
'num_steps': steps,
'topk': topk,
'offload': offload
})
return configs
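
# With --quick and no overrides, get_benchmark_configs() yields just:
#   [{'precision': 'bf16', 'num_samples': 4, 'num_steps': 10, 'topk': 1, 'offload': 0},
#    {'precision': 'int8', 'num_samples': 4, 'num_steps': 10, 'topk': 1, 'offload': 0}]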
def main():
args = parse_args()
print("""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ β•‘
β•‘ πŸ“Š Paris MoE - Comprehensive Benchmarking Utility πŸ“Š β•‘
β•‘ β•‘
β•‘ Measuring performance across precision modes, batch sizes, and configs. β•‘
β•‘ β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
""")
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
print("⚠️ Warning: Running on CPU. Benchmarks will be slow.")
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
print(f"πŸ–₯️ Device: {gpu_name}")
configs = get_benchmark_configs(args)
print(f"πŸ“‹ Running {len(configs)} benchmark configurations...\n")
results = []
for i, config in enumerate(configs):
print(f"[{i+1}/{len(configs)}] {config['precision'].upper()} | "
f"{config['num_samples']} samples | {config['num_steps']} steps | "
f"Top-{config['topk']} | Offload {config['offload']}")
try:
result = run_single_benchmark(
precision=config['precision'],
num_samples=config['num_samples'],
num_steps=config['num_steps'],
topk=config['topk'],
offload=config['offload'],
device=device
)
results.append(result)
print(f" βœ… {result.gen_time:.2f}s, {result.throughput:.2f} img/s, "
f"{result.peak_memory_gb:.1f} GB peak")
        except Exception as e:
            print(f"   ❌ Failed: {e}")
            reset_gpu_memory()  # drop any partially loaded state before the next config
print()
if not results:
print("❌ No successful benchmarks!")
return 1
# Print terminal results
terminal_output = format_terminal_results(results, gpu_name)
print(terminal_output)
# Save Markdown if requested
if args.output:
md_output = format_markdown_results(results, gpu_name)
        # Write as UTF-8 explicitly: the report contains emoji, and the platform
        # default encoding (e.g. cp1252 on Windows) would raise on them.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(md_output)
        print(f"\n✅ Results saved to: {args.output}")
return 0
if __name__ == "__main__":
    sys.exit(main())