"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                                                                              ║
║                Paris MoE - Comprehensive Benchmarking Utility                ║
║                                                                              ║
║    Measures performance across precision modes, batch sizes, and configs.   ║
║    Outputs results as both a terminal display and a Markdown file.          ║
║                                                                              ║
╚══════════════════════════════════════════════════════════════════════════════╝

Usage:
    python benchmark.py                       # Run all benchmarks
    python benchmark.py --quick               # Quick benchmark (fewer configs)
    python benchmark.py --precision bf16      # Benchmark specific precision
    python benchmark.py --output results.md   # Save results to file
"""

import argparse
import gc
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List

# Make the local src/ package importable regardless of the working directory.
SCRIPT_DIR = Path(__file__).parent.absolute()
SRC_DIR = SCRIPT_DIR / "src"
sys.path.insert(0, str(SRC_DIR))

import torch


@dataclass
class BenchmarkResult:
    """Single benchmark result."""
    # Configuration
    precision: str
    num_samples: int
    num_steps: int
    topk: int
    offload: int

    # Timings (seconds)
    load_time: float
    gen_time: float
    decode_time: float

    # Peak GPU memory (GB)
    peak_memory_gb: float

    @property
    def total_time(self) -> float:
        """Generation plus decode time in seconds."""
        return self.gen_time + self.decode_time

    @property
    def throughput(self) -> float:
        """Images per second (generation only)."""
        return self.num_samples / self.gen_time if self.gen_time > 0 else 0.0

    @property
    def time_per_step(self) -> float:
        """Seconds per sampling step."""
        return self.gen_time / self.num_steps if self.num_steps > 0 else 0.0

    @property
    def time_per_image(self) -> float:
        """Seconds per image (generation only)."""
        return self.gen_time / self.num_samples if self.num_samples > 0 else 0.0
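

# Worked example of the derived metrics (hypothetical numbers): a run of 4
# images over 20 steps with gen_time=8.0 s and decode_time=0.5 s yields
# throughput = 4 / 8.0 = 0.50 img/s, time_per_step = 8.0 / 20 = 0.40 s,
# time_per_image = 8.0 / 4 = 2.0 s, and total_time = 8.5 s.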


def get_gpu_memory_gb() -> float:
    """Peak GPU memory allocated since the last reset, in GB."""
    if torch.cuda.is_available():
        return torch.cuda.max_memory_allocated() / (1024 ** 3)
    return 0.0


def reset_gpu_memory():
    """Free cached memory and reset peak-memory tracking."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
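

# Note: torch.cuda.max_memory_allocated() reports the high-water mark of
# memory occupied by tensors since the last reset_peak_memory_stats() call;
# it is lower than what the caching allocator actually reserves from the
# driver (see torch.cuda.max_memory_reserved()).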


def run_single_benchmark(precision: str, num_samples: int, num_steps: int,
                         topk: int, offload: int, device: str = 'cuda') -> BenchmarkResult:
    """Run a single benchmark configuration."""
    from generate import load_sampler  # local module from src/ (added to sys.path above)

    reset_gpu_memory()

    # Model load
    start_load = time.time()
    sampler = load_sampler(precision=precision, device=device, offload=offload)
    load_time = time.time() - start_load

    # Fixed seeds for reproducible runs
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)

    # Warmup pass (excluded from timing)
    _ = sampler.sample(
        num_samples=1,
        text_prompts=["warmup"],
        cfg_scale=7.5,
        num_steps=2,
        use_bf16=(precision == 'bf16'),
        topk=topk
    )

    reset_gpu_memory()
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    # Timed generation
    start_gen = time.time()
    latents = sampler.sample(
        num_samples=num_samples,
        text_prompts=["a cute cat"],
        cfg_scale=7.5,
        num_steps=num_steps,
        use_bf16=(precision == 'bf16'),
        topk=topk
    )
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    gen_time = time.time() - start_gen

    # Timed VAE decode
    start_decode = time.time()
    images = sampler.vae_manager.decode(latents)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    decode_time = time.time() - start_decode

    peak_memory = get_gpu_memory_gb()

    # Cleanup so the next configuration starts from a clean slate
    del sampler, latents, images
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return BenchmarkResult(
        precision=precision,
        num_samples=num_samples,
        num_steps=num_steps,
        topk=topk,
        offload=offload,
        load_time=load_time,
        gen_time=gen_time,
        decode_time=decode_time,
        peak_memory_gb=peak_memory
    )
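

# A minimal standalone use (a sketch; assumes a CUDA GPU and that the
# checkpoints generate.load_sampler expects are available locally):
#
#     result = run_single_benchmark(precision="bf16", num_samples=4,
#                                   num_steps=20, topk=1, offload=0)
#     print(f"{result.throughput:.2f} img/s, {result.peak_memory_gb:.1f} GB peak")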


def format_terminal_results(results: List[BenchmarkResult], gpu_name: str) -> str:
    """Format results for terminal display."""
    lines = []

    lines.append("""
╔══════════════════════════════════════════════════════════════════════════════╗
║                         PARIS MoE BENCHMARK RESULTS                          ║
╚══════════════════════════════════════════════════════════════════════════════╝
""")

    lines.append(f"  GPU:  {gpu_name}")
    lines.append(f"  Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")

    precisions = sorted(set(r.precision for r in results))

    for precision in precisions:
        prec_results = [r for r in results if r.precision == precision]

        lines.append(f"┌{'─'*77}┐")
        lines.append("│ " + f"{precision.upper()} Precision".ljust(76) + "│")
        lines.append(f"├{'─'*77}┤")
        lines.append(f"│ {'Samples':>8} │ {'Steps':>6} │ {'TopK':>5} │ {'Offload':>7} │ "
                     f"{'Gen(s)':>8} │ {'Img/s':>6} │ {'s/step':>6} │ {'Mem(GB)':>8} │")
        lines.append(f"├{'─'*77}┤")

        for r in prec_results:
            lines.append(
                f"│ {r.num_samples:>8} │ {r.num_steps:>6} │ {r.topk:>5} │ {r.offload:>7} │ "
                f"{r.gen_time:>8.2f} │ {r.throughput:>6.2f} │ {r.time_per_step:>6.3f} │ "
                f"{r.peak_memory_gb:>8.2f} │"
            )

        lines.append(f"└{'─'*77}┘")
        lines.append("")

    if results:
        fastest = min(results, key=lambda r: r.time_per_image)
        most_efficient = min(results, key=lambda r: r.peak_memory_gb)

        lines.append(f"┌{'─'*65}┐")
        lines.append("│ " + "SUMMARY".ljust(64) + "│")
        lines.append(f"├{'─'*65}┤")
        lines.append("│ " + f"Fastest:        {fastest.precision.upper():>6} @ {fastest.throughput:.2f} img/s".ljust(64) + "│")
        lines.append("│ " + f"Most Efficient: {most_efficient.precision.upper():>6} @ {most_efficient.peak_memory_gb:.1f} GB peak".ljust(64) + "│")
        lines.append(f"└{'─'*65}┘")

    return "\n".join(lines)


def format_markdown_results(results: List[BenchmarkResult], gpu_name: str) -> str:
    """Format results as Markdown."""
    lines = []

    lines.append("# Paris MoE Benchmark Results")
    lines.append("")
    lines.append(f"**GPU:** {gpu_name}  ")  # trailing spaces force a Markdown line break
    lines.append(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")

    lines.append("## Model Architecture")
    lines.append("")
    lines.append("| Component | Details |")
    lines.append("|-----------|---------|")
    lines.append("| Experts | 8× DiT-XL/2 (606M params each) |")
    lines.append("| Router | DiT-B/2 (129M params) |")
    lines.append("| Total | ~5 billion parameters |")
    lines.append("| VAE | SD-VAE (stabilityai/sd-vae-ft-mse) |")
    lines.append("| Text Encoder | CLIP ViT-L/14 |")
    lines.append("")

    precisions = sorted(set(r.precision for r in results))

    for precision in precisions:
        prec_results = [r for r in results if r.precision == precision]

        lines.append(f"## {precision.upper()} Precision")
        lines.append("")
        lines.append("| Samples | Steps | TopK | Offload | Gen Time (s) | Throughput (img/s) | Time/Step (s) | Peak Memory (GB) |")
        lines.append("|---------|-------|------|---------|--------------|--------------------|---------------|------------------|")

        for r in prec_results:
            lines.append(
                f"| {r.num_samples} | {r.num_steps} | {r.topk} | {r.offload} | "
                f"{r.gen_time:.2f} | {r.throughput:.2f} | {r.time_per_step:.3f} | {r.peak_memory_gb:.2f} |"
            )

        lines.append("")

    if results:
        lines.append("## Summary")
        lines.append("")

        fastest = min(results, key=lambda r: r.time_per_image)
        most_efficient = min(results, key=lambda r: r.peak_memory_gb)

        lines.append(f"- **Fastest Configuration:** {fastest.precision.upper()}, "
                     f"{fastest.num_samples} samples @ {fastest.throughput:.2f} img/s")
        lines.append(f"- **Most Memory Efficient:** {most_efficient.precision.upper()} "
                     f"with offload={most_efficient.offload} @ {most_efficient.peak_memory_gb:.1f} GB peak")
        lines.append("")

    lines.append("## Recommendations")
    lines.append("")
    lines.append("| Use Case | Precision | Offload | Expected Performance |")
    lines.append("|----------|-----------|---------|----------------------|")

    # For each recommended setting, report its best-throughput run.
    bf16_results = [r for r in results if r.precision == 'bf16' and r.offload == 0]
    if bf16_results:
        r = max(bf16_results, key=lambda x: x.throughput)
        lines.append(f"| **Production (Quality)** | BF16 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")

    int8_results = [r for r in results if r.precision == 'int8' and r.offload == 0]
    if int8_results:
        r = max(int8_results, key=lambda x: x.throughput)
        lines.append(f"| **Balanced** | INT8 | 0 | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")

    offload_results = [r for r in results if r.offload > 0]
    if offload_results:
        r = min(offload_results, key=lambda x: x.peak_memory_gb)
        lines.append(f"| **Low VRAM** | {r.precision.upper()} | {r.offload} | {r.throughput:.2f} img/s, {r.peak_memory_gb:.1f} GB |")

    lines.append("")
    lines.append("---")
    lines.append("*Generated by the Paris MoE Benchmark Utility*")

    return "\n".join(lines)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paris MoE - Benchmark Utility",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python benchmark.py                       # Full benchmark suite
  python benchmark.py --quick               # Quick benchmark
  python benchmark.py --precision bf16      # BF16 only
  python benchmark.py --output results.md   # Save to file
"""
    )

    parser.add_argument("--quick", action="store_true",
                        help="Run quick benchmark with fewer configurations")
    parser.add_argument("--precision", type=str, default=None,
                        choices=["bf16", "int8", "mixed"],
                        help="Benchmark specific precision only")
    parser.add_argument("--output", "-o", type=str, default=None,
                        help="Output Markdown file path")
    parser.add_argument("--samples", type=int, default=None,
                        help="Override number of samples")
    parser.add_argument("--steps", type=int, default=None,
                        help="Override number of steps")

    return parser.parse_args()


def get_benchmark_configs(args) -> List[Dict]:
    """Get the list of benchmark configurations to run."""
    configs = []

    if args.quick:
        # Quick mode: a single configuration per precision.
        precisions = [args.precision] if args.precision else ['bf16', 'int8']
        samples = args.samples or 4
        steps = args.steps or 10

        for precision in precisions:
            configs.append({
                'precision': precision,
                'num_samples': samples,
                'num_steps': steps,
                'topk': 1,
                'offload': 0
            })
    else:
        # Full mode: sweep the cross product of all settings.
        precisions = [args.precision] if args.precision else ['bf16', 'int8']
        samples_list = [args.samples] if args.samples else [4, 16]
        steps_list = [args.steps] if args.steps else [20, 30]
        topk_list = [1, 2]
        offload_list = [0, 4]

        for precision in precisions:
            for samples in samples_list:
                for steps in steps_list:
                    for topk in topk_list:
                        for offload in offload_list:
                            configs.append({
                                'precision': precision,
                                'num_samples': samples,
                                'num_steps': steps,
                                'topk': topk,
                                'offload': offload
                            })

    return configs
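

# With no CLI overrides, the full sweep above is 2 precisions x 2 sample
# counts x 2 step counts x 2 top-k values x 2 offload settings
# = 32 configurations; --quick collapses this to one run per precision.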


def main():
    args = parse_args()

    print("""
╔══════════════════════════════════════════════════════════════════════════════╗
║                                                                              ║
║                Paris MoE - Comprehensive Benchmarking Utility                ║
║                                                                              ║
║   Measuring performance across precision modes, batch sizes, and configs.   ║
║                                                                              ║
╚══════════════════════════════════════════════════════════════════════════════╝
""")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device != "cuda":
        print("⚠️ Warning: Running on CPU. Benchmarks will be slow.")

    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
    print(f"🖥️ Device: {gpu_name}")

    configs = get_benchmark_configs(args)
    print(f"Running {len(configs)} benchmark configurations...\n")

    results = []

    for i, config in enumerate(configs):
        print(f"[{i+1}/{len(configs)}] {config['precision'].upper()} | "
              f"{config['num_samples']} samples | {config['num_steps']} steps | "
              f"Top-{config['topk']} | Offload {config['offload']}")

        try:
            result = run_single_benchmark(
                precision=config['precision'],
                num_samples=config['num_samples'],
                num_steps=config['num_steps'],
                topk=config['topk'],
                offload=config['offload'],
                device=device
            )
            results.append(result)
            print(f"   ✅ {result.gen_time:.2f}s, {result.throughput:.2f} img/s, "
                  f"{result.peak_memory_gb:.1f} GB peak")
        except Exception as e:
            print(f"   ❌ Failed: {e}")

        print()

    if not results:
        print("❌ No successful benchmarks!")
        return 1

    terminal_output = format_terminal_results(results, gpu_name)
    print(terminal_output)

    if args.output:
        md_output = format_markdown_results(results, gpu_name)
        with open(args.output, 'w') as f:
            f.write(md_output)
        print(f"\n✅ Results saved to: {args.output}")

    return 0


if __name__ == "__main__":
    sys.exit(main())