LightDiffusion-Next / tests /benchmark_performance.py
Aatricks's picture
Deploy ZeroGPU Gradio Space snapshot
b701455
"""Performance benchmark comparing LightDiffusion-Next across SD1.5, SDXL, and Flux2 Klein 4B.
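This script measures:
- Warmup time (an optional, shorter run before the timed run)
- Total generation time (wall-clock time of the full pipeline call; includes VAE decode, model load)
- VRAM usage peak
Sampling time (model inference + denoising loop) and VAE decode time are not yet
measured separately; their BenchmarkResult fields keep the default 0.0.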
Usage:
cd LightDiffusion-Next
python tests/benchmark_performance.py
"""
import gc
import json
import os
import sys
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Optional
import torch
# Put the parent of this file's directory at the front of sys.path so that
# project-local packages (imported later as ``src...``) can be resolved when the
# script is launched directly.
project_root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(project_root))
@dataclass
class BenchmarkResult:
    """Measurements and settings recorded for a single benchmark run.

    The first six fields are required and describe the configuration that was
    benchmarked; the remaining fields carry measurements and extra settings and
    all have defaults.
    """

    # --- Configuration of the run (required) ---
    model_type: str
    model_name: str
    resolution: tuple[int, int]
    steps: int
    sampler: str
    scheduler: str

    # --- Timings, in seconds ---
    warmup_time_s: float = 0.0
    generation_time_s: float = 0.0
    sampling_time_s: float = 0.0
    vae_decode_time_s: float = 0.0

    # --- Memory, in megabytes ---
    peak_vram_mb: float = 0.0

    # --- Additional settings and free-form remarks ---
    batch_size: int = 1
    cfg_scale: float = 7.0
    notes: str = ""
@dataclass
class BenchmarkSuite:
    """Collection of benchmark results together with system information."""

    # Recorded runs, in the order they were added.
    results: list["BenchmarkResult"] = field(default_factory=list)
    # Description of the environment, supplied by the caller.
    system_info: dict = field(default_factory=dict)

    def add(self, result: "BenchmarkResult") -> None:
        """Append *result* to the suite."""
        self.results.append(result)

    def to_json(self, path: str) -> None:
        """Write the system info and all results to *path* as indented JSON.

        Each result is converted with ``dataclasses.asdict``. Missing parent
        directories of *path* are created first, so a long benchmark run is not
        lost to a ``FileNotFoundError`` at the very end. The file is written as
        UTF-8 regardless of the platform's default encoding.
        """
        data = {
            "system_info": self.system_info,
            "results": [asdict(r) for r in self.results],
        }
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f"Results saved to {path}")
def get_system_info() -> dict:
    """Return a dict describing the Python/PyTorch environment.

    Always contains ``python_version``, ``torch_version`` and
    ``cuda_available``. When CUDA is available, ``cuda_version``, ``gpu_name``
    and ``gpu_vram_total_mb`` (for device 0) are added as well.
    """
    has_cuda = torch.cuda.is_available()
    details = {
        "python_version": sys.version,
        "torch_version": torch.__version__,
        "cuda_available": has_cuda,
    }
    if not has_cuda:
        return details

    bytes_per_mb = 1024 ** 2
    details["cuda_version"] = torch.version.cuda
    details["gpu_name"] = torch.cuda.get_device_name(0)
    details["gpu_vram_total_mb"] = (
        torch.cuda.get_device_properties(0).total_memory / bytes_per_mb
    )
    return details
def reset_cuda() -> None:
    """Free unused memory and restart peak-VRAM tracking before a measurement.

    Python garbage collection runs first, so objects that are only kept alive by
    reference cycles are released *before* the CUDA cache is emptied and the
    peak statistics are reset. Collecting afterwards would leave that memory in
    the allocator cache and let it count toward the next peak reading.
    Does nothing GPU-related when CUDA is unavailable.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
def get_peak_vram_mb() -> float:
    """Return the peak allocated CUDA memory in MB, or 0.0 without CUDA."""
    if not torch.cuda.is_available():
        return 0.0
    bytes_per_mb = 1024 ** 2
    return torch.cuda.max_memory_allocated() / bytes_per_mb
def _run_pipeline_benchmark(
    suite: "BenchmarkSuite",
    *,
    label: str,
    model_type: str,
    model_name: str,
    model_path: str,
    required_file: str,
    prompt: str,
    width: int,
    height: int,
    steps: int,
    warmup_steps: int,
    sampler: str,
    scheduler: str,
    warmup: bool,
    cfg_scale: Optional[float] = None,
) -> None:
    """Run one optional warmup pass plus one timed pass and record the result.

    Shared implementation behind the per-model ``benchmark_*`` functions.

    Args:
        suite: Receives the finished ``BenchmarkResult`` through ``suite.add``.
        label: Human-readable name used in console messages.
        model_type: Stored in ``BenchmarkResult.model_type``.
        model_name: Stored in ``BenchmarkResult.model_name``.
        model_path: Value forwarded to the pipeline as ``model_path``.
        required_file: File that must exist on disk; otherwise the benchmark is
            skipped with a message and nothing is recorded.
        prompt: Text prompt forwarded to the pipeline.
        width: Image width forwarded to the pipeline as ``w``.
        height: Image height forwarded to the pipeline as ``h``.
        steps: Step count of the timed run.
        warmup_steps: Step count of the warmup run.
        sampler: Sampler name forwarded to the pipeline.
        scheduler: Scheduler name forwarded to the pipeline.
        warmup: Whether to perform the warmup run before the timed run.
        cfg_scale: When given, forwarded to the pipeline and stored in the
            result; when ``None``, neither receives it and their own defaults
            apply.
    """
    if not Path(required_file).exists():
        print(f" Skipping {label} - model not found: {required_file}")
        return

    # Imported only once the model is known to exist, so a skipped benchmark
    # does not pay for (or fail on) loading the generation pipeline.
    from src.user.pipeline import pipeline

    # Arguments common to the warmup and the timed call; only ``steps`` differs.
    pipeline_kwargs = dict(
        prompt=prompt,
        w=width,
        h=height,
        sampler=sampler,
        scheduler=scheduler,
        model_path=model_path,
        hires_fix=False,
        adetailer=False,
        autohdr=False,
        enable_multiscale=False,
    )
    result_kwargs = {}
    if cfg_scale is not None:
        pipeline_kwargs["cfg_scale"] = cfg_scale
        result_kwargs["cfg_scale"] = cfg_scale

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"{label} Benchmark: {width}x{height}, {steps} steps, {sampler}/{scheduler}")
    print(rule)

    # Warmup run
    warmup_time = 0.0
    if warmup:
        print(" Warmup run...")
        reset_cuda()
        t0 = time.perf_counter()
        pipeline(steps=warmup_steps, **pipeline_kwargs)
        warmup_time = time.perf_counter() - t0
        print(f" Warmup done in {warmup_time:.2f}s")

    # Timed run
    print(" Benchmark run...")
    reset_cuda()
    t0 = time.perf_counter()
    pipeline(steps=steps, **pipeline_kwargs)
    gen_time = time.perf_counter() - t0
    peak_vram = get_peak_vram_mb()

    suite.add(
        BenchmarkResult(
            model_type=model_type,
            model_name=model_name,
            resolution=(width, height),
            steps=steps,
            sampler=sampler,
            scheduler=scheduler,
            warmup_time_s=warmup_time,
            generation_time_s=gen_time,
            peak_vram_mb=peak_vram,
            **result_kwargs,
        )
    )

    # Same zero-time guard as the summary table uses.
    steps_per_s = steps / gen_time if gen_time > 0 else 0
    print(f" Generation time: {gen_time:.2f}s")
    print(f" Steps/second: {steps_per_s:.2f}")
    print(f" Peak VRAM: {peak_vram:.0f} MB")


def benchmark_sd15(suite: "BenchmarkSuite", warmup: bool = True):
    """Benchmark SD1.5 with DreamShaper 8.

    Runs 512x512, 20 steps, euler/normal (warmup uses 5 steps) and adds the
    result to *suite*. Skipped when the checkpoint file is missing.
    """
    model_path = "./include/checkpoints/DreamShaper_8_pruned.safetensors"
    _run_pipeline_benchmark(
        suite,
        label="SD1.5",
        model_type="SD1.5",
        model_name="DreamShaper_8",
        model_path=model_path,
        required_file=model_path,
        prompt="a beautiful sunset over a mountain landscape, high quality photograph",
        width=512,
        height=512,
        steps=20,
        warmup_steps=5,
        sampler="euler",
        scheduler="normal",
        warmup=warmup,
    )


def benchmark_sdxl(suite: "BenchmarkSuite", warmup: bool = True):
    """Benchmark SDXL with Juggernaut-XL v9.

    Runs 1024x1024, 20 steps, euler/ays (warmup uses 5 steps) and adds the
    result to *suite*. Skipped when the checkpoint file is missing.
    """
    model_path = "./include/checkpoints/Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors"
    _run_pipeline_benchmark(
        suite,
        label="SDXL",
        model_type="SDXL",
        model_name="Juggernaut-XL_v9",
        model_path=model_path,
        required_file=model_path,
        prompt="a beautiful sunset over a mountain landscape, high quality photograph",
        width=1024,
        height=1024,
        steps=20,
        warmup_steps=5,
        sampler="euler",
        scheduler="ays",  # AYS is commonly used for SDXL
        warmup=warmup,
    )


def benchmark_flux2_klein(suite: "BenchmarkSuite", warmup: bool = True):
    """Benchmark Flux2 Klein 4B.

    Runs 1024x1024, 4 steps, euler/simple with CFG 1.0 (warmup uses 2 steps)
    and adds the result to *suite*. Skipped when the diffusion model file is
    missing.
    """
    _run_pipeline_benchmark(
        suite,
        label="Flux2 Klein",
        model_type="Flux2",
        model_name="flux-2-klein-4b",
        model_path="__FLUX2_KLEIN__",  # Special marker for Flux2 Klein
        required_file="./include/diffusion_model/flux-2-klein-4b.safetensors",
        prompt="a beautiful sunset over a mountain landscape",
        width=1024,
        height=1024,
        steps=4,  # Flux2 Klein is distilled, uses fewer steps
        warmup_steps=2,
        sampler="euler",
        scheduler="simple",
        warmup=warmup,
        cfg_scale=1.0,  # Distilled models use CFG=1
    )
def main():
    """Run all benchmarks, print a summary table, and save the results as JSON."""
    rule = "=" * 60
    print(rule)
    print("LightDiffusion-Next Performance Benchmark")
    print(rule)

    # Make sure the directories used below exist ("./tests" receives the JSON
    # report; "./output" is presumably written to by the pipeline).
    for directory in ("./output", "./tests"):
        os.makedirs(directory, exist_ok=True)

    suite = BenchmarkSuite(system_info=get_system_info())
    print("\nSystem Info:")
    for key, value in suite.system_info.items():
        print(f" {key}: {value}")

    # Each benchmark appends to the suite, or skips itself if its model is missing.
    for run_benchmark in (benchmark_sd15, benchmark_sdxl, benchmark_flux2_klein):
        run_benchmark(suite, warmup=True)

    # Summary table
    print(f"\n{rule}")
    print("BENCHMARK SUMMARY")
    print(rule)
    print(f"{'Model':<20} {'Resolution':<12} {'Steps':<6} {'Time (s)':<10} {'Steps/s':<10} {'VRAM (MB)':<10}")
    print("-" * 80)
    for entry in suite.results:
        elapsed = entry.generation_time_s
        if elapsed > 0:
            steps_per_s = entry.steps / elapsed
        else:
            steps_per_s = 0
        resolution = f"{entry.resolution[0]}x{entry.resolution[1]}"
        print(f"{entry.model_type:<20} {resolution:<12} {entry.steps:<6} {elapsed:<10.2f} {steps_per_s:<10.2f} {entry.peak_vram_mb:<10.0f}")

    # Persist the results
    suite.to_json("./tests/benchmark_results.json")
    print("\nBenchmark complete!")


if __name__ == "__main__":
    main()