Spaces:

Aatricks
/

LightDiffusion-Next

Running on Zero

App Files Files Community

LightDiffusion-Next / tests /benchmark_performance.py

Aatricks

Deploy ZeroGPU Gradio Space snapshot

b701455 22 days ago

raw

history blame contribute delete

10.4 kB

	"""Performance benchmark comparing LightDiffusion-Next across SD1.5, SDXL, and Flux2 Klein 4B.

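	This script measures, for each model whose weights are present:
	- Warmup time (wall-clock time of a shorter run executed before the timed one)
	- Total generation time (wall-clock time of the whole pipeline call; includes
	  VAE decode and any model loading it performs)
	- VRAM usage peak (peak CUDA memory allocated by PyTorch during the timed run)

	Sampling time and VAE decode time have fields in the results, but this script
	does not measure them separately.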

	Usage:
	cd LightDiffusion-Next
	python tests/benchmark_performance.py
	"""

	import gc
	import json
	import os
	import sys
	import time
	from dataclasses import asdict, dataclass, field
	from pathlib import Path
	from typing import Optional

	import torch

	# Add project root to path: the root is taken to be the directory two levels
	# above this file, and it is put first on sys.path so the function-level
	# `from src.user.pipeline import pipeline` imports below can be resolved.
	project_root = Path(__file__).resolve().parent.parent
	sys.path.insert(0, str(project_root))


	@dataclass
	class BenchmarkResult:
	    """Settings and measurements recorded for one benchmarked generation."""

	    # --- What was run (required) ---
	    model_type: str  # model family label, e.g. "SD1.5", "SDXL" or "Flux2"
	    model_name: str  # short name of the checkpoint that was benchmarked
	    resolution: tuple[int, int]  # (width, height) requested from the pipeline
	    steps: int  # number of steps requested for the timed run
	    sampler: str
	    scheduler: str

	    # --- Measurements (seconds / MB); 0.0 means "not recorded" ---
	    warmup_time_s: float = 0.0  # wall-clock time of the warm-up run
	    generation_time_s: float = 0.0  # wall-clock time of the timed run
	    sampling_time_s: float = 0.0  # not filled in by the benchmarks in this file
	    vae_decode_time_s: float = 0.0  # not filled in by the benchmarks in this file
	    peak_vram_mb: float = 0.0  # peak CUDA memory allocated, in MB

	    # --- Extra run settings ---
	    batch_size: int = 1
	    cfg_scale: float = 7.0
	    notes: str = ""


	@dataclass
	class BenchmarkSuite:
	    """Collection of benchmark results plus a description of the host system."""

	    # Results in the order they were added.
	    results: list[BenchmarkResult] = field(default_factory=list)
	    # System details that are written next to the results in the JSON report.
	    system_info: dict = field(default_factory=dict)

	    def add(self, result: BenchmarkResult):
	        """Append one result to the suite."""
	        self.results.append(result)

	    def to_json(self, path: str):
	        """Write the system info and every result to ``path`` as indented JSON.

	        Each result is converted to a plain dict with ``dataclasses.asdict``.
	        A confirmation line is printed once the file has been written.
	        """
	        payload = {
	            "system_info": self.system_info,
	            "results": list(map(asdict, self.results)),
	        }
	        with open(path, "w") as handle:
	            json.dump(payload, handle, indent=2)
	        print(f"Results saved to {path}")


	def get_system_info() -> dict:
	    """Return interpreter, PyTorch and (when CUDA is available) GPU details.

	    The dict always holds ``python_version``, ``torch_version`` and
	    ``cuda_available``. When CUDA is available it also holds ``cuda_version``,
	    ``gpu_name`` and ``gpu_vram_total_mb`` for device 0.
	    """
	    has_cuda = torch.cuda.is_available()
	    details = {
	        "python_version": sys.version,
	        "torch_version": torch.__version__,
	        "cuda_available": has_cuda,
	    }
	    if not has_cuda:
	        return details

	    bytes_per_mb = 1024 ** 2
	    details.update(
	        {
	            "cuda_version": torch.version.cuda,
	            "gpu_name": torch.cuda.get_device_name(0),
	            "gpu_vram_total_mb": torch.cuda.get_device_properties(0).total_memory / bytes_per_mb,
	        }
	    )
	    return details


	def reset_cuda() -> None:
	    """Free unused memory and restart peak-VRAM tracking for the next run.

	    Python garbage is collected *before* the CUDA cache is emptied and the
	    peak statistics are reset. Otherwise tensors that are only kept alive by
	    not-yet-collected garbage would still be allocated at reset time: their
	    memory could not be released by ``empty_cache()`` and would be counted in
	    the new peak baseline.

	    On machines without CUDA only the garbage collection is performed.
	    """
	    gc.collect()
	    if torch.cuda.is_available():
	        torch.cuda.empty_cache()
	        torch.cuda.reset_peak_memory_stats()


	def get_peak_vram_mb() -> float:
	    """Return the peak CUDA memory allocated so far in MB, or 0.0 without CUDA."""
	    if not torch.cuda.is_available():
	        return 0.0
	    peak_bytes = torch.cuda.max_memory_allocated()
	    return peak_bytes / (1024 ** 2)


	def _run_benchmark(
	    suite: "BenchmarkSuite",
	    *,
	    label: str,
	    model_type: str,
	    model_name: str,
	    required_file: str,
	    model_path: str,
	    prompt: str,
	    resolution: tuple[int, int],
	    steps: int,
	    warmup_steps: int,
	    sampler: str,
	    scheduler: str,
	    warmup: bool,
	    cfg_scale: Optional[float] = None,
	) -> None:
	    """Run one warm-up + timed generation and record the result in ``suite``.

	    Shared implementation behind the public ``benchmark_*`` functions.

	    Args:
	        suite: Suite that receives the ``BenchmarkResult``.
	        label: Name used in the printed header and in the skip message.
	        model_type: Value stored in ``BenchmarkResult.model_type``.
	        model_name: Value stored in ``BenchmarkResult.model_name``.
	        required_file: File that must exist; otherwise the benchmark is
	            skipped with a message and nothing is recorded.
	        model_path: Value passed to the pipeline as ``model_path``.
	        prompt: Text prompt passed to the pipeline.
	        resolution: ``(width, height)`` passed to the pipeline as ``w``/``h``.
	        steps: Step count of the timed run.
	        warmup_steps: Step count of the (shorter) warm-up run.
	        sampler: Sampler name passed to the pipeline.
	        scheduler: Scheduler name passed to the pipeline.
	        warmup: Whether to execute the warm-up run before the timed run.
	        cfg_scale: When given, passed to the pipeline and stored in the
	            result. When ``None`` the pipeline is called without a
	            ``cfg_scale`` argument and the result keeps the dataclass default.
	            NOTE(review): confirm that the pipeline's own default matches the
	            default recorded in the result.
	    """
	    if not Path(required_file).exists():
	        print(f" Skipping {label} - model not found: {required_file}")
	        return

	    # Imported only once we know there is something to run, so a missing
	    # model is skipped without importing the generation pipeline.
	    from src.user.pipeline import pipeline

	    w, h = resolution

	    # Keyword arguments shared by the warm-up and the timed call; only the
	    # step count differs between the two.
	    common = dict(
	        prompt=prompt,
	        w=w,
	        h=h,
	        sampler=sampler,
	        scheduler=scheduler,
	        model_path=model_path,
	        hires_fix=False,
	        adetailer=False,
	        autohdr=False,
	        enable_multiscale=False,
	    )
	    # Only forwarded/recorded when the caller set it explicitly.
	    cfg_override = {} if cfg_scale is None else {"cfg_scale": cfg_scale}
	    common.update(cfg_override)

	    print(f"\n{'='*60}")
	    print(f"{label} Benchmark: {w}x{h}, {steps} steps, {sampler}/{scheduler}")
	    print(f"{'='*60}")

	    # Warmup run
	    warmup_time = 0.0
	    if warmup:
	        print(" Warmup run...")
	        reset_cuda()
	        t0 = time.perf_counter()
	        pipeline(steps=warmup_steps, **common)
	        warmup_time = time.perf_counter() - t0
	        print(f" Warmup done in {warmup_time:.2f}s")

	    # Timed run
	    print(" Benchmark run...")
	    reset_cuda()
	    t0 = time.perf_counter()
	    pipeline(steps=steps, **common)
	    gen_time = time.perf_counter() - t0
	    peak_vram = get_peak_vram_mb()

	    suite.add(
	        BenchmarkResult(
	            model_type=model_type,
	            model_name=model_name,
	            resolution=(w, h),
	            steps=steps,
	            sampler=sampler,
	            scheduler=scheduler,
	            warmup_time_s=warmup_time,
	            generation_time_s=gen_time,
	            peak_vram_mb=peak_vram,
	            **cfg_override,
	        )
	    )

	    # Same zero-time guard as the summary table printed by main().
	    steps_per_s = steps / gen_time if gen_time > 0 else 0
	    print(f" Generation time: {gen_time:.2f}s")
	    print(f" Steps/second: {steps_per_s:.2f}")
	    print(f" Peak VRAM: {peak_vram:.0f} MB")


	def benchmark_sd15(suite: "BenchmarkSuite", warmup: bool = True):
	    """Benchmark SD1.5 with DreamShaper 8 (512x512, 20 steps, euler/normal).

	    Skipped with a message when the checkpoint file is not found relative to
	    the current working directory.
	    """
	    checkpoint = "./include/checkpoints/DreamShaper_8_pruned.safetensors"
	    _run_benchmark(
	        suite,
	        label="SD1.5",
	        model_type="SD1.5",
	        model_name="DreamShaper_8",
	        required_file=checkpoint,
	        model_path=checkpoint,
	        prompt="a beautiful sunset over a mountain landscape, high quality photograph",
	        resolution=(512, 512),
	        steps=20,
	        warmup_steps=5,
	        sampler="euler",
	        scheduler="normal",
	        warmup=warmup,
	    )


	def benchmark_sdxl(suite: "BenchmarkSuite", warmup: bool = True):
	    """Benchmark SDXL with Juggernaut-XL v9 (1024x1024, 20 steps, euler/ays).

	    Skipped with a message when the checkpoint file is not found relative to
	    the current working directory.
	    """
	    checkpoint = "./include/checkpoints/Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors"
	    _run_benchmark(
	        suite,
	        label="SDXL",
	        model_type="SDXL",
	        model_name="Juggernaut-XL_v9",
	        required_file=checkpoint,
	        model_path=checkpoint,
	        prompt="a beautiful sunset over a mountain landscape, high quality photograph",
	        resolution=(1024, 1024),
	        steps=20,
	        warmup_steps=5,
	        sampler="euler",
	        scheduler="ays",  # AYS is commonly used for SDXL
	        warmup=warmup,
	    )


	def benchmark_flux2_klein(suite: "BenchmarkSuite", warmup: bool = True):
	    """Benchmark Flux2 Klein 4B (1024x1024, 4 steps, euler/simple, CFG 1.0).

	    The pipeline receives a special marker as ``model_path``; the existence
	    check is done on the diffusion-model file instead. Skipped with a message
	    when that file is not found relative to the current working directory.
	    """
	    _run_benchmark(
	        suite,
	        label="Flux2 Klein",
	        model_type="Flux2",
	        model_name="flux-2-klein-4b",
	        required_file="./include/diffusion_model/flux-2-klein-4b.safetensors",
	        model_path="__FLUX2_KLEIN__",  # Special marker for Flux2 Klein
	        prompt="a beautiful sunset over a mountain landscape",
	        resolution=(1024, 1024),
	        steps=4,  # Flux2 Klein is distilled, uses fewer steps
	        warmup_steps=2,
	        sampler="euler",
	        scheduler="simple",
	        warmup=warmup,
	        cfg_scale=1.0,  # Distilled models use CFG=1
	    )


	def main():
	    """Run all benchmarks, print a summary table and save the results as JSON."""
	    banner = "=" * 60
	    print(banner)
	    print("LightDiffusion-Next Performance Benchmark")
	    print(banner)

	    # Make sure both directories exist; the JSON report below goes into ./tests.
	    for directory in ("./output", "./tests"):
	        os.makedirs(directory, exist_ok=True)

	    suite = BenchmarkSuite(system_info=get_system_info())

	    print("\nSystem Info:")
	    for key, value in suite.system_info.items():
	        print(f" {key}: {value}")

	    # Each benchmark appends to the suite, or skips itself if its model is missing.
	    for run_benchmark in (benchmark_sd15, benchmark_sdxl, benchmark_flux2_klein):
	        run_benchmark(suite, warmup=True)

	    # Summary table: one left-aligned, fixed-width column per entry below.
	    print(f"\n{banner}")
	    print("BENCHMARK SUMMARY")
	    print(banner)
	    columns = (
	        ("Model", 20),
	        ("Resolution", 12),
	        ("Steps", 6),
	        ("Time (s)", 10),
	        ("Steps/s", 10),
	        ("VRAM (MB)", 10),
	    )
	    print(" ".join(f"{title:<{width}}" for title, width in columns))
	    print("-" * 80)
	    for entry in suite.results:
	        elapsed = entry.generation_time_s
	        # Avoid dividing by zero when no time was recorded.
	        rate = entry.steps / elapsed if elapsed > 0 else 0
	        size = f"{entry.resolution[0]}x{entry.resolution[1]}"
	        print(f"{entry.model_type:<20} {size:<12} {entry.steps:<6} {elapsed:<10.2f} {rate:<10.2f} {entry.peak_vram_mb:<10.0f}")

	    # Persist everything that was collected.
	    suite.to_json("./tests/benchmark_results.json")

	    print("\nBenchmark complete!")


	if __name__ == "__main__":
	    main()