"""Performance benchmark comparing LightDiffusion-Next across SD1.5, SDXL, and Flux2 Klein 4B. This script measures: - Sampling time (model inference + denoising loop) - Total generation time (includes VAE decode, model load) - VRAM usage peak Usage: cd LightDiffusion-Next python tests/benchmark_performance.py """ import gc import json import os import sys import time from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Optional import torch # Add project root to path project_root = Path(__file__).resolve().parent.parent sys.path.insert(0, str(project_root)) @dataclass class BenchmarkResult: model_type: str model_name: str resolution: tuple[int, int] steps: int sampler: str scheduler: str warmup_time_s: float = 0.0 generation_time_s: float = 0.0 sampling_time_s: float = 0.0 vae_decode_time_s: float = 0.0 peak_vram_mb: float = 0.0 batch_size: int = 1 cfg_scale: float = 7.0 notes: str = "" @dataclass class BenchmarkSuite: results: list[BenchmarkResult] = field(default_factory=list) system_info: dict = field(default_factory=dict) def add(self, result: BenchmarkResult): self.results.append(result) def to_json(self, path: str): data = { "system_info": self.system_info, "results": [asdict(r) for r in self.results] } with open(path, "w") as f: json.dump(data, f, indent=2) print(f"Results saved to {path}") def get_system_info() -> dict: """Collect system information for benchmark context.""" info = { "python_version": sys.version, "torch_version": torch.__version__, "cuda_available": torch.cuda.is_available(), } if torch.cuda.is_available(): info["cuda_version"] = torch.version.cuda info["gpu_name"] = torch.cuda.get_device_name(0) info["gpu_vram_total_mb"] = torch.cuda.get_device_properties(0).total_memory / (1024 ** 2) return info def reset_cuda(): """Clear CUDA cache for accurate VRAM measurement.""" if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() gc.collect() def get_peak_vram_mb() -> float: """Get peak VRAM usage in MB.""" if torch.cuda.is_available(): return torch.cuda.max_memory_allocated() / (1024 ** 2) return 0.0 def benchmark_sd15(suite: BenchmarkSuite, warmup: bool = True): """Benchmark SD1.5 with DreamShaper 8.""" from src.user.pipeline import pipeline model_path = "./include/checkpoints/DreamShaper_8_pruned.safetensors" if not Path(model_path).exists(): print(f" Skipping SD1.5 - model not found: {model_path}") return prompt = "a beautiful sunset over a mountain landscape, high quality photograph" w, h = 512, 512 steps = 20 sampler = "euler" scheduler = "normal" print(f"\n{'='*60}") print(f"SD1.5 Benchmark: {w}x{h}, {steps} steps, {sampler}/{scheduler}") print(f"{'='*60}") # Warmup run warmup_time = 0.0 if warmup: print(" Warmup run...") reset_cuda() t0 = time.perf_counter() pipeline( prompt=prompt, w=w, h=h, steps=5, sampler=sampler, scheduler=scheduler, model_path=model_path, hires_fix=False, adetailer=False, autohdr=False, enable_multiscale=False, ) warmup_time = time.perf_counter() - t0 print(f" Warmup done in {warmup_time:.2f}s") # Timed run print(" Benchmark run...") reset_cuda() t0 = time.perf_counter() pipeline( prompt=prompt, w=w, h=h, steps=steps, sampler=sampler, scheduler=scheduler, model_path=model_path, hires_fix=False, adetailer=False, autohdr=False, enable_multiscale=False, ) gen_time = time.perf_counter() - t0 peak_vram = get_peak_vram_mb() result = BenchmarkResult( model_type="SD1.5", model_name="DreamShaper_8", resolution=(w, h), steps=steps, sampler=sampler, scheduler=scheduler, warmup_time_s=warmup_time, generation_time_s=gen_time, peak_vram_mb=peak_vram, ) suite.add(result) print(f" Generation time: {gen_time:.2f}s") print(f" Steps/second: {steps / gen_time:.2f}") print(f" Peak VRAM: {peak_vram:.0f} MB") def benchmark_sdxl(suite: BenchmarkSuite, warmup: bool = True): """Benchmark SDXL with Juggernaut-XL v9.""" from src.user.pipeline import pipeline model_path = "./include/checkpoints/Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors" if not Path(model_path).exists(): print(f" Skipping SDXL - model not found: {model_path}") return prompt = "a beautiful sunset over a mountain landscape, high quality photograph" w, h = 1024, 1024 steps = 20 sampler = "euler" scheduler = "ays" # AYS is commonly used for SDXL print(f"\n{'='*60}") print(f"SDXL Benchmark: {w}x{h}, {steps} steps, {sampler}/{scheduler}") print(f"{'='*60}") # Warmup run warmup_time = 0.0 if warmup: print(" Warmup run...") reset_cuda() t0 = time.perf_counter() pipeline( prompt=prompt, w=w, h=h, steps=5, sampler=sampler, scheduler=scheduler, model_path=model_path, hires_fix=False, adetailer=False, autohdr=False, enable_multiscale=False, ) warmup_time = time.perf_counter() - t0 print(f" Warmup done in {warmup_time:.2f}s") # Timed run print(" Benchmark run...") reset_cuda() t0 = time.perf_counter() pipeline( prompt=prompt, w=w, h=h, steps=steps, sampler=sampler, scheduler=scheduler, model_path=model_path, hires_fix=False, adetailer=False, autohdr=False, enable_multiscale=False, ) gen_time = time.perf_counter() - t0 peak_vram = get_peak_vram_mb() result = BenchmarkResult( model_type="SDXL", model_name="Juggernaut-XL_v9", resolution=(w, h), steps=steps, sampler=sampler, scheduler=scheduler, warmup_time_s=warmup_time, generation_time_s=gen_time, peak_vram_mb=peak_vram, ) suite.add(result) print(f" Generation time: {gen_time:.2f}s") print(f" Steps/second: {steps / gen_time:.2f}") print(f" Peak VRAM: {peak_vram:.0f} MB") def benchmark_flux2_klein(suite: BenchmarkSuite, warmup: bool = True): """Benchmark Flux2 Klein 4B.""" from src.user.pipeline import pipeline model_path = "__FLUX2_KLEIN__" # Special marker for Flux2 Klein diffusion_path = "./include/diffusion_model/flux-2-klein-4b.safetensors" if not Path(diffusion_path).exists(): print(f" Skipping Flux2 Klein - model not found: {diffusion_path}") return prompt = "a beautiful sunset over a mountain landscape" w, h = 1024, 1024 steps = 4 # Flux2 Klein is distilled, uses fewer steps sampler = "euler" scheduler = "simple" cfg = 1.0 # Distilled models use CFG=1 print(f"\n{'='*60}") print(f"Flux2 Klein Benchmark: {w}x{h}, {steps} steps, {sampler}/{scheduler}") print(f"{'='*60}") # Warmup run warmup_time = 0.0 if warmup: print(" Warmup run...") reset_cuda() t0 = time.perf_counter() pipeline( prompt=prompt, w=w, h=h, steps=2, sampler=sampler, scheduler=scheduler, model_path=model_path, cfg_scale=cfg, hires_fix=False, adetailer=False, autohdr=False, enable_multiscale=False, ) warmup_time = time.perf_counter() - t0 print(f" Warmup done in {warmup_time:.2f}s") # Timed run print(" Benchmark run...") reset_cuda() t0 = time.perf_counter() pipeline( prompt=prompt, w=w, h=h, steps=steps, sampler=sampler, scheduler=scheduler, model_path=model_path, cfg_scale=cfg, hires_fix=False, adetailer=False, autohdr=False, enable_multiscale=False, ) gen_time = time.perf_counter() - t0 peak_vram = get_peak_vram_mb() result = BenchmarkResult( model_type="Flux2", model_name="flux-2-klein-4b", resolution=(w, h), steps=steps, sampler=sampler, scheduler=scheduler, warmup_time_s=warmup_time, generation_time_s=gen_time, peak_vram_mb=peak_vram, cfg_scale=cfg, ) suite.add(result) print(f" Generation time: {gen_time:.2f}s") print(f" Steps/second: {steps / gen_time:.2f}") print(f" Peak VRAM: {peak_vram:.0f} MB") def main(): print("="*60) print("LightDiffusion-Next Performance Benchmark") print("="*60) # Ensure output directory os.makedirs("./output", exist_ok=True) os.makedirs("./tests", exist_ok=True) suite = BenchmarkSuite(system_info=get_system_info()) print("\nSystem Info:") for k, v in suite.system_info.items(): print(f" {k}: {v}") # Run benchmarks benchmark_sd15(suite, warmup=True) benchmark_sdxl(suite, warmup=True) benchmark_flux2_klein(suite, warmup=True) # Summary print(f"\n{'='*60}") print("BENCHMARK SUMMARY") print(f"{'='*60}") print(f"{'Model':<20} {'Resolution':<12} {'Steps':<6} {'Time (s)':<10} {'Steps/s':<10} {'VRAM (MB)':<10}") print("-" * 80) for r in suite.results: steps_per_s = r.steps / r.generation_time_s if r.generation_time_s > 0 else 0 print(f"{r.model_type:<20} {f'{r.resolution[0]}x{r.resolution[1]}':<12} {r.steps:<6} {r.generation_time_s:<10.2f} {steps_per_s:<10.2f} {r.peak_vram_mb:<10.0f}") # Save results suite.to_json("./tests/benchmark_results.json") print("\nBenchmark complete!") if __name__ == "__main__": main()