#!/usr/bin/env python3
"""Performance benchmark comparing LightDiffusion-Next across SD1.5, SDXL, and Flux2 Klein 4B.

This script measures:
- Total generation time (includes model load, denoising loop, VAE decode)
- Peak VRAM usage

Usage:
    cd LightDiffusion-Next
    python tests/benchmark_performance.py
"""
| import gc | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from dataclasses import asdict, dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
| import torch | |
| # Add project root to path | |
| project_root = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(project_root)) | |
@dataclass
class BenchmarkResult:
    """One benchmark measurement for a single model/configuration run.

    NOTE(review): sampling_time_s, vae_decode_time_s, and notes are never
    populated by the current benchmark functions — only generation_time_s
    and peak_vram_mb are measured.
    """

    model_type: str  # e.g. "SD1.5", "SDXL", "Flux2"
    model_name: str  # checkpoint/model identifier
    resolution: tuple[int, int]  # (width, height) in pixels
    steps: int  # denoising steps in the timed run
    sampler: str
    scheduler: str
    warmup_time_s: float = 0.0  # duration of the short warmup run
    generation_time_s: float = 0.0  # wall-clock time of the timed run
    sampling_time_s: float = 0.0
    vae_decode_time_s: float = 0.0
    peak_vram_mb: float = 0.0  # torch.cuda.max_memory_allocated in MB
    batch_size: int = 1
    cfg_scale: float = 7.0  # distilled models override this (e.g. 1.0)
    notes: str = ""
@dataclass
class BenchmarkSuite:
    """Accumulates BenchmarkResult entries plus system context and saves them."""

    results: list[BenchmarkResult] = field(default_factory=list)
    system_info: dict = field(default_factory=dict)

    def add(self, result: BenchmarkResult):
        """Append one finished benchmark result."""
        self.results.append(result)

    def to_json(self, path: str):
        """Write system info and all results as pretty-printed JSON to *path*."""
        data = {
            "system_info": self.system_info,
            "results": [asdict(r) for r in self.results],
        }
        with open(path, "w") as f:
            json.dump(data, f, indent=2)
        print(f"Results saved to {path}")
def get_system_info() -> dict:
    """Gather interpreter, torch, and GPU details to record with results."""
    info = {
        "python_version": sys.version,
        "torch_version": torch.__version__,
        "cuda_available": torch.cuda.is_available(),
    }
    if not torch.cuda.is_available():
        return info
    # CUDA-only fields: driver version, device name, and total VRAM in MB.
    props = torch.cuda.get_device_properties(0)
    info["cuda_version"] = torch.version.cuda
    info["gpu_name"] = torch.cuda.get_device_name(0)
    info["gpu_vram_total_mb"] = props.total_memory / (1024 ** 2)
    return info
def reset_cuda():
    """Free garbage, clear the CUDA cache, and reset peak-VRAM counters.

    gc.collect() now runs FIRST: empty_cache() can only return cached blocks
    whose tensors are already freed, so collecting garbage after the cache
    flush (as before) could leave dead tensors holding cached memory and
    inflate the next peak-VRAM measurement.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
def get_peak_vram_mb() -> float:
    """Return peak allocated CUDA memory in MB (0.0 when CUDA is absent)."""
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.max_memory_allocated() / (1024 ** 2)
def benchmark_sd15(suite: BenchmarkSuite, warmup: bool = True):
    """Benchmark SD1.5 with DreamShaper 8."""
    from src.user.pipeline import pipeline

    model_path = "./include/checkpoints/DreamShaper_8_pruned.safetensors"
    if not Path(model_path).exists():
        print(f" Skipping SD1.5 - model not found: {model_path}")
        return

    prompt = "a beautiful sunset over a mountain landscape, high quality photograph"
    w, h = 512, 512
    steps = 20
    sampler = "euler"
    scheduler = "normal"

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"SD1.5 Benchmark: {w}x{h}, {steps} steps, {sampler}/{scheduler}")
    print(banner)

    # Shared keyword arguments for both the warmup and the timed run;
    # only the step count differs between the two calls.
    run_kwargs = dict(
        prompt=prompt,
        w=w, h=h,
        sampler=sampler,
        scheduler=scheduler,
        model_path=model_path,
        hires_fix=False,
        adetailer=False,
        autohdr=False,
        enable_multiscale=False,
    )

    warmup_time = 0.0
    if warmup:
        # Short warmup absorbs one-time costs (model load, kernel compile).
        print(" Warmup run...")
        reset_cuda()
        start = time.perf_counter()
        pipeline(steps=5, **run_kwargs)
        warmup_time = time.perf_counter() - start
        print(f" Warmup done in {warmup_time:.2f}s")

    print(" Benchmark run...")
    reset_cuda()
    start = time.perf_counter()
    pipeline(steps=steps, **run_kwargs)
    elapsed = time.perf_counter() - start
    peak_vram = get_peak_vram_mb()

    suite.add(BenchmarkResult(
        model_type="SD1.5",
        model_name="DreamShaper_8",
        resolution=(w, h),
        steps=steps,
        sampler=sampler,
        scheduler=scheduler,
        warmup_time_s=warmup_time,
        generation_time_s=elapsed,
        peak_vram_mb=peak_vram,
    ))
    print(f" Generation time: {elapsed:.2f}s")
    print(f" Steps/second: {steps / elapsed:.2f}")
    print(f" Peak VRAM: {peak_vram:.0f} MB")
def benchmark_sdxl(suite: BenchmarkSuite, warmup: bool = True):
    """Benchmark SDXL with Juggernaut-XL v9."""
    from src.user.pipeline import pipeline

    model_path = "./include/checkpoints/Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors"
    if not Path(model_path).exists():
        print(f" Skipping SDXL - model not found: {model_path}")
        return

    prompt = "a beautiful sunset over a mountain landscape, high quality photograph"
    w, h = 1024, 1024
    steps = 20
    sampler = "euler"
    scheduler = "ays"  # AYS is commonly used for SDXL

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"SDXL Benchmark: {w}x{h}, {steps} steps, {sampler}/{scheduler}")
    print(banner)

    # Shared keyword arguments for both the warmup and the timed run;
    # only the step count differs between the two calls.
    run_kwargs = dict(
        prompt=prompt,
        w=w, h=h,
        sampler=sampler,
        scheduler=scheduler,
        model_path=model_path,
        hires_fix=False,
        adetailer=False,
        autohdr=False,
        enable_multiscale=False,
    )

    warmup_time = 0.0
    if warmup:
        # Short warmup absorbs one-time costs (model load, kernel compile).
        print(" Warmup run...")
        reset_cuda()
        start = time.perf_counter()
        pipeline(steps=5, **run_kwargs)
        warmup_time = time.perf_counter() - start
        print(f" Warmup done in {warmup_time:.2f}s")

    print(" Benchmark run...")
    reset_cuda()
    start = time.perf_counter()
    pipeline(steps=steps, **run_kwargs)
    elapsed = time.perf_counter() - start
    peak_vram = get_peak_vram_mb()

    suite.add(BenchmarkResult(
        model_type="SDXL",
        model_name="Juggernaut-XL_v9",
        resolution=(w, h),
        steps=steps,
        sampler=sampler,
        scheduler=scheduler,
        warmup_time_s=warmup_time,
        generation_time_s=elapsed,
        peak_vram_mb=peak_vram,
    ))
    print(f" Generation time: {elapsed:.2f}s")
    print(f" Steps/second: {steps / elapsed:.2f}")
    print(f" Peak VRAM: {peak_vram:.0f} MB")
def benchmark_flux2_klein(suite: BenchmarkSuite, warmup: bool = True):
    """Benchmark Flux2 Klein 4B."""
    from src.user.pipeline import pipeline

    model_path = "__FLUX2_KLEIN__"  # Special marker for Flux2 Klein
    diffusion_path = "./include/diffusion_model/flux-2-klein-4b.safetensors"
    if not Path(diffusion_path).exists():
        print(f" Skipping Flux2 Klein - model not found: {diffusion_path}")
        return

    prompt = "a beautiful sunset over a mountain landscape"
    w, h = 1024, 1024
    steps = 4  # Flux2 Klein is distilled, uses fewer steps
    sampler = "euler"
    scheduler = "simple"
    cfg = 1.0  # Distilled models use CFG=1

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Flux2 Klein Benchmark: {w}x{h}, {steps} steps, {sampler}/{scheduler}")
    print(banner)

    # Shared keyword arguments for both the warmup and the timed run;
    # only the step count differs between the two calls.
    run_kwargs = dict(
        prompt=prompt,
        w=w, h=h,
        sampler=sampler,
        scheduler=scheduler,
        model_path=model_path,
        cfg_scale=cfg,
        hires_fix=False,
        adetailer=False,
        autohdr=False,
        enable_multiscale=False,
    )

    warmup_time = 0.0
    if warmup:
        # Short warmup absorbs one-time costs (model load, kernel compile).
        print(" Warmup run...")
        reset_cuda()
        start = time.perf_counter()
        pipeline(steps=2, **run_kwargs)
        warmup_time = time.perf_counter() - start
        print(f" Warmup done in {warmup_time:.2f}s")

    print(" Benchmark run...")
    reset_cuda()
    start = time.perf_counter()
    pipeline(steps=steps, **run_kwargs)
    elapsed = time.perf_counter() - start
    peak_vram = get_peak_vram_mb()

    suite.add(BenchmarkResult(
        model_type="Flux2",
        model_name="flux-2-klein-4b",
        resolution=(w, h),
        steps=steps,
        sampler=sampler,
        scheduler=scheduler,
        warmup_time_s=warmup_time,
        generation_time_s=elapsed,
        peak_vram_mb=peak_vram,
        cfg_scale=cfg,
    ))
    print(f" Generation time: {elapsed:.2f}s")
    print(f" Steps/second: {steps / elapsed:.2f}")
    print(f" Peak VRAM: {peak_vram:.0f} MB")
def main():
    """Run all three benchmarks, print a summary table, and save JSON results."""
    banner = "=" * 60
    print(banner)
    print("LightDiffusion-Next Performance Benchmark")
    print(banner)

    # Ensure output locations exist before any pipeline writes to them.
    for directory in ("./output", "./tests"):
        os.makedirs(directory, exist_ok=True)

    suite = BenchmarkSuite(system_info=get_system_info())
    print("\nSystem Info:")
    for k, v in suite.system_info.items():
        print(f" {k}: {v}")

    # Each benchmark skips itself when its checkpoint is absent on disk.
    for bench in (benchmark_sd15, benchmark_sdxl, benchmark_flux2_klein):
        bench(suite, warmup=True)

    print(f"\n{banner}")
    print("BENCHMARK SUMMARY")
    print(banner)
    print(f"{'Model':<20} {'Resolution':<12} {'Steps':<6} {'Time (s)':<10} {'Steps/s':<10} {'VRAM (MB)':<10}")
    print("-" * 80)
    for r in suite.results:
        steps_per_s = r.steps / r.generation_time_s if r.generation_time_s > 0 else 0
        print(f"{r.model_type:<20} {f'{r.resolution[0]}x{r.resolution[1]}':<12} {r.steps:<6} {r.generation_time_s:<10.2f} {steps_per_s:<10.2f} {r.peak_vram_mb:<10.0f}")

    suite.to_json("./tests/benchmark_results.json")
    print("\nBenchmark complete!")


if __name__ == "__main__":
    main()