#!/usr/bin/env python3 # Copyright (c) 2025 Delanoe Pirard / Aedelon - Apache 2.0 """ Full Benchmark Suite for Depth Anything 3 Tests ALL optimization combinations for each device (CPU, MPS, CUDA). Optimizations tested: - Preprocessing: CPU (PIL) vs GPU (NVJPEG on CUDA) - Attention: SDPA (Flash Attention) vs Manual Usage: python benchmarks/full_benchmark.py # Best device only python benchmarks/full_benchmark.py -d all # All devices python benchmarks/full_benchmark.py -d cuda # CUDA only python benchmarks/full_benchmark.py --quick # Quick mode """ import argparse import gc import logging import os import shutil import sys import time import warnings from dataclasses import dataclass from typing import Dict, List, Optional # Suppress ALL logging before any imports logging.disable(logging.CRITICAL) os.environ["DA3_LOG_LEVEL"] = "ERROR" warnings.filterwarnings("ignore") import numpy as np import torch from PIL import Image sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) # Suppress depth_anything_3 logger specifically logging.getLogger("depth_anything_3").disabled = True logging.getLogger("dinov2").disabled = True # ============================================================================ # STYLES # ============================================================================ class Style: CYAN = "\033[96m" GREEN = "\033[92m" YELLOW = "\033[93m" RED = "\033[91m" BOLD = "\033[1m" DIM = "\033[2m" RESET = "\033[0m" def colored(text, color, bold=False): prefix = Style.BOLD if bold else "" return f"{prefix}{color}{text}{Style.RESET}" # ============================================================================ # UTILITIES # ============================================================================ def cleanup(): gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() if torch.backends.mps.is_available(): torch.mps.empty_cache() def sync_device(device): if device.type == "cuda": torch.cuda.synchronize() elif device.type == "mps": torch.mps.synchronize() def get_available_devices() -> List[torch.device]: """Get all available devices for benchmarking.""" devices = [torch.device("cpu")] if torch.backends.mps.is_available(): devices.append(torch.device("mps")) if torch.cuda.is_available(): devices.append(torch.device("cuda")) return devices def get_device_name(device: torch.device) -> str: """Get human-readable device name.""" if device.type == "cuda": return torch.cuda.get_device_name(device) elif device.type == "mps": return "Apple Silicon (MPS)" else: import platform return f"CPU ({platform.processor() or 'Unknown'})" # ============================================================================ # DATA CLASSES # ============================================================================ @dataclass class BenchmarkResult: """Single benchmark result.""" mean_ms: float std_ms: float fps: float @classmethod def from_times(cls, times: List[float], batch_size: int = 1): mean_ms = np.mean(times) std_ms = np.std(times) fps = 1000 / mean_ms * batch_size return cls(mean_ms=mean_ms, std_ms=std_ms, fps=fps) @dataclass class OptimizationConfig: """Configuration for a specific optimization combination.""" name: str preprocessing: str # "cpu" or "gpu" attention: str # "sdpa" or "manual" description: str @property def short_name(self) -> str: prep = "GPU" if self.preprocessing == "gpu" else "CPU" attn = "SDPA" if self.attention == "sdpa" else "Manual" return f"{prep}+{attn}" # ============================================================================ # BENCHMARK FUNCTIONS # ============================================================================ def get_optimization_configs(device: torch.device) -> List[OptimizationConfig]: """Get all valid optimization configurations for a device.""" configs = [] if device.type == "cuda": # CUDA: All 4 combinations configs = [ OptimizationConfig("gpu_sdpa", "gpu", "sdpa", "GPU Decode (NVJPEG) + SDPA (Flash)"), OptimizationConfig("gpu_manual", "gpu", "manual", "GPU Decode (NVJPEG) + Manual Attn"), OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA (Flash)"), OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"), ] elif device.type == "mps": # MPS: CPU preprocessing is better, 2 combinations configs = [ OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA"), OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"), ] else: # CPU: 2 combinations configs = [ OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "SDPA Attention"), OptimizationConfig("cpu_manual", "cpu", "manual", "Manual Attention"), ] return configs def benchmark_preprocessing_detailed(device: torch.device, runs: int = 5) -> Dict: """Benchmark preprocessing in detail.""" from depth_anything_3.utils.io.input_processor import InputProcessor from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor results = {} temp_dir = "temp_bench_preproc" sizes = [ ("720p", 1280, 720), ("1080p", 1920, 1080), ("4K", 3840, 2160), ] os.makedirs(temp_dir, exist_ok=True) try: cpu_proc = InputProcessor() gpu_proc = None if device.type == "cuda": gpu_proc = GPUInputProcessor(device=device) for name, w, h in sizes: results[name] = {} # Create test files files = [] pil_imgs = [] for i in range(4): img = Image.new("RGB", (w, h), color=(100 + i*10, 150, 200)) fpath = f"{temp_dir}/{name}_{i}.jpg" img.save(fpath, quality=95) files.append(fpath) pil_imgs.append(img.copy()) # CPU benchmark cleanup() for _ in range(2): cpu_proc(image=pil_imgs, process_res=518, num_workers=8) times = [] for _ in range(runs): start = time.perf_counter() cpu_proc(image=pil_imgs, process_res=518, num_workers=8) times.append((time.perf_counter() - start) * 1000) results[name]["cpu"] = BenchmarkResult.from_times(times, batch_size=4) # GPU benchmark (NVJPEG for CUDA) if gpu_proc and gpu_proc.use_gpu: cleanup() for _ in range(2): gpu_proc(image=files, process_res=518, num_workers=1) sync_device(device) times = [] for _ in range(runs): sync_device(device) start = time.perf_counter() gpu_proc(image=files, process_res=518, num_workers=1) sync_device(device) times.append((time.perf_counter() - start) * 1000) results[name]["gpu"] = BenchmarkResult.from_times(times, batch_size=4) finally: shutil.rmtree(temp_dir, ignore_errors=True) return results def benchmark_attention_detailed(device: torch.device, runs: int = 10) -> Dict: """Benchmark attention backends in detail.""" from depth_anything_3.model.dinov2.layers import Attention results = {} dtype = torch.float16 if device.type == "cuda" else torch.float32 configs = [ ("ViT-S (518px)", 384, 6, 529), ("ViT-L (518px)", 1024, 16, 529), ("ViT-L (770px)", 1024, 16, 1156), ] for name, dim, heads, seq_len in configs: results[name] = {} x = torch.randn(1, seq_len, dim, device=device, dtype=dtype) for backend in ["sdpa", "manual"]: cleanup() attn = Attention(dim=dim, num_heads=heads, attn_backend=backend).to(device, dtype) attn.eval() # Warmup with torch.no_grad(): for _ in range(3): attn(x) sync_device(device) # Benchmark times = [] with torch.no_grad(): for _ in range(runs): sync_device(device) start = time.perf_counter() attn(x) sync_device(device) times.append((time.perf_counter() - start) * 1000) results[name][backend] = BenchmarkResult.from_times(times) del attn return results def benchmark_inference_matrix( device: torch.device, models: List[str], runs: int = 3, ) -> Dict: """Benchmark all optimization combinations for inference.""" from depth_anything_3.api import DepthAnything3 results = {} temp_dir = "temp_bench_infer" configs = get_optimization_configs(device) os.makedirs(temp_dir, exist_ok=True) # Create test images (720p) img_paths = [] pil_imgs = [] for i in range(4): img = Image.new("RGB", (1280, 720), color=(100 + i*20, 150, 200)) path = f"{temp_dir}/test_{i}.jpg" img.save(path, quality=95) img_paths.append(path) pil_imgs.append(img.copy()) try: for model_name in models: results[model_name] = {} for config in configs: cleanup() # Set attention backend os.environ["DA3_ATTENTION_BACKEND"] = config.attention # Load model fresh (to apply attention backend) model = DepthAnything3( model_name=model_name, device=device, use_cache=False, ) # Choose input based on preprocessing if config.preprocessing == "gpu" and device.type == "cuda": test_input = img_paths[:1] # File paths for NVJPEG else: test_input = pil_imgs[:1] # PIL for CPU preprocessing # Warmup for _ in range(3): model.inference(test_input, process_res=518) sync_device(device) # Benchmark times = [] for _ in range(runs): sync_device(device) start = time.perf_counter() model.inference(test_input, process_res=518) sync_device(device) times.append((time.perf_counter() - start) * 1000) results[model_name][config.name] = { "result": BenchmarkResult.from_times(times, batch_size=1), "config": config, } del model cleanup() finally: shutil.rmtree(temp_dir, ignore_errors=True) return results # ============================================================================ # DISPLAY FUNCTIONS # ============================================================================ def print_header(title: str): """Print section header.""" print() print(colored("═" * 70, Style.CYAN)) print(colored("║", Style.CYAN) + colored(f" {title}", Style.BOLD).center(77) + colored("║", Style.CYAN)) print(colored("═" * 70, Style.CYAN)) def print_subheader(title: str): """Print subsection header.""" print() print(colored(f"▶ {title}", Style.YELLOW, bold=True)) print(colored("─" * 70, Style.DIM)) def format_speedup(speedup: float) -> str: """Format speedup with color.""" if speedup >= 1.5: return colored(f"{speedup:.2f}x", Style.GREEN, bold=True) elif speedup >= 1.1: return colored(f"{speedup:.2f}x", Style.GREEN) elif speedup >= 0.95: return f"{speedup:.2f}x" else: return colored(f"{speedup:.2f}x", Style.RED) def print_preprocessing_results(results: Dict, device: torch.device): """Print preprocessing benchmark results.""" print_subheader("PREPROCESSING (4 images batch)") has_gpu = any("gpu" in r for r in results.values()) if has_gpu: print(f" {'Resolution':<12} {'CPU (PIL)':<14} {'GPU (NVJPEG)':<14} {'Speedup':<10}") print(f" {'-'*50}") for name, data in results.items(): cpu_ms = data["cpu"].mean_ms if "gpu" in data: gpu_ms = data["gpu"].mean_ms speedup = cpu_ms / gpu_ms print(f" {name:<12} {cpu_ms:>8.1f} ms {gpu_ms:>8.1f} ms {format_speedup(speedup)}") else: print(f" {name:<12} {cpu_ms:>8.1f} ms {'N/A':<14}") else: print(f" {'Resolution':<12} {'CPU (PIL)':<14}") print(f" {'-'*30}") for name, data in results.items(): cpu_ms = data["cpu"].mean_ms print(f" {name:<12} {cpu_ms:>8.1f} ms") # Summary if has_gpu: speedups = [] for data in results.values(): if "gpu" in data: speedups.append(data["cpu"].mean_ms / data["gpu"].mean_ms) if speedups: avg = np.mean(speedups) print() print(f" {colored('→', Style.GREEN)} GPU preprocessing avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster") def print_attention_results(results: Dict, device: torch.device): """Print attention benchmark results.""" print_subheader("ATTENTION (per layer forward pass)") print(f" {'Config':<18} {'SDPA':<12} {'Manual':<12} {'Speedup':<10}") print(f" {'-'*52}") for name, data in results.items(): sdpa_ms = data["sdpa"].mean_ms manual_ms = data["manual"].mean_ms speedup = manual_ms / sdpa_ms print(f" {name:<18} {sdpa_ms:>6.3f} ms {manual_ms:>6.3f} ms {format_speedup(speedup)}") # Summary speedups = [d["manual"].mean_ms / d["sdpa"].mean_ms for d in results.values()] avg = np.mean(speedups) print() print(f" {colored('→', Style.GREEN)} SDPA avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster than manual") # Check Flash SDP if device.type == "cuda": from torch.backends.cuda import flash_sdp_enabled if flash_sdp_enabled(): print(f" {colored('→', Style.GREEN)} Flash Attention: {colored('ENABLED', Style.GREEN, bold=True)} (PyTorch native)") def print_inference_matrix(results: Dict, device: torch.device): """Print inference benchmark matrix.""" print_subheader("END-TO-END INFERENCE (720p input, batch=1)") configs = get_optimization_configs(device) # Header header = f" {'Model':<12}" for cfg in configs: header += f" {cfg.short_name:<14}" header += " Best" print(header) print(f" {'-'*(14 + 15*len(configs) + 6)}") # Results per model for model_name, model_results in results.items(): row = f" {model_name:<12}" best_fps = 0 best_config = None worst_fps = float('inf') for cfg in configs: if cfg.name in model_results: result = model_results[cfg.name]["result"] fps = result.fps row += f" {fps:>6.1f} img/s " if fps > best_fps: best_fps = fps best_config = cfg if fps < worst_fps: worst_fps = fps else: row += f" {'N/A':<14}" # Best indicator if best_config: row += f" {colored(best_config.short_name, Style.GREEN, bold=True)}" print(row) # Summary print() print(f" {Style.DIM}Legend: GPU=NVJPEG decode, CPU=PIL decode, SDPA=Flash Attention{Style.RESET}") def print_device_summary( device: torch.device, preproc_results: Dict, attn_results: Dict, infer_results: Dict, ): """Print summary for a device.""" print() print(colored("─" * 70, Style.CYAN)) print(colored(f" {device.type.upper()} - OPTIMIZATION SUMMARY", Style.BOLD)) print(colored("─" * 70, Style.CYAN)) # Best configuration if infer_results: print() print(f" {colored('Best configuration per model:', Style.CYAN)}") for model_name, model_results in infer_results.items(): if not model_results: continue best_name = max(model_results.keys(), key=lambda k: model_results[k]["result"].fps) best = model_results[best_name] worst_name = min(model_results.keys(), key=lambda k: model_results[k]["result"].fps) worst = model_results[worst_name] speedup = best["result"].fps / worst["result"].fps if worst["result"].fps > 0 else 1 print(f" {model_name:<12} {colored(best['config'].description, Style.GREEN)}") print(f" {'':<12} {best['result'].fps:.1f} img/s ({speedup:.1f}x vs worst)") # Recommendations print() print(f" {colored('Recommendations:', Style.CYAN)}") if device.type == "cuda": print(f" ✓ Use {colored('GPU preprocessing (NVJPEG)', Style.GREEN)} for file inputs") print(f" ✓ {colored('SDPA (Flash Attention)', Style.GREEN)} is enabled by default") print(f" ✓ Pass file paths (not PIL images) to leverage NVJPEG") elif device.type == "mps": print(f" ✓ Use {colored('CPU preprocessing', Style.GREEN)} (faster than GPU on MPS)") print(f" ✓ {colored('SDPA', Style.GREEN)} provides moderate speedup") else: print(f" ✓ {colored('SDPA', Style.GREEN)} provides speedup over manual attention") print(f" ○ Consider using GPU (CUDA/MPS) for better performance") # ============================================================================ # MAIN # ============================================================================ def main(): parser = argparse.ArgumentParser( description="DA3 Full Benchmark - Test all optimization combinations", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python benchmarks/full_benchmark.py # Best device only python benchmarks/full_benchmark.py -d all # All devices python benchmarks/full_benchmark.py -d cuda # CUDA only python benchmarks/full_benchmark.py --quick # Quick mode (fewer runs) python benchmarks/full_benchmark.py --models da3-small da3-large """ ) parser.add_argument("--quick", action="store_true", help="Quick mode (fewer runs)") parser.add_argument("--skip-preprocessing", action="store_true", help="Skip preprocessing benchmark") parser.add_argument("--skip-attention", action="store_true", help="Skip attention benchmark") parser.add_argument("--skip-inference", action="store_true", help="Skip inference benchmark") parser.add_argument("-d", "--device", type=str, default="auto", choices=["auto", "cpu", "mps", "cuda", "all"], help="Device to benchmark (default: auto)") parser.add_argument("--models", nargs="+", default=None, help="Models to benchmark (default: all)") args = parser.parse_args() # Configure runs runs_preproc = 3 if args.quick else 5 runs_attn = 5 if args.quick else 10 runs_infer = 2 if args.quick else 4 # Determine models if args.models: models = args.models elif args.quick: models = ["da3-small", "da3-large"] else: models = ["da3-small", "da3-base", "da3-large"] # Determine devices available_devices = get_available_devices() if args.device == "auto": devices_to_test = [available_devices[-1]] # Best available elif args.device == "all": devices_to_test = available_devices else: requested = torch.device(args.device) if requested in available_devices: devices_to_test = [requested] else: print(f"Error: Device '{args.device}' not available.") print(f"Available: {[d.type for d in available_devices]}") return # Main header print() print(colored("╔" + "═" * 68 + "╗", Style.CYAN)) print(colored("║", Style.CYAN) + colored(" DEPTH ANYTHING 3 - FULL BENCHMARK", Style.BOLD).center(77) + colored("║", Style.CYAN)) print(colored("║", Style.CYAN) + colored(" All Optimization Combinations", Style.DIM).center(77) + colored("║", Style.CYAN)) print(colored("╚" + "═" * 68 + "╝", Style.CYAN)) print(f"\n {Style.DIM}PyTorch{Style.RESET} : {colored(torch.__version__, Style.CYAN)}") print(f" {Style.DIM}Models{Style.RESET} : {colored(', '.join(models), Style.CYAN)}") print(f" {Style.DIM}Mode{Style.RESET} : {colored('Quick' if args.quick else 'Full', Style.CYAN)}") print(f"\n {Style.DIM}Available devices:{Style.RESET}") for d in available_devices: status = colored("●", Style.GREEN) if d in devices_to_test else colored("○", Style.DIM) print(f" {status} {d.type.upper():<6} {get_device_name(d)}") all_results = {} # Run benchmarks for each device for device in devices_to_test: device_name = get_device_name(device) all_results[device.type] = {} print_header(f"{device.type.upper()} - {device_name}") # 1. Preprocessing preproc_results = {} if not args.skip_preprocessing and device.type != "cpu": preproc_results = benchmark_preprocessing_detailed(device, runs=runs_preproc) all_results[device.type]["preprocessing"] = preproc_results print_preprocessing_results(preproc_results, device) elif device.type == "cpu": print_subheader("PREPROCESSING") print(f" {Style.DIM}Skipped (CPU only - no GPU comparison){Style.RESET}") # 2. Attention attn_results = {} if not args.skip_attention: attn_results = benchmark_attention_detailed(device, runs=runs_attn) all_results[device.type]["attention"] = attn_results print_attention_results(attn_results, device) # 3. Inference Matrix infer_results = {} if not args.skip_inference: infer_results = benchmark_inference_matrix(device, models, runs=runs_infer) all_results[device.type]["inference"] = infer_results print_inference_matrix(infer_results, device) # Device Summary print_device_summary(device, preproc_results, attn_results, infer_results) cleanup() # Cross-device comparison if len(devices_to_test) > 1 and not args.skip_inference: print_header("CROSS-DEVICE COMPARISON") # Find common model common_model = models[-1] # Usually largest tested print() print(f" {colored(f'{common_model} (best config per device):', Style.CYAN)}") print(f" {'Device':<10} {'Config':<30} {'Performance':<15}") print(f" {'-'*55}") base_fps = None for device in devices_to_test: if device.type in all_results and "inference" in all_results[device.type]: infer = all_results[device.type]["inference"].get(common_model, {}) if infer: best_name = max(infer.keys(), key=lambda k: infer[k]["result"].fps) best = infer[best_name] fps = best["result"].fps if base_fps is None: base_fps = fps speedup = fps / base_fps if base_fps else 1 speedup_str = f"({speedup:.1f}x)" if device != devices_to_test[0] else "(baseline)" print(f" {device.type.upper():<10} {best['config'].description:<30} {fps:>5.1f} img/s {speedup_str}") # Final summary print() print(colored("═" * 70, Style.CYAN)) print(colored("║", Style.CYAN) + colored(" BENCHMARK COMPLETE", Style.BOLD).center(77) + colored("║", Style.CYAN)) print(colored("═" * 70, Style.CYAN)) print() if __name__ == "__main__": main()