""" |
|
|
Comparative Benchmark: awesome-depth-anything-3 vs upstream (vanilla) |
|
|
|
|
|
Compares performance between the optimized fork and the original upstream. |
|
|
|
|
|
Usage: |
|
|
python benchmarks/comparative_benchmark.py --device mps |
|
|
python benchmarks/comparative_benchmark.py --device cuda |
|
|
python benchmarks/comparative_benchmark.py --device all |
|
|
python benchmarks/comparative_benchmark.py --quick |
|
|
""" |
|
|
|
|
|
import argparse
import contextlib
import gc
import io
import logging
import os
import shutil
import sys
import time
import warnings

# Silence all logging and warnings before the heavy imports below so that
# framework chatter does not pollute the benchmark output.
logging.disable(logging.CRITICAL)
os.environ["DA3_LOG_LEVEL"] = "CRITICAL"
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore")

import numpy as np
import torch
from PIL import Image

logging.getLogger("depth_anything_3").disabled = True
logging.getLogger("dinov2").disabled = True
logging.getLogger().setLevel(logging.CRITICAL)
|
|
@contextlib.contextmanager
def suppress_output():
    """Context manager to suppress stdout and stderr."""
    with contextlib.redirect_stdout(io.StringIO()), \
            contextlib.redirect_stderr(io.StringIO()):
        logging.disable(logging.CRITICAL)
        yield
|
|
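# NOTE: the paths below are machine-specific; point them at your own local
# checkouts of the two repositories before running this benchmark.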
AWESOME_REPO = "/Users/aedelon/Workspace/awesome-depth-anything-3"
UPSTREAM_REPO = "/Users/aedelon/Workspace/depth-anything-3-upstream"
MODEL_NAME = "da3-large"
|
|
def cleanup():
    """Force garbage collection and release cached accelerator memory."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
|
|
def sync_device(device):
    """Block until all pending kernels on the device have finished."""
    if device.type == "cuda":
        torch.cuda.synchronize()
    elif device.type == "mps":
        torch.mps.synchronize()
|
|
def clear_modules():
    """Purge depth_anything_3 from sys.modules.

    Both repositories expose the same package name, so cached modules must be
    evicted before switching sys.path from one checkout to the other.
    """
    to_remove = [k for k in sys.modules if "depth_anything_3" in k]
    for k in to_remove:
        del sys.modules[k]
|
|
def suppress_logging():
    """Suppress all logging after module import."""
    logging.disable(logging.CRITICAL)
    try:
        from depth_anything_3.utils.logger import logger
        logger.level = 100  # above logging.CRITICAL (50), silences everything
    except Exception:
        pass
|
|
def get_available_devices():
    """Get available devices, CPU first."""
    devices = [torch.device("cpu")]
    if torch.backends.mps.is_available():
        devices.append(torch.device("mps"))
    if torch.cuda.is_available():
        devices.append(torch.device("cuda"))
    return devices
|
|
def get_device_name(device):
    """Return a human-readable name for the device."""
    if device.type == "cuda":
        return torch.cuda.get_device_name(device)
    elif device.type == "mps":
        return "Apple Silicon (MPS)"
    return "CPU"
|
|
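# Both benchmark functions below follow the same protocol: put the target
# checkout at the front of sys.path, import its DepthAnything3, run two
# warmup passes, then time `runs` device-synchronized inferences over the
# full batch.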
def benchmark_upstream(device, pil_images, process_res=504, runs=3):
    """Benchmark upstream/vanilla depth-anything-3."""
    # Make the upstream checkout the active depth_anything_3 package.
    clear_modules()
    upstream_src = os.path.join(UPSTREAM_REPO, "src")
    if upstream_src in sys.path:
        sys.path.remove(upstream_src)
    sys.path.insert(0, upstream_src)

    with suppress_output():
        from depth_anything_3.api import DepthAnything3
        suppress_logging()

    cleanup()

    # Cold load: construct the model and move it to the target device.
    start = time.perf_counter()
    model = DepthAnything3(model_name=MODEL_NAME)
    model = model.to(device)
    model.eval()
    cold_load_time = time.perf_counter() - start

    # Warmup passes so kernel compilation and caches do not skew the timings.
    for _ in range(2):
        model.inference(pil_images[:1], process_res=process_res)
    sync_device(device)
    cleanup()

    # Timed runs over the full batch, synchronized before and after each run.
    times = []
    for _ in range(runs):
        cleanup()
        sync_device(device)
        start = time.perf_counter()
        model.inference(pil_images, process_res=process_res)
        sync_device(device)
        times.append(time.perf_counter() - start)

    avg_time = np.mean(times)
    std_time = np.std(times)
    throughput = len(pil_images) / avg_time

    del model
    cleanup()

    # Restore import state so the next benchmark starts clean.
    sys.path.remove(upstream_src)
    clear_modules()

    return {
        "cold_load": cold_load_time,
        "inference_time": avg_time,
        "inference_std": std_time,
        "throughput": throughput,
    }
|
|
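# Unlike upstream, the optimized fork takes device and cache settings in the
# constructor and can reuse weights from an in-process model cache; both the
# cold and the cache-hit load paths are measured below.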
def benchmark_awesome(device, pil_images, process_res=504, runs=3, use_cache=True):
    """Benchmark awesome (optimized) depth-anything-3."""
    # Make the optimized checkout the active depth_anything_3 package.
    clear_modules()
    awesome_src = os.path.join(AWESOME_REPO, "src")
    if awesome_src in sys.path:
        sys.path.remove(awesome_src)
    sys.path.insert(0, awesome_src)

    with suppress_output():
        from depth_anything_3.api import DepthAnything3
        from depth_anything_3.cache import get_model_cache
        suppress_logging()

    # Start from an empty cache when measuring the uncached load path.
    if not use_cache:
        cache = get_model_cache()
        cache.clear()

    cleanup()

    # Cold load: the optimized API places the model on the device directly.
    start = time.perf_counter()
    model = DepthAnything3(model_name=MODEL_NAME, device=device, use_cache=use_cache)
    load_time = time.perf_counter() - start

    # Warm load: reconstruct the model to measure the cache-hit path.
    cached_load_time = None
    if use_cache:
        del model
        cleanup()
        start = time.perf_counter()
        model = DepthAnything3(model_name=MODEL_NAME, device=device, use_cache=True)
        cached_load_time = time.perf_counter() - start

    # Warmup passes so kernel compilation and caches do not skew the timings.
    for _ in range(2):
        model.inference(pil_images[:1], process_res=process_res)
    sync_device(device)
    cleanup()

    # Timed runs over the full batch, synchronized before and after each run.
    times = []
    for _ in range(runs):
        cleanup()
        sync_device(device)
        start = time.perf_counter()
        model.inference(pil_images, process_res=process_res)
        sync_device(device)
        times.append(time.perf_counter() - start)

    avg_time = np.mean(times)
    std_time = np.std(times)
    throughput = len(pil_images) / avg_time

    del model
    cleanup()

    # Restore import state so the next benchmark starts clean.
    sys.path.remove(awesome_src)
    clear_modules()

    return {
        "cold_load": load_time,
        "cached_load": cached_load_time,
        "inference_time": avg_time,
        "inference_std": std_time,
        "throughput": throughput,
    }
|
|
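# Drives all three configurations (upstream, awesome without cache, awesome
# with cache) for every requested batch size on a single device.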
def run_comparison(device, batch_sizes, process_res=504, runs=3):
    """Run comparison for a specific device."""
    results = {}
    # Scratch directory; always removed in the finally block below.
    temp_dir = "temp_compare"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Synthetic 720p test images; each batch reuses a prefix of this list.
        max_batch = max(batch_sizes)
        pil_images = []
        for i in range(max_batch):
            img = Image.new("RGB", (1280, 720), color=(100 + i * 10, 150, 200))
            pil_images.append(img)

        for batch_size in batch_sizes:
            test_images = pil_images[:batch_size]
            results[batch_size] = {}

            print(f"\n Batch size: {batch_size}")
            print(f" {'-' * 50}")

            print(" Testing UPSTREAM (vanilla)...", end=" ", flush=True)
            try:
                upstream = benchmark_upstream(device, test_images, process_res, runs)
                results[batch_size]["upstream"] = upstream
                print(f"{upstream['throughput']:.2f} img/s")
            except Exception as e:
                print(f"ERROR: {e}")
                results[batch_size]["upstream"] = None

            print(" Testing AWESOME (no cache)...", end=" ", flush=True)
            try:
                awesome_nc = benchmark_awesome(device, test_images, process_res, runs, use_cache=False)
                results[batch_size]["awesome_nocache"] = awesome_nc
                print(f"{awesome_nc['throughput']:.2f} img/s")
            except Exception as e:
                print(f"ERROR: {e}")
                results[batch_size]["awesome_nocache"] = None

            print(" Testing AWESOME (cached)...", end=" ", flush=True)
            try:
                awesome_c = benchmark_awesome(device, test_images, process_res, runs, use_cache=True)
                results[batch_size]["awesome_cached"] = awesome_c
                print(f"{awesome_c['throughput']:.2f} img/s")
            except Exception as e:
                print(f"ERROR: {e}")
                results[batch_size]["awesome_cached"] = None

    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

    return results
|
|
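# All speedup columns are oriented so that values above 1.0 favor the
# optimized fork (throughput is awesome/upstream; latency and load times
# are upstream/awesome).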
def print_results_table(results, device):
    """Print formatted results table."""
    print(f"\n{'=' * 70}")
    print(f" RESULTS: {device.type.upper()}")
    print(f"{'=' * 70}")

    print(f"\n{'Batch':<8} {'Metric':<18} {'Upstream':<12} {'Awesome':<12} {'Speedup':<10}")
    print("-" * 60)

    for batch_size, data in sorted(results.items()):
        upstream = data.get("upstream")
        # Prefer the no-cache run for a like-for-like comparison with upstream.
        awesome = data.get("awesome_nocache") or data.get("awesome_cached")

        if not upstream or not awesome:
            continue

        # Throughput (higher is better).
        u_thr = upstream["throughput"]
        a_thr = awesome["throughput"]
        speedup = a_thr / u_thr if u_thr > 0 else 0
        print(f"{batch_size:<8} {'Throughput (img/s)':<18} {u_thr:<12.2f} {a_thr:<12.2f} {speedup:<10.2f}x")

        # Latency (lower is better, so the ratio is inverted).
        u_time = upstream["inference_time"] * 1000
        a_time = awesome["inference_time"] * 1000
        speedup = u_time / a_time if a_time > 0 else 0
        print(f"{'':<8} {'Latency (ms)':<18} {u_time:<12.1f} {a_time:<12.1f} {speedup:<10.2f}x")

        # Cold model-load time.
        u_load = upstream["cold_load"]
        a_load = awesome["cold_load"]
        speedup = u_load / a_load if a_load > 0 else 0
        print(f"{'':<8} {'Cold load (s)':<18} {u_load:<12.2f} {a_load:<12.2f} {speedup:<10.2f}x")

        # Cache-hit load time, when the cached run succeeded.
        cached = data.get("awesome_cached")
        if cached and cached.get("cached_load"):
            c_load = cached["cached_load"]
            speedup = u_load / c_load if c_load > 0 else 0
            print(f"{'':<8} {'Cached load (s)':<18} {'-':<12} {c_load:<12.3f} {speedup:<10.1f}x")

    print()
|
|
def main():
    parser = argparse.ArgumentParser(description="Comparative Benchmark: Awesome vs Upstream")
    parser.add_argument("--device", "-d", type=str, default="auto",
                        choices=["auto", "cpu", "mps", "cuda", "all"],
                        help="Device to benchmark")
    parser.add_argument("--batch-sizes", type=int, nargs="+", default=[1, 2, 4],
                        help="Batch sizes to test")
    parser.add_argument("--runs", type=int, default=3, help="Number of runs per test")
    parser.add_argument("--quick", action="store_true", help="Quick mode (fewer runs)")
    args = parser.parse_args()

    if args.quick:
        args.batch_sizes = [1, 2]
        args.runs = 2

    # Resolve the requested device(s); "auto" picks the best accelerator,
    # since get_available_devices() lists CPU first.
    available = get_available_devices()
    if args.device == "auto":
        devices = [available[-1]]
    elif args.device == "all":
        devices = available
    else:
        requested = torch.device(args.device)
        if requested in available:
            devices = [requested]
        else:
            print(f"Device '{args.device}' not available. Available: {[d.type for d in available]}")
            return

    print("\n" + "=" * 70)
    print(" COMPARATIVE BENCHMARK: AWESOME vs UPSTREAM (VANILLA)")
    print("=" * 70)
    print(f" Model: {MODEL_NAME}")
    print(f" PyTorch: {torch.__version__}")
    print(f" Batch sizes: {args.batch_sizes}")
    print(f" Runs per test: {args.runs}")
    print(f" Devices: {[d.type.upper() for d in devices]}")
    for d in available:
        status = "✓" if d in devices else "✗"
        print(f" {status} {d.type.upper()}: {get_device_name(d)}")
    print("=" * 70)

    all_results = {}

    for device in devices:
        print(f"\n{'#' * 70}")
        print(f" DEVICE: {device.type.upper()} ({get_device_name(device)})")
        print(f"{'#' * 70}")

        results = run_comparison(device, args.batch_sizes, runs=args.runs)
        all_results[device.type] = results
        print_results_table(results, device)

    # Cross-device summary of inference speedups (no-cache vs. upstream).
    print("\n" + "=" * 70)
    print(" SUMMARY")
    print("=" * 70)

    for device_type, results in all_results.items():
        print(f"\n {device_type.upper()}:")

        for batch_size, data in sorted(results.items()):
            upstream = data.get("upstream")
            awesome = data.get("awesome_nocache")

            if upstream and awesome:
                speedup = awesome["throughput"] / upstream["throughput"]
                print(f"  Batch {batch_size}: {speedup:.2f}x faster inference")

    print("\n" + "=" * 70 + "\n")
|
|
if __name__ == "__main__":
    main()