# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Depth Anything 3 API module.
This module provides the main API for Depth Anything 3, including model loading,
inference, and export capabilities. It supports both single and nested model architectures.
"""
from __future__ import annotations
import gc
import time
from typing import Callable, Optional, Sequence
import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from PIL import Image
from depth_anything_3.cache import get_model_cache
from depth_anything_3.cfg import create_object, load_config
from depth_anything_3.registry import MODEL_REGISTRY
from depth_anything_3.specs import Prediction
from depth_anything_3.utils.adaptive_batching import (
AdaptiveBatchConfig,
AdaptiveBatchSizeCalculator,
adaptive_batch_iterator,
estimate_max_batch_size,
)
from depth_anything_3.utils.export import export
from depth_anything_3.utils.geometry import affine_inverse
from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor
from depth_anything_3.utils.io.input_processor import InputProcessor
from depth_anything_3.utils.io.output_processor import OutputProcessor
from depth_anything_3.utils.logger import logger
from depth_anything_3.utils.pose_align import align_poses_umeyama
# Disable cuDNN autotuning: input shapes vary between calls, so benchmark mode
# would repeatedly re-tune and hurt latency.
torch.backends.cudnn.benchmark = False
SAFETENSORS_NAME = "model.safetensors"
CONFIG_NAME = "config.json"
class DepthAnything3(nn.Module, PyTorchModelHubMixin):
"""
Depth Anything 3 main API class.
This class provides a high-level interface for depth estimation using Depth Anything 3.
It supports both single and nested model architectures with metric scaling capabilities.
Features:
- Hugging Face Hub integration via PyTorchModelHubMixin
    - Support for multiple model presets (da3-large, da3-giant, metric and nested variants)
- Automatic mixed precision inference
- Export capabilities for various formats (GLB, PLY, NPZ, etc.)
- Camera pose estimation and metric depth scaling
Usage:
# Load from Hugging Face Hub
model = DepthAnything3.from_pretrained("huggingface/model-name")
        # Or create with a specific preset
        model = DepthAnything3(model_name="da3-giant")
# Run inference
prediction = model.inference(images, export_dir="output", export_format="glb")
"""
_commit_hash: str | None = None # Set by mixin when loading from Hub
    def __init__(
        self,
        model_name: str = "da3-large",
        device: str | torch.device | None = None,
        use_cache: bool = True,
        **kwargs,
    ):
"""
Initialize DepthAnything3 with specified preset.
Args:
model_name: The name of the model preset to use.
Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
device: Target device ('cuda', 'mps', 'cpu'). If None, auto-detect.
use_cache: Whether to use model caching (default: True).
Set to False to force reload model from disk.
**kwargs: Additional keyword arguments (currently unused).
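        Example:
            A minimal construction sketch (the preset must exist in
            ``MODEL_REGISTRY``; device is auto-detected if omitted):

            >>> model = DepthAnything3(model_name="da3-large", device="cpu")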
"""
super().__init__()
self.model_name = model_name
self.use_cache = use_cache
# Determine device
if device is None:
device = self._auto_detect_device()
self.device = torch.device(device) if isinstance(device, str) else device
# Load model configuration
self.config = load_config(MODEL_REGISTRY[self.model_name])
# Build or retrieve model from cache
if use_cache:
cache = get_model_cache()
self.model = cache.get(
model_name=self.model_name,
device=self.device,
loader_fn=lambda: self._create_model()
)
else:
logger.info(f"Model cache disabled, loading {self.model_name} from disk")
self.model = self._create_model()
# Ensure model is on correct device and in eval mode
self.model = self.model.to(self.device)
self.model.eval()
# Initialize processors
# Use GPUInputProcessor for CUDA/MPS devices to enable GPU ops
# Note: NVJPEG decoding is specific to CUDA, MPS will use optimized CPU decoding + GPU resize
if self.device.type in ("cuda", "mps"):
self.input_processor = GPUInputProcessor(device=self.device)
decoding_info = "NVJPEG support enabled" if self.device.type == "cuda" else "TorchVision decoding"
logger.info(f"Using GPUInputProcessor ({decoding_info} on {self.device})")
else:
self.input_processor = InputProcessor()
logger.info("Using standard InputProcessor (optimized CPU pipeline)")
self.output_processor = OutputProcessor()
def _auto_detect_device(self) -> torch.device:
"""Auto-detect best available device."""
if torch.cuda.is_available():
return torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
return torch.device("mps")
else:
return torch.device("cpu")
def _create_model(self) -> nn.Module:
"""Create and return new model instance on correct device."""
model = create_object(self.config)
model = model.to(self.device) # Move to device before caching
model.eval()
return model
@torch.inference_mode()
def forward(
self,
image: torch.Tensor,
extrinsics: torch.Tensor | None = None,
intrinsics: torch.Tensor | None = None,
export_feat_layers: list[int] | None = None,
infer_gs: bool = False,
use_ray_pose: bool = False,
ref_view_strategy: str = "saddle_balanced",
) -> dict[str, torch.Tensor]:
"""
Forward pass through the model.
Args:
image: Input batch with shape ``(B, N, 3, H, W)`` on the model device.
extrinsics: Optional camera extrinsics with shape ``(B, N, 4, 4)``.
intrinsics: Optional camera intrinsics with shape ``(B, N, 3, 3)``.
export_feat_layers: Layer indices to return intermediate features for.
infer_gs: Enable Gaussian Splatting branch.
use_ray_pose: Use ray-based pose estimation instead of camera decoder.
ref_view_strategy: Strategy for selecting reference view from multiple views.
Returns:
Dictionary containing model predictions
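        Example:
            A shape-level sketch (the random input below only illustrates the
            expected ``(B, N, 3, H, W)`` layout; real inputs should be
            ImageNet-normalized):

            >>> model = DepthAnything3(model_name="da3-large")
            >>> imgs = torch.rand(1, 2, 3, 504, 504, device=model.device)
            >>> out = model(imgs)  # dict of tensors; keys depend on the preset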
"""
        # torch.inference_mode() above already disables autograd, so no extra
        # no_grad() context is needed.
        # MPS doesn't support autocast well - use float32 for stability
        if image.device.type == "mps":
            return self.model(
                image, extrinsics, intrinsics, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
            )
        # CUDA/CPU: use autocast for performance
        autocast_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        with torch.autocast(device_type=image.device.type, dtype=autocast_dtype):
            return self.model(
                image, extrinsics, intrinsics, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
            )
def inference(
self,
image: list[np.ndarray | Image.Image | str],
extrinsics: np.ndarray | None = None,
intrinsics: np.ndarray | None = None,
align_to_input_ext_scale: bool = True,
infer_gs: bool = False,
use_ray_pose: bool = False,
ref_view_strategy: str = "saddle_balanced",
render_exts: np.ndarray | None = None,
render_ixts: np.ndarray | None = None,
render_hw: tuple[int, int] | None = None,
process_res: int = 504,
process_res_method: str = "upper_bound_resize",
export_dir: str | None = None,
export_format: str = "mini_npz",
export_feat_layers: Sequence[int] | None = None,
# GLB export parameters
conf_thresh_percentile: float = 40.0,
num_max_points: int = 1_000_000,
show_cameras: bool = True,
# Feat_vis export parameters
feat_vis_fps: int = 15,
# Other export parameters, e.g., gs_ply, gs_video
        export_kwargs: Optional[dict] = None,
) -> Prediction:
"""
Run inference on input images.
Args:
image: List of input images (numpy arrays, PIL Images, or file paths)
extrinsics: Camera extrinsics (N, 4, 4)
intrinsics: Camera intrinsics (N, 3, 3)
            align_to_input_ext_scale: Whether to rescale the prediction to match the
                input pose scale (depth is divided by the estimated scale factor)
infer_gs: Enable the 3D Gaussian branch (needed for `gs_ply`/`gs_video` exports)
use_ray_pose: Use ray-based pose estimation instead of camera decoder (default: False)
            ref_view_strategy: Strategy for selecting reference view from multiple views.
                Options: "first", "middle", "saddle_balanced", "saddle_sim_range".
                Default: "saddle_balanced". For inputs with two or fewer views (S ≤ 2),
                no reordering is performed.
render_exts: Optional render extrinsics for Gaussian video export
render_ixts: Optional render intrinsics for Gaussian video export
render_hw: Optional render resolution for Gaussian video export
process_res: Processing resolution
process_res_method: Resize method for processing
export_dir: Directory to export results
export_format: Export format (mini_npz, npz, glb, ply, gs, gs_video)
export_feat_layers: Layer indices to export intermediate features from
            conf_thresh_percentile: [GLB] Lower percentile for adaptive confidence
                threshold (default: 40.0)
num_max_points: [GLB] Maximum number of points in the point cloud (default: 1,000,000)
show_cameras: [GLB] Show camera wireframes in the exported scene (default: True)
feat_vis_fps: [FEAT_VIS] Frame rate for output video (default: 15)
export_kwargs: additional arguments to export functions.
Returns:
Prediction object containing depth maps and camera parameters
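        Example:
            A minimal end-to-end sketch (file paths and output directory are
            illustrative):

            >>> model = DepthAnything3(model_name="da3-large")
            >>> prediction = model.inference(
            ...     ["frame_000.jpg", "frame_001.jpg"],
            ...     export_dir="output",
            ...     export_format="glb",
            ... )
            >>> prediction.depth.shape  # one depth map per input view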
"""
if "gs" in export_format:
assert infer_gs, "must set `infer_gs=True` to perform gs-related export."
if "colmap" in export_format:
assert isinstance(image[0], str), "`image` must be image paths for COLMAP export."
# Preprocess images
imgs_cpu, extrinsics, intrinsics = self._preprocess_inputs(
image, extrinsics, intrinsics, process_res, process_res_method
)
# Prepare tensors for model
imgs, ex_t, in_t = self._prepare_model_inputs(imgs_cpu, extrinsics, intrinsics)
# Normalize extrinsics
ex_t_norm = self._normalize_extrinsics(ex_t.clone() if ex_t is not None else None)
# Run model forward pass
export_feat_layers = list(export_feat_layers) if export_feat_layers is not None else []
raw_output = self._run_model_forward(
imgs, ex_t_norm, in_t, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
)
# Convert raw output to prediction
prediction = self._convert_to_prediction(raw_output)
        # Align prediction to input extrinsics
prediction = self._align_to_input_extrinsics_intrinsics(
extrinsics, intrinsics, prediction, align_to_input_ext_scale
)
# Add processed images for visualization
prediction = self._add_processed_images(prediction, imgs_cpu)
        # Export if requested
        if export_dir is not None:
            # Copy to avoid mutating a caller-supplied dict (and never share a default)
            export_kwargs = dict(export_kwargs) if export_kwargs else {}
            if "gs" in export_format:
if infer_gs and "gs_video" not in export_format:
export_format = f"{export_format}-gs_video"
if "gs_video" in export_format:
if "gs_video" not in export_kwargs:
export_kwargs["gs_video"] = {}
export_kwargs["gs_video"].update(
{
"extrinsics": render_exts,
"intrinsics": render_ixts,
"out_image_hw": render_hw,
}
)
# Add GLB export parameters
if "glb" in export_format:
if "glb" not in export_kwargs:
export_kwargs["glb"] = {}
export_kwargs["glb"].update(
{
"conf_thresh_percentile": conf_thresh_percentile,
"num_max_points": num_max_points,
"show_cameras": show_cameras,
}
)
# Add Feat_vis export parameters
if "feat_vis" in export_format:
if "feat_vis" not in export_kwargs:
export_kwargs["feat_vis"] = {}
export_kwargs["feat_vis"].update(
{
"fps": feat_vis_fps,
}
)
# Add COLMAP export parameters
if "colmap" in export_format:
if "colmap" not in export_kwargs:
export_kwargs["colmap"] = {}
export_kwargs["colmap"].update(
{
"image_paths": image,
"conf_thresh_percentile": conf_thresh_percentile,
"process_res_method": process_res_method,
}
)
self._export_results(prediction, export_format, export_dir, **export_kwargs)
return prediction
def _preprocess_inputs(
self,
image: list[np.ndarray | Image.Image | str],
extrinsics: np.ndarray | None = None,
intrinsics: np.ndarray | None = None,
process_res: int = 504,
process_res_method: str = "upper_bound_resize",
) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
"""Preprocess input images using input processor."""
start_time = time.time()
# Determine normalization strategy:
# 1. Hybrid (CPU Proc + GPU Device): Skip CPU norm (return uint8), norm on GPU later.
# 2. GPU Proc (NVJPEG/Kornia): Perform norm on GPU immediately.
# 3. Standard CPU: Perform norm on CPU.
perform_norm = True
if self.device.type in ("cuda", "mps") and not isinstance(self.input_processor, GPUInputProcessor):
perform_norm = False
imgs_cpu, extrinsics, intrinsics = self.input_processor(
image,
extrinsics.copy() if extrinsics is not None else None,
intrinsics.copy() if intrinsics is not None else None,
process_res,
process_res_method,
perform_normalization=perform_norm,
)
end_time = time.time()
        logger.info(
            f"Processed Images Done. Time: {end_time - start_time:.3f} seconds. "
            f"Shape: {tuple(imgs_cpu.shape)}"
        )
return imgs_cpu, extrinsics, intrinsics
def _prepare_model_inputs(
self,
imgs_cpu: torch.Tensor,
extrinsics: torch.Tensor | None,
intrinsics: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
"""
Prepare tensors for model input with optimized device transfer.
"""
device = self._get_model_device()
# 1. Handle Image Tensor
# Compare device types (handles cuda:0 vs cuda comparison)
imgs_on_target_device = (imgs_cpu.device.type == device.type)
if imgs_on_target_device:
# Case A: Already on correct device (GPUInputProcessor)
# Ensure correct shape: (B, S, C, H, W) where B=1
imgs = imgs_cpu
if imgs.dim() == 3:
# Single image (C, H, W) -> (1, 1, C, H, W)
imgs = imgs.unsqueeze(0).unsqueeze(0)
elif imgs.dim() == 4:
# Batch of images (N, C, H, W) -> (1, N, C, H, W)
imgs = imgs.unsqueeze(0)
# dim() == 5 means already correct shape
if imgs.dtype == torch.uint8:
# Should not happen with GPUInputProcessor default, but safety fallback
imgs = imgs.float() / 255.0
imgs = InputProcessor.normalize_tensor(
imgs,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
else:
# Case B & C: Needs transfer from CPU
if imgs_cpu.dtype == torch.uint8:
# Hybrid mode: uint8 -> GPU -> float -> normalize
if device.type == "cuda":
imgs_cpu = imgs_cpu.pin_memory()
imgs = imgs_cpu.to(device, non_blocking=True).float() / 255.0
imgs = InputProcessor.normalize_tensor(
imgs,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
imgs = imgs[None] # Add batch dimension (1, N, 3, H, W)
else:
# Standard mode: float -> GPU
if device.type == "cuda":
imgs_cpu = imgs_cpu.pin_memory()
imgs = imgs_cpu.to(device, non_blocking=True)[None].float()
        # Convert camera parameters to tensors with non-blocking transfer
        def _to_batched(t: torch.Tensor | None) -> torch.Tensor | None:
            if t is None:
                return None
            if device.type == "cuda" and t.device.type == "cpu":
                t = t.pin_memory().to(device, non_blocking=True)
            elif t.device != device:
                t = t.to(device, non_blocking=True)
            return t[None].float()  # Add batch dimension (1, N, ...)

        ex_t = _to_batched(extrinsics)
        in_t = _to_batched(intrinsics)
return imgs, ex_t, in_t
def _normalize_extrinsics(self, ex_t: torch.Tensor | None) -> torch.Tensor | None:
"""Normalize extrinsics"""
if ex_t is None:
return None
transform = affine_inverse(ex_t[:, :1])
ex_t_norm = ex_t @ transform
c2ws = affine_inverse(ex_t_norm)
translations = c2ws[..., :3, 3]
dists = translations.norm(dim=-1)
median_dist = torch.median(dists)
median_dist = torch.clamp(median_dist, min=1e-1)
ex_t_norm[..., :3, 3] = ex_t_norm[..., :3, 3] / median_dist
return ex_t_norm
def _align_to_input_extrinsics_intrinsics(
self,
extrinsics: torch.Tensor | None,
intrinsics: torch.Tensor | None,
prediction: Prediction,
align_to_input_ext_scale: bool = True,
ransac_view_thresh: int = 10,
) -> Prediction:
"""Align depth map to input extrinsics"""
if extrinsics is None:
return prediction
prediction.intrinsics = intrinsics.numpy()
_, _, scale, aligned_extrinsics = align_poses_umeyama(
prediction.extrinsics,
extrinsics.numpy(),
ransac=len(extrinsics) >= ransac_view_thresh,
return_aligned=True,
random_state=42,
)
if align_to_input_ext_scale:
prediction.extrinsics = extrinsics[..., :3, :].numpy()
prediction.depth /= scale
else:
prediction.extrinsics = aligned_extrinsics
return prediction
def _run_model_forward(
self,
imgs: torch.Tensor,
ex_t: torch.Tensor | None,
in_t: torch.Tensor | None,
export_feat_layers: Sequence[int] | None = None,
infer_gs: bool = False,
use_ray_pose: bool = False,
ref_view_strategy: str = "saddle_balanced",
) -> dict[str, torch.Tensor]:
"""Run model forward pass."""
device = imgs.device
need_sync = device.type == "cuda"
if need_sync:
torch.cuda.synchronize(device)
start_time = time.time()
feat_layers = list(export_feat_layers) if export_feat_layers is not None else None
output = self.forward(imgs, ex_t, in_t, feat_layers, infer_gs, use_ray_pose, ref_view_strategy)
if need_sync:
torch.cuda.synchronize(device)
end_time = time.time()
logger.info(f"Model Forward Pass Done. Time: {end_time - start_time} seconds")
return output
def _convert_to_prediction(self, raw_output: dict[str, torch.Tensor]) -> Prediction:
"""Convert raw model output to Prediction object."""
start_time = time.time()
output = self.output_processor(raw_output)
end_time = time.time()
logger.info(f"Conversion to Prediction Done. Time: {end_time - start_time} seconds")
return output
def _add_processed_images(self, prediction: Prediction, imgs_cpu: torch.Tensor) -> Prediction:
"""Add processed images to prediction for visualization."""
        # Convert from (N, 3, H, W) to (N, H, W, 3)
        processed_imgs = imgs_cpu.permute(0, 2, 3, 1).cpu().numpy()  # (N, H, W, 3)
        if imgs_cpu.dtype != torch.uint8:
            # Denormalize from ImageNet normalization back to uint8 RGB
            mean = np.array([0.485, 0.456, 0.406])
            std = np.array([0.229, 0.224, 0.225])
            processed_imgs = processed_imgs * std + mean
            processed_imgs = np.clip(processed_imgs, 0, 1)
            processed_imgs = (processed_imgs * 255).astype(np.uint8)
prediction.processed_images = processed_imgs
return prediction
def _export_results(
self, prediction: Prediction, export_format: str, export_dir: str, **kwargs
) -> None:
"""Export results to specified format and directory."""
start_time = time.time()
export(prediction, export_format, export_dir, **kwargs)
end_time = time.time()
logger.info(f"Export Results Done. Time: {end_time - start_time} seconds")
def _get_model_device(self) -> torch.device:
"""
Get the device where the model is located.
Returns:
Device where the model parameters are located
Raises:
ValueError: If no tensors are found in the model
"""
if self.device is not None:
return self.device
# Find device from parameters
for param in self.parameters():
self.device = param.device
return param.device
# Find device from buffers
for buffer in self.buffers():
self.device = buffer.device
return buffer.device
raise ValueError("No tensor found in model")
# =========================================================================
# Adaptive Batching Methods
# =========================================================================
def batch_inference(
self,
images: list[np.ndarray | Image.Image | str],
process_res: int = 504,
batch_size: int | str = "auto",
max_batch_size: int = 64,
target_memory_utilization: float = 0.85,
        progress_callback: Callable[[int, int], None] | None = None,
) -> list[Prediction]:
"""
Run inference on multiple images with adaptive batching.
This method automatically determines optimal batch sizes based on
available GPU memory, maximizing throughput while preventing OOM errors.
Args:
images: List of input images (numpy arrays, PIL Images, or file paths)
process_res: Processing resolution (default: 504)
batch_size: Batch size or "auto" for adaptive batching (default: "auto")
max_batch_size: Maximum batch size when using adaptive batching (default: 64)
target_memory_utilization: Target GPU memory usage 0.0-1.0 (default: 0.85)
progress_callback: Optional callback(processed, total) for progress updates
Returns:
List of Prediction objects, one per batch
Example:
>>> model = DepthAnything3(model_name="da3-large")
>>> images = ["img1.jpg", "img2.jpg", ..., "img100.jpg"]
>>>
>>> # Adaptive batching (recommended)
>>> results = model.batch_inference(images, process_res=518)
>>>
>>> # Fixed batch size
>>> results = model.batch_inference(images, batch_size=4)
>>>
>>> # With progress callback
>>> def on_progress(done, total):
... print(f"Processed {done}/{total}")
>>> results = model.batch_inference(images, progress_callback=on_progress)
"""
num_images = len(images)
if num_images == 0:
return []
results: list[Prediction] = []
# Determine batch size
if batch_size == "auto":
config = AdaptiveBatchConfig(
max_batch_size=max_batch_size,
target_memory_utilization=target_memory_utilization,
)
calculator = AdaptiveBatchSizeCalculator(
model_name=self.model_name,
device=self.device,
config=config,
)
for batch_info in adaptive_batch_iterator(images, calculator, process_res):
# Run inference on this batch
prediction = self.inference(
image=batch_info.items,
process_res=process_res,
)
results.append(prediction)
# Progress callback
if progress_callback:
progress_callback(batch_info.end_idx, num_images)
# Memory cleanup between batches
if not batch_info.is_last:
gc.collect()
if self.device.type == "cuda":
torch.cuda.empty_cache()
elif self.device.type == "mps":
torch.mps.empty_cache()
# Update profiling data for better estimates
if calculator.config.enable_profiling and self.device.type == "cuda":
memory_used = torch.cuda.max_memory_allocated(self.device) / (1024 * 1024)
calculator.update_from_profiling(
batch_size=batch_info.batch_size,
memory_used_mb=memory_used,
process_res=process_res,
)
torch.cuda.reset_peak_memory_stats(self.device)
else:
# Fixed batch size
fixed_batch_size = int(batch_size)
for i in range(0, num_images, fixed_batch_size):
end_idx = min(i + fixed_batch_size, num_images)
batch_images = images[i:end_idx]
prediction = self.inference(
image=batch_images,
process_res=process_res,
)
results.append(prediction)
if progress_callback:
progress_callback(end_idx, num_images)
# Memory cleanup
if end_idx < num_images:
gc.collect()
if self.device.type == "cuda":
torch.cuda.empty_cache()
elif self.device.type == "mps":
torch.mps.empty_cache()
return results
def get_optimal_batch_size(
self,
process_res: int = 504,
target_utilization: float = 0.85,
) -> int:
"""
Get the optimal batch size for current GPU memory state.
Args:
process_res: Processing resolution (default: 504)
target_utilization: Target GPU memory usage 0.0-1.0 (default: 0.85)
Returns:
Recommended batch size
Example:
>>> model = DepthAnything3(model_name="da3-large")
>>> batch_size = model.get_optimal_batch_size(process_res=518)
>>> print(f"Optimal batch size: {batch_size}")
"""
return estimate_max_batch_size(
model_name=self.model_name,
device=self.device,
process_res=process_res,
target_utilization=target_utilization,
)
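

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the library API. It assumes the
    # "da3-large" preset resolves via MODEL_REGISTRY and that the image paths
    # passed on the command line (or the illustrative defaults) exist locally.
    import sys

    paths = sys.argv[1:] or ["example_000.jpg", "example_001.jpg"]
    da3 = DepthAnything3(model_name="da3-large")
    pred = da3.inference(paths, export_dir="output", export_format="glb")
    logger.info(f"Depth shape: {pred.depth.shape}")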