# app.py (safe CPU startup for HF Spaces)
import os
from typing import Optional

import numpy as np
import torch
from PIL import Image, ImageDraw
import gradio as gr

# Import the CPU-patched class you added earlier
from depth_anything_3.api import DepthAnything3

# ---------------------------
# Configuration
# ---------------------------
# Keep the same model path you used earlier (the default matches your logs)
MODEL_DIR = os.environ.get("DA3_MODEL_DIR", "depth-anything/DA3NESTED-GIANT-LARGE")

# Lower processing resolution to make CPU inference feasible.
# Increase for better quality, but expect inference to be much slower.
PROCESS_RES = int(os.environ.get("DA3_PROCESS_RES", "384"))

# ---------------------------
# Model loading (CPU)
# ---------------------------
print(f"🔄 Loading DepthAnything3 from '{MODEL_DIR}' on CPU (this may take a moment)...")

# Uses the PyTorchModelHubMixin.from_pretrained provided by the class
model = DepthAnything3.from_pretrained(MODEL_DIR)
model.to(torch.device("cpu"))
model.eval()
print("✅ Model ready on CPU")

# ---------------------------
# Inference helper
# ---------------------------
def _normalize_depth_to_uint8(depth: np.ndarray) -> Optional[np.ndarray]:
    """Normalize a depth map (H, W) to uint8 grayscale for display."""
    if depth is None:
        return None
    # Convert to float and clear NaNs / infs before normalizing
    d = depth.astype(np.float32)
    d = np.nan_to_num(d, nan=0.0, posinf=0.0, neginf=0.0)
    # Normalize robustly: use the 1st and 99th percentiles to suppress outliers
    vmin = np.percentile(d, 1.0)
    vmax = np.percentile(d, 99.0)
    if vmax - vmin < 1e-6:
        vmax = vmin + 1.0
    d = np.clip((d - vmin) / (vmax - vmin), 0.0, 1.0)
    return (d * 255.0).astype(np.uint8)


def run_depth(single_img: Image.Image, process_res: int = PROCESS_RES):
    """
    Run single-image depth inference with the patched DepthAnything3 API.
    Returns a grayscale PIL image visualizing depth.
    """
    if single_img is None:
        return None

    # DepthAnything3 accepts PIL images directly; no numpy conversion needed.
    try:
        # Use the API's inference function; we pass a single-image list.
        # Keep other args minimal to avoid heavy processing.
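        # Input hygiene (an added sketch, not part of the original API contract):
        # Gradio sliders can hand back floats, and uploads may be RGBA or
        # grayscale. Assumption: the DA3 pipeline expects an int resolution and
        # an RGB image; both conversions are harmless if it already normalizes
        # inputs itself.
        process_res = int(process_res)
        single_img = single_img.convert("RGB")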
        # inference_mode avoids autograd overhead on CPU; safe even if the
        # API already disables gradients internally.
        with torch.inference_mode():
            pred = model.inference(
                [single_img],
                process_res=process_res,
                process_res_method="upper_bound_resize",
                export_format="mini_npz",  # minimal export
            )
    except Exception as e:
        # If inference raises, return an image carrying the error message
        msg = f"Inference error: {e}"
        print(msg)
        err_img = Image.new("RGB", (640, 120), color=(255, 255, 255))
        ImageDraw.Draw(err_img).text((10, 10), msg, fill=(200, 0, 0))
        return err_img

    # Extract depth from the Prediction object; handle a few possible shapes / attrs
    depth_map = None
    if hasattr(pred, "depth"):
        # Attribute access is the common pattern in this codebase
        depth_map = pred.depth
    elif isinstance(pred, dict) and "depth" in pred:
        depth_map = pred["depth"]
    elif hasattr(pred, "predictions") and len(pred.predictions) > 0:
        # Fallback: some wrappers store a list of per-image predictions
        first = pred.predictions[0]
        depth_map = first.depth if hasattr(first, "depth") else None

    if depth_map is None:
        # Fallback: show the first processed image, if available, as a visual sanity check
        try:
            if hasattr(pred, "processed_images"):
                imgs = pred.processed_images
                if isinstance(imgs, np.ndarray) and imgs.shape[0] > 0:
                    return Image.fromarray((imgs[0] * 255).astype(np.uint8))
        except Exception:
            pass
        # Nothing usable
        print("No depth found in prediction; returning empty image.")
        return Image.new("RGB", (640, 480), color=(255, 255, 255))

    # Reduce to a single (H, W) array: unwrap lists, convert tensors first,
    # then drop batch / channel dimensions
    if isinstance(depth_map, (list, tuple)):
        depth_map = depth_map[0]
    if isinstance(depth_map, torch.Tensor):
        depth_map = depth_map.cpu().numpy()
    if isinstance(depth_map, np.ndarray) and depth_map.ndim == 3 and depth_map.shape[0] == 1:
        # Batched shape (1, H, W)
        depth_map = depth_map[0]
    if depth_map.ndim == 3 and depth_map.shape[0] == 3:
        # If somehow 3-channel, average down to a single channel
        depth_map = depth_map.mean(axis=0)

    depth_uint8 = _normalize_depth_to_uint8(depth_map)
    if depth_uint8 is None:
        return Image.new("RGB", (640, 480), color=(255, 255, 255))

    # Return a grayscale PIL image
    return Image.fromarray(depth_uint8, mode="L")


# ---------------------------
# Gradio interface
# ---------------------------
title = "Depth Anything 3 — CPU (single-image)"
description = (
    "CPU-only minimal interface. Upload a single image and get a quick depth visualization.\n"
    "This Space is intentionally lightweight so it can start on CPU. For higher quality or "
    "multi-view features you need a GPU and the full app."
)

# Make the Gradio Interface the top-level `app` variable so HF Spaces detects it
app = gr.Interface(
    fn=run_depth,
    inputs=[
        gr.Image(type="pil", label="Upload image"),
        gr.Slider(
            minimum=128,
            maximum=1024,
            step=64,
            value=PROCESS_RES,
            label="Process resolution (smaller = faster)",
        ),
    ],
    outputs=gr.Image(label="Predicted depth (grayscale)"),
    title=title,
    description=description,
)

# For local running
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)
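# ---------------------------
# Local smoke test (optional)
# ---------------------------
# A minimal sketch for exercising run_depth without the UI. Assumes a local
# file named "test.jpg" exists; adjust the path before running.
#
#   from PIL import Image
#   depth = run_depth(Image.open("test.jpg"), process_res=256)
#   depth.save("depth_test.png")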