File size: 5,629 Bytes
b15831b
4845d25
b15831b
 
d3d9a93
b15831b
d3d9a93
4845d25
b15831b
 
1ecbfb8
b15831b
 
 
 
d3d9a93
 
b15831b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3d9a93
b15831b
 
d3d9a93
b15831b
d3d9a93
 
b15831b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3d9a93
b15831b
 
d3d9a93
b15831b
 
 
 
 
d3d9a93
 
 
1ecbfb8
b15831b
1ecbfb8
b15831b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# app.py (safe CPU startup for HF Spaces)
# Minimal single-image depth demo: loads the model once at import time on CPU,
# then serves it through the Gradio interface defined at the bottom of the file.
import os
import io
import numpy as np
import torch
from PIL import Image
import gradio as gr

# Import the CPU-patched class you added earlier
# NOTE(review): project-local dependency — this module must be importable on a
# CPU-only box for the Space to start.
from depth_anything_3.api import DepthAnything3

# ---------------------------
# Configuration
# ---------------------------
# Keep the same model path you used earlier (default is the one in your logs)
# Overridable via the DA3_MODEL_DIR environment variable.
MODEL_DIR = os.environ.get("DA3_MODEL_DIR", "depth-anything/DA3NESTED-GIANT-LARGE")

# Lower processing resolution to make CPU inference feasible.
# Increase if you want better quality but expect it to be much slower.
# Overridable via DA3_PROCESS_RES; also the default of the UI slider below.
PROCESS_RES = int(os.environ.get("DA3_PROCESS_RES", "384"))

# ---------------------------
# Model loading (CPU)
# ---------------------------
# Runs at import time so HF Spaces downloads/loads the weights exactly once;
# any failure here aborts startup rather than failing per-request.
print(f"🔄 Loading DepthAnything3 from '{MODEL_DIR}' on CPU (this may take a moment)...")
# Uses the PyTorchModelHubMixin.from_pretrained you have in the class
model = DepthAnything3.from_pretrained(MODEL_DIR)
model.to(torch.device("cpu"))
model.eval()  # inference mode: disable dropout/batch-norm updates
print("✅ Model ready on CPU")

# ---------------------------
# Inference helper
# ---------------------------
def _normalize_depth_to_uint8(depth: np.ndarray) -> np.ndarray:
    """Normalize a depth map (H,W) to uint8 grayscale for display."""
    if depth is None:
        return None
    # convert to float
    d = depth.astype(np.float32)
    # clip NaNs / infs
    d = np.nan_to_num(d, nan=0.0, posinf=0.0, neginf=0.0)
    # Normalize robustly: use 1st and 99th percentiles to avoid outliers
    vmin = np.percentile(d, 1.0)
    vmax = np.percentile(d, 99.0)
    if vmax - vmin < 1e-6:
        vmax = vmin + 1.0
    d = (d - vmin) / (vmax - vmin)
    d = np.clip(d, 0.0, 1.0)
    img = (d * 255.0).astype(np.uint8)
    return img

def _extract_depth_map(pred):
    """Pull a depth array out of a prediction object, tolerating several layouts."""
    # Common case: attribute on the Prediction object itself.
    if hasattr(pred, "depth"):
        return pred.depth
    # Some wrappers return plain dicts.
    if isinstance(pred, dict) and "depth" in pred:
        return pred["depth"]
    # Fallback: some wrappers store a list of per-image predictions.
    if hasattr(pred, "predictions") and len(pred.predictions) > 0:
        first = pred.predictions[0]
        return first.depth if hasattr(first, "depth") else None
    return None


def run_depth(single_img: Image.Image, process_res: int = PROCESS_RES):
    """Run single-image depth inference with the patched DepthAnything3 API.

    Args:
        single_img: Input PIL image; ``None`` short-circuits to ``None``.
        process_res: Target processing resolution (smaller = faster on CPU).

    Returns:
        A grayscale PIL image visualizing depth, a fallback RGB image when
        inference fails or produces no depth, or ``None`` for no input.
    """
    if single_img is None:
        return None

    try:
        # Single-image batch; keep other args minimal to avoid heavy processing.
        pred = model.inference(
            [single_img],
            process_res=process_res,
            process_res_method="upper_bound_resize",
            export_format="mini_npz",  # minimal export
        )
    except Exception as e:
        msg = f"Inference error: {e}"
        print(msg)
        # Render the error text onto the image so the UI actually shows it
        # (the original returned a blank white image despite promising text).
        from PIL import ImageDraw
        err_img = Image.new("RGB", (640, 120), color=(255, 255, 255))
        ImageDraw.Draw(err_img).text((10, 10), msg, fill=(0, 0, 0))
        return err_img

    depth_map = _extract_depth_map(pred)

    if depth_map is None:
        # Fallback: show the first processed image as a visual sanity check.
        try:
            if hasattr(pred, "processed_images"):
                imgs = pred.processed_images
                if isinstance(imgs, np.ndarray) and imgs.shape[0] > 0:
                    first = imgs[0]
                    if first.dtype != np.uint8:
                        # presumably floats in [0, 1] — scale and clip so an
                        # out-of-range value cannot wrap around in uint8
                        first = np.clip(first * 255.0, 0.0, 255.0).astype(np.uint8)
                    return Image.fromarray(first)
        except Exception:
            pass
        # Nothing usable anywhere in the prediction.
        print("No depth found in prediction; returning empty image.")
        return Image.new("RGB", (640, 480), color=(255, 255, 255))

    # Normalize container/tensor layouts down to a single (H, W) array.
    if isinstance(depth_map, (list, tuple)):
        depth_map = depth_map[0]
    if isinstance(depth_map, torch.Tensor):
        # Convert BEFORE the shape checks: the original converted after the
        # ndarray-only squeeze, so a batched (1,H,W) tensor slipped through
        # un-squeezed and broke Image.fromarray(mode="L") downstream.
        depth_map = depth_map.detach().cpu().numpy()
    if isinstance(depth_map, np.ndarray) and depth_map.ndim == 3 and depth_map.shape[0] == 1:
        depth_map = depth_map[0]  # squeeze (1, H, W) batch axis
    if depth_map.ndim == 3 and depth_map.shape[0] == 3:
        # Unexpected 3-channel map: collapse to one plane by averaging.
        depth_map = depth_map.mean(axis=0)

    depth_uint8 = _normalize_depth_to_uint8(depth_map)
    if depth_uint8 is None:
        return Image.new("RGB", (640, 480), color=(255, 255, 255))

    # Grayscale PIL image for the Gradio output component.
    return Image.fromarray(depth_uint8, mode="L")

# ---------------------------
# Gradio interface
# ---------------------------
# UI copy shown above the interface.
title = "Depth Anything 3 — CPU (single-image)"
description = (
    "CPU-only minimal interface. Upload a single image and get a quick depth visualization.\n"
    "This Space is intentionally lightweight to allow CPU startup. For better quality/multiview features you need GPU or the full app."
)

# Make the Gradio Interface the top-level `app` variable so HF Spaces detects it
app = gr.Interface(
    fn=run_depth,
    inputs=[
        # Positional mapping: image -> run_depth's single_img,
        # slider -> run_depth's process_res.
        gr.Image(type="pil", label="Upload image"),
        gr.Slider(minimum=128, maximum=1024, step=64, value=PROCESS_RES, label="Process resolution (smaller = faster)")
    ],
    outputs=gr.Image(label="Predicted depth (grayscale)"),
    title=title,
    description=description,
)

# For local running
# (HF Spaces imports this module and serves `app` itself; this guard only
# fires when the file is executed directly.)
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)