File size: 4,824 Bytes
191a797
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
Real AI Depth Estimation using Hugging Face Transformers
Uses Depth-Anything V2 directly (no ONNX conversion needed!)
"""

import numpy as np
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

class TransformersDepthEstimator:
    """
    Depth estimation using Hugging Face Transformers.

    Loads Depth-Anything V2 directly as a PyTorch model — easier than ONNX,
    no conversion step needed.
    """

    # Short size name -> Hugging Face model repository id.
    _MODEL_MAP = {
        "small": "depth-anything/Depth-Anything-V2-Small-hf",
        "base": "depth-anything/Depth-Anything-V2-Base-hf",
        "large": "depth-anything/Depth-Anything-V2-Large-hf",
    }

    def __init__(self, model_size="small", device=None, cache_dir=None):
        """
        Initialize the depth estimator and load the model.

        Args:
            model_size: "small", "base", or "large"
            device: "cuda", "cpu", or None (auto-detect)
            cache_dir: Where to cache models (default: project folder)

        Raises:
            ValueError: If model_size is not one of the supported sizes.
        """
        # Validate up front, before any expensive work (device probe,
        # directory creation, model download).
        if model_size not in self._MODEL_MAP:
            raise ValueError(f"Invalid model_size. Choose from: {list(self._MODEL_MAP.keys())}")

        self.model_size = model_size

        # Auto-detect device if not specified
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        # Default cache lives inside the project tree so downloaded weights
        # stay with the checkout instead of the user-global HF cache.
        if cache_dir is None:
            from pathlib import Path
            cache_dir = Path(__file__).parent.parent / "models" / "cache" / "huggingface"
            cache_dir.mkdir(parents=True, exist_ok=True)
            cache_dir = str(cache_dir)

        print(f"[*] Loading Depth-Anything V2 {model_size.upper()} model...")
        print(f"[*] Device: {self.device.upper()}")
        print(f"[*] Cache dir: {cache_dir}")

        repo_id = self._MODEL_MAP[model_size]

        # Load processor and model with custom cache directory
        self.processor = AutoImageProcessor.from_pretrained(
            repo_id,
            cache_dir=cache_dir
        )
        self.model = AutoModelForDepthEstimation.from_pretrained(
            repo_id,
            cache_dir=cache_dir
        )

        # Move model to device and switch to inference mode (disables
        # dropout/batch-norm training behavior).
        self.model.to(self.device)
        self.model.eval()

        print(f"[+] Model loaded successfully!")
        print(f"[+] Cached in: {cache_dir}")

    def predict(self, image):
        """
        Predict depth map for an image.

        Args:
            image: numpy array (H, W, 3) in RGB format, or a PIL Image

        Returns:
            depth: numpy array (H, W) with relative depth values in [0, 1].
                   A constant prediction yields an all-zeros map instead of NaNs.
        """
        # Convert numpy to PIL if needed
        if isinstance(image, np.ndarray):
            image_pil = Image.fromarray(image)
        else:
            image_pil = image

        # Preprocess and move tensors to the model's device.
        inputs = self.processor(images=image_pil, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Inference without gradient tracking.
        with torch.no_grad():
            outputs = self.model(**inputs)
            predicted_depth = outputs.predicted_depth

        # Resize back to the original resolution. PIL's .size is (W, H)
        # while interpolate expects (H, W), hence the reversal.
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=image_pil.size[::-1],
            mode="bicubic",
            align_corners=False,
        )

        depth = prediction.squeeze().cpu().numpy()

        # Normalize to [0, 1]; guard against a constant map (e.g. a flat
        # input image), which would otherwise divide by zero and emit NaNs.
        depth_min = depth.min()
        depth_range = depth.max() - depth_min
        if depth_range > 0:
            depth = (depth - depth_min) / depth_range
        else:
            depth = np.zeros_like(depth)

        return depth


# Smoke test: load the small model and run one forward pass on random data.
if __name__ == "__main__":
    import time

    print("=" * 70)
    print("  Testing Depth-Anything V2 with Transformers")
    print("=" * 70)

    # Create estimator
    estimator = TransformersDepthEstimator(model_size="small")

    # Create test image (518x518 matches the model's native input size).
    print("[*] Creating test image...")
    test_image = np.random.randint(0, 255, (518, 518, 3), dtype=np.uint8)

    # Predict depth and time the call.
    print("[*] Running depth estimation...")
    start = time.time()
    depth = estimator.predict(test_image)
    elapsed = (time.time() - start) * 1000

    print(f"[+] Depth estimation complete!")
    print(f"[+] Processing time: {elapsed:.2f}ms")
    print(f"[+] Output shape: {depth.shape}")
    print(f"[+] Depth range: [{depth.min():.3f}, {depth.max():.3f}]")

    print("\n" + "=" * 70)
    print("  SUCCESS! Real AI Depth Estimation Working!")
    print("=" * 70)
    print("\nYou can now use real AI depth estimation!")
    print("\nTo use in your app:")
    print("  from backend.utils.transformers_depth import TransformersDepthEstimator")
    print("  estimator = TransformersDepthEstimator('small')")
    print("  depth = estimator.predict(image)")
    print("=" * 70)