Spaces:

Aedelon
/

rgbd-depth

Sleeping

App Files Community

github-actions[bot] committed on Nov 26, 2025

Commit

b4dbe35

1 Parent(s): 0567380

Sync from GitHub: f84ca3dac3c962b2c71c590ad187e2352331038b

Browse files

Files changed (2) hide show

README.md +1 -1
app.py +41 -94

README.md CHANGED Viewed

@@ -26,7 +26,7 @@ Optimized Python package for RGB-D depth refinement using Vision Transformer enc
 [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/Aedelon/rgbd-depth)
-Try **rgbd-depth** directly in your browser with our interactive Gradio demo—no installation required.
 **Available on Hugging Face Spaces:** Upload your RGB and depth images, adjust parameters (camera model, precision, resolution), and get refined depth maps instantly. Models are automatically downloaded from Hugging Face Hub on first use.

 [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/Aedelon/rgbd-depth)
+Try **rgbd-depth** directly in your browser with our interactive Gradio demo—no installation required. Upload your images and refine depth maps instantly.
 **Available on Hugging Face Spaces:** Upload your RGB and depth images, adjust parameters (camera model, precision, resolution), and get refined depth maps instantly. Models are automatically downloaded from Hugging Face Hub on first use.

app.py CHANGED Viewed

@@ -8,102 +8,48 @@ import gradio as gr
 import numpy as np
 import torch
 from PIL import Image
-from pathlib import Path
 from rgbddepth import RGBDDepth
 # Global model cache
 MODELS = {}
-# Model mappings from HuggingFace (all are vitl encoder)
-# Format: "camera_model": ("repo_id", "checkpoint_filename")
-HF_MODELS = {
-    "d435": ("depth-anything/camera-depth-model-d435", "cdm_d435.ckpt"),
-    "d405": ("depth-anything/camera-depth-model-d405", "cdm_d405.ckpt"),
-    "l515": ("depth-anything/camera-depth-model-l515", "cdm_l515.ckpt"),
-    "zed2i": ("depth-anything/camera-depth-model-zed2i", "cdm_zed2i.ckpt"),
-}
-# Default model
-DEFAULT_MODEL = "d435"
-def download_model(camera_model: str = DEFAULT_MODEL):
-    """Download model from HuggingFace Hub."""
-    try:
-        from huggingface_hub import hf_hub_download
-        repo_id, filename = HF_MODELS.get(camera_model, HF_MODELS[DEFAULT_MODEL])
-        print(f"📥 Downloading {camera_model} model from {repo_id}/{filename}...")
-        # Download the checkpoint
-        checkpoint_path = hf_hub_download(
-            repo_id=repo_id,
-            filename=filename,
-            cache_dir=".cache"
-        )
-        print(f"✓ Downloaded to {checkpoint_path}")
-        return checkpoint_path
-    except Exception as e:
-        print(f"❌ Failed to download model: {e}")
-        return None
-def load_model(camera_model: str = DEFAULT_MODEL, use_xformers: bool = False):
-    """Load model with automatic download from HuggingFace."""
-    cache_key = f"{camera_model}_{use_xformers}"
     if cache_key not in MODELS:
-        # All HF models use vitl encoder
-        config = {
-            "encoder": "vitl",
-            "features": 256,
-            "out_channels": [256, 512, 1024, 1024],
-            "use_xformers": use_xformers,
         }
-        model = RGBDDepth(**config)
-        # Try to load weights
-        checkpoint_path = None
-        # 1. Try local checkpoints/ directory first
-        local_path = Path(f"checkpoints/{camera_model}.pt")
-        if local_path.exists():
-            checkpoint_path = str(local_path)
-            print(f"✓ Using local checkpoint: {checkpoint_path}")
-        else:
-            # 2. Download from HuggingFace
-            checkpoint_path = download_model(camera_model)
-        # Load checkpoint if available
-        if checkpoint_path:
-            try:
-                checkpoint = torch.load(checkpoint_path, map_location="cpu")
-                if "model" in checkpoint:
-                    states = {k[7:]: v for k, v in checkpoint["model"].items()}
-                elif "state_dict" in checkpoint:
-                    states = {k[9:]: v for k, v in checkpoint["state_dict"].items()}
-                else:
-                    states = checkpoint
-                model.load_state_dict(states, strict=False)
-                print(f"✓ Loaded checkpoint for {camera_model}")
-            except Exception as e:
-                print(f"⚠ Failed to load checkpoint: {e}, using random weights")
-        else:
-            print(f"⚠ No checkpoint available for {camera_model}, using random weights (demo only)")
-        # Move to GPU if available (CUDA or MPS for macOS)
-        if torch.cuda.is_available():
-            device = "cuda"
-        elif torch.backends.mps.is_available():
-            device = "mps"
-        else:
-            device = "cpu"
         model = model.to(device).eval()
         MODELS[cache_key] = model
@@ -114,7 +60,7 @@ def load_model(camera_model: str = DEFAULT_MODEL, use_xformers: bool = False):
 def process_depth(
     rgb_image: np.ndarray,
     depth_image: np.ndarray,
-    camera_model: str = DEFAULT_MODEL,
     input_size: int = 518,
     depth_scale: float = 1000.0,
     max_depth: float = 25.0,
@@ -127,7 +73,7 @@ def process_depth(
     Args:
         rgb_image: RGB image as numpy array [H, W, 3]
         depth_image: Depth image as numpy array [H, W] or [H, W, 3]
-        camera_model: Camera model to use (d435, d405, l515, zed2i)
         input_size: Input size for inference
         depth_scale: Scale factor for depth values
         max_depth: Maximum valid depth value
@@ -159,7 +105,7 @@ def process_depth(
         simi_depth[valid_mask] = 1.0 / depth_normalized[valid_mask]
         # Load model
-        model = load_model(camera_model, use_xformers and torch.cuda.is_available())
         device = next(model.parameters()).device
         # Determine precision
@@ -209,7 +155,7 @@ def process_depth(
         info = f"""
 ✅ **Refinement complete!**
-**Camera Model:** {camera_model.upper()}
 **Precision:** {precision.upper()}
 **Device:** {device.type.upper()}
 **Input size:** {input_size}px
@@ -230,9 +176,10 @@ with gr.Blocks(title="rgbd-depth Demo") as demo:
     High-quality depth map refinement using Vision Transformers. Based on [ByteDance's camera-depth-models](https://manipulation-as-in-simulation.github.io/).
-    📥 **Models are automatically downloaded from Hugging Face on first use!**
-    Choose your camera model (D435, D405, L515, or ZED 2i) and the trained weights will be downloaded automatically.
     """)
     with gr.Row():
@@ -252,11 +199,11 @@ with gr.Blocks(title="rgbd-depth Demo") as demo:
             )
             with gr.Accordion("⚙️ Advanced Settings", open=False):
-                camera_choice = gr.Dropdown(
-                    choices=["d435", "d405", "l515", "zed2i"],
-                    value=DEFAULT_MODEL,
-                    label="Camera Model",
-                    info="Choose the camera model for trained weights (auto-downloads from HF)",
                 )
                 input_size = gr.Slider(
@@ -329,7 +276,7 @@ with gr.Blocks(title="rgbd-depth Demo") as demo:
         inputs=[
             rgb_input,
             depth_input,
-            camera_choice,
             input_size,
             depth_scale,
             max_depth,

 import numpy as np
 import torch
 from PIL import Image
 from rgbddepth import RGBDDepth
 # Global model cache
 MODELS = {}
+def load_model(encoder: str, use_xformers: bool = False):
+    """Load model with caching."""
+    cache_key = f"{encoder}_{use_xformers}"
     if cache_key not in MODELS:
+        # Model configs
+        configs = {
+            "vits": {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384]},
+            "vitb": {"encoder": "vitb", "features": 128, "out_channels": [96, 192, 384, 768]},
+            "vitl": {"encoder": "vitl", "features": 256, "out_channels": [256, 512, 1024, 1024]},
+            "vitg": {"encoder": "vitg", "features": 384, "out_channels": [1536, 1536, 1536, 1536]},
         }
+        config = configs[encoder].copy()
+        config["use_xformers"] = use_xformers
+        model = RGBDDepth(**config)
+        # Try to load weights if checkpoint exists
+        try:
+            checkpoint = torch.load(f"checkpoints/{encoder}.pt", map_location="cpu")
+            if "model" in checkpoint:
+                states = {k[7:]: v for k, v in checkpoint["model"].items()}
+            elif "state_dict" in checkpoint:
+                states = {k[9:]: v for k, v in checkpoint["state_dict"].items()}
+            else:
+                states = checkpoint
+            model.load_state_dict(states, strict=False)
+            print(f"✓ Loaded checkpoint for {encoder}")
+        except FileNotFoundError:
+            print(f"⚠ No checkpoint found for {encoder}, using random weights (demo only)")
+        # Move to GPU if available
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         model = model.to(device).eval()
         MODELS[cache_key] = model
 def process_depth(
     rgb_image: np.ndarray,
     depth_image: np.ndarray,
+    encoder: str = "vitl",
     input_size: int = 518,
     depth_scale: float = 1000.0,
     max_depth: float = 25.0,
     Args:
         rgb_image: RGB image as numpy array [H, W, 3]
         depth_image: Depth image as numpy array [H, W] or [H, W, 3]
+        encoder: Model encoder type
         input_size: Input size for inference
         depth_scale: Scale factor for depth values
         max_depth: Maximum valid depth value
         simi_depth[valid_mask] = 1.0 / depth_normalized[valid_mask]
         # Load model
+        model = load_model(encoder, use_xformers and torch.cuda.is_available())
         device = next(model.parameters()).device
         # Determine precision
         info = f"""
 ✅ **Refinement complete!**
+**Model:** {encoder.upper()}
 **Precision:** {precision.upper()}
 **Device:** {device.type.upper()}
 **Input size:** {input_size}px
     High-quality depth map refinement using Vision Transformers. Based on [ByteDance's camera-depth-models](https://manipulation-as-in-simulation.github.io/).
+    ⚠️ **Note:** This demo uses random weights for demonstration. For real results:
+    1. Download checkpoints from [Hugging Face](https://huggingface.co/collections/depth-anything/camera-depth-models-68b521181dedd223f4b020db)
+    2. Place in `checkpoints/` directory
+    3. Restart the app
     """)
     with gr.Row():
             )
             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                encoder_choice = gr.Radio(
+                    choices=["vits", "vitb", "vitl", "vitg"],
+                    value="vitl",
+                    label="Encoder Model",
+                    info="Larger = better quality but slower",
                 )
                 input_size = gr.Slider(
         inputs=[
             rgb_input,
             depth_input,
+            encoder_choice,
             input_size,
             depth_scale,
             max_depth,