Georg Claude Sonnet 4.5 committed on
Commit
16d53ca
·
1 Parent(s): 42ce71e

Add depth image support to FoundationPose API

Browse files

Changes:
- Add depth image upload field to Gradio UI (below query image)
- Update gradio_estimate() to accept and process depth images
- Support 16-bit PNG depth (converts mm to meters)
- Handle depth/RGB size mismatches with automatic resizing
- Update test suite to load RGB+depth test images
- Replace old reference images with single RGB/depth pair

Test updates:
- Load specific rgb_001.jpg and depth_001.png files
- Auto-resize depth to match RGB dimensions
- Print depth statistics (shape, dtype, range)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

app.py CHANGED
@@ -262,12 +262,40 @@ def gradio_initialize_model_free(object_id: str, reference_files: List, fx: floa
262
  return f"Error: {str(e)}"
263
 
264
 
265
- def gradio_estimate(object_id: str, query_image: np.ndarray, fx: float, fy: float, cx: float, cy: float):
266
  """Gradio wrapper for pose estimation."""
267
  try:
268
  if query_image is None:
269
  return "Error: No query image provided", None, None
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  # Prepare camera intrinsics
272
  camera_intrinsics = {
273
  "fx": fx,
@@ -280,6 +308,7 @@ def gradio_estimate(object_id: str, query_image: np.ndarray, fx: float, fy: floa
280
  result = pose_estimator.estimate_pose(
281
  object_id=object_id,
282
  query_image=query_image,
 
283
  camera_intrinsics=camera_intrinsics
284
  )
285
 
@@ -486,7 +515,12 @@ with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as demo
486
  )
487
 
488
  est_query_image = gr.Image(
489
- label="Query Image",
 
 
 
 
 
490
  type="numpy"
491
  )
492
 
@@ -511,7 +545,7 @@ with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as demo
511
 
512
  est_button.click(
513
  fn=gradio_estimate,
514
- inputs=[est_object_id, est_query_image, est_fx, est_fy, est_cx, est_cy],
515
  outputs=[est_output, est_viz, est_mask]
516
  )
517
 
 
262
  return f"Error: {str(e)}"
263
 
264
 
265
+ def gradio_estimate(object_id: str, query_image: np.ndarray, depth_image: np.ndarray, fx: float, fy: float, cx: float, cy: float):
266
  """Gradio wrapper for pose estimation."""
267
  try:
268
  if query_image is None:
269
  return "Error: No query image provided", None, None
270
 
271
+ # Process depth image if provided
272
+ depth = None
273
+ if depth_image is not None:
274
+ # Check if depth needs resizing to match RGB
275
+ if depth_image.shape[:2] != query_image.shape[:2]:
276
+ logger.warning(f"Depth {depth_image.shape[:2]} and RGB {query_image.shape[:2]} sizes don't match, resizing depth")
277
+ depth_image = cv2.resize(depth_image, (query_image.shape[1], query_image.shape[0]), interpolation=cv2.INTER_NEAREST)
278
+
279
+ # Convert to float32 if needed
280
+ if depth_image.dtype == np.uint16:
281
+ # Assume 16-bit depth in millimeters
282
+ depth = depth_image.astype(np.float32) / 1000.0
283
+ logger.info(f"Converted 16-bit depth to float32, range: [{depth.min():.3f}, {depth.max():.3f}]m")
284
+ elif depth_image.dtype == np.uint8:
285
+ # 8-bit depth (encoded), need to decode based on format
286
+ # For now, assume linear scaling to reasonable depth range
287
+ depth = depth_image.astype(np.float32) / 255.0 * 5.0 # Map to 0-5m
288
+ logger.info(f"Converted 8-bit depth to float32, range: [{depth.min():.3f}, {depth.max():.3f}]m")
289
+ else:
290
+ # Already float, use as-is
291
+ depth = depth_image.astype(np.float32)
292
+ logger.info(f"Using provided depth (dtype={depth_image.dtype}), range: [{depth.min():.3f}, {depth.max():.3f}]m")
293
+
294
+ # Handle color depth images (H, W, 3) - take first channel
295
+ if len(depth.shape) == 3:
296
+ logger.warning("Depth image has 3 channels, using first channel")
297
+ depth = depth[:, :, 0]
298
+
299
  # Prepare camera intrinsics
300
  camera_intrinsics = {
301
  "fx": fx,
 
308
  result = pose_estimator.estimate_pose(
309
  object_id=object_id,
310
  query_image=query_image,
311
+ depth_image=depth,
312
  camera_intrinsics=camera_intrinsics
313
  )
314
 
 
515
  )
516
 
517
  est_query_image = gr.Image(
518
+ label="Query Image (RGB)",
519
+ type="numpy"
520
+ )
521
+
522
+ est_depth_image = gr.Image(
523
+ label="Depth Image (Optional, 16-bit PNG)",
524
  type="numpy"
525
  )
526
 
 
545
 
546
  est_button.click(
547
  fn=gradio_estimate,
548
+ inputs=[est_object_id, est_query_image, est_depth_image, est_fx, est_fy, est_cx, est_cy],
549
  outputs=[est_output, est_viz, est_mask]
550
  )
551
 
tests/reference/t_shape/README.md DELETED
@@ -1,63 +0,0 @@
1
- # T-Shaped Object Mesh
2
-
3
- This directory contains a 3D mesh of the T-shaped pushing object from the MuJoCo scene `nova-sim/robots/ur5/model/scene_t_push.xml`.
4
-
5
- ## Files
6
-
7
- - `t_shape.obj` - 3D mesh in Wavefront OBJ format
8
-
9
- ## Dimensions
10
-
11
- The T-shape consists of two rectangular boxes:
12
-
13
- ### Stem (vertical part)
14
- - Dimensions: 40mm × 140mm × 60mm (width × height × depth)
15
- - Position: centered at (0, -50mm, 0)
16
-
17
- ### Cap (horizontal part)
18
- - Dimensions: 160mm × 40mm × 60mm
19
- - Position: centered at (0, 30mm, 0)
20
-
21
- ### Overall Bounds
22
- - X: [-80mm, 80mm] (160mm total width)
23
- - Y: [-120mm, 50mm] (170mm total height)
24
- - Z: [-30mm, 30mm] (60mm total depth)
25
-
26
- ## Usage
27
-
28
- This mesh can be used with FoundationPose's CAD-based initialization mode for 6D pose estimation of the T-shaped object in the nova-sim push manipulation task.
29
-
30
- ### Example Usage
31
-
32
- ```python
33
- from gradio_client import Client, handle_file
34
-
35
- client = Client("https://gpue-foundationpose.hf.space")
36
-
37
- # Initialize with T-shape mesh
38
- result = client.predict(
39
- object_id="t_shape",
40
- mesh_file=handle_file("t_shape.obj"),
41
- reference_files=[], # Optional reference images
42
- fx=500.0, fy=500.0, cx=320.0, cy=240.0,
43
- api_name="/gradio_initialize_cad"
44
- )
45
-
46
- # Estimate pose in query image
47
- result = client.predict(
48
- object_id="t_shape",
49
- query_image=handle_file("camera_frame.jpg"),
50
- fx=500.0, fy=500.0, cx=320.0, cy=240.0,
51
- api_name="/gradio_estimate"
52
- )
53
- ```
54
-
55
- ## Material Properties (from MuJoCo)
56
-
57
- - Mass: 5.0 kg total (stem: 3.0 kg, cap: 2.0 kg)
58
- - Friction: 0.3 (sliding), 0.005 (torsional), 0.005 (rolling)
59
- - Color: Light blue (rgba: 0.55, 0.65, 0.98, 1.0)
60
-
61
- ## Generation
62
-
63
- This mesh was automatically generated from the MuJoCo scene definition using a Python script that extracts the box geometries and creates a combined mesh.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/reference/t_shape/depth_001.png ADDED
tests/reference/t_shape/image_001.jpg DELETED
Binary file (4.79 kB)
 
tests/reference/t_shape/image_002.jpg DELETED
Binary file (4.96 kB)
 
tests/reference/t_shape/image_004.jpg DELETED
Binary file (4.43 kB)
 
tests/reference/t_shape/image_005.jpg DELETED
Binary file (4.91 kB)
 
tests/reference/t_shape/image_006.jpg DELETED
Binary file (4.69 kB)
 
tests/reference/t_shape/image_007.jpg DELETED
Binary file (4.67 kB)
 
tests/reference/t_shape/image_008.jpg DELETED
Binary file (4.86 kB)
 
tests/reference/t_shape/image_009.jpg DELETED
Binary file (4.49 kB)
 
tests/reference/t_shape/image_010.jpg DELETED
Binary file (4.9 kB)
 
tests/reference/t_shape/image_011.jpg DELETED
Binary file (4.3 kB)
 
tests/reference/t_shape/image_012.jpg DELETED
Binary file (4.56 kB)
 
tests/reference/t_shape/image_013.jpg DELETED
Binary file (4.97 kB)
 
tests/reference/t_shape/image_014.jpg DELETED
Binary file (4.64 kB)
 
tests/reference/t_shape/image_015.jpg DELETED
Binary file (4.74 kB)
 
tests/reference/t_shape/{image_003.jpg → rgb_001.jpg} RENAMED
File without changes
tests/test_estimator.py CHANGED
@@ -8,7 +8,7 @@ This test verifies that the API can:
8
 
9
  import sys
10
  from pathlib import Path
11
- import random
12
  import cv2
13
  from gradio_client import Client, handle_file
14
 
@@ -18,24 +18,38 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
18
  from client import FoundationPoseClient
19
 
20
 
21
- def load_reference_images(reference_dir: Path):
22
- """Load all reference images from directory."""
23
- # Get all jpg and png files, excluding mesh files
24
- image_files = sorted([
25
- f for f in reference_dir.glob("*")
26
- if f.suffix.lower() in ['.jpg', '.png']
27
- ])
28
- images = []
29
 
30
- for img_path in image_files:
31
- # Use cv2 to load images (same as client.py)
32
- img = cv2.imread(str(img_path))
33
- if img is None:
34
- continue
35
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
36
- images.append(img)
37
 
38
- return images, image_files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
 
41
  def test_client_initialization():
@@ -178,11 +192,16 @@ def main():
178
  return
179
 
180
  print(f"\nUsing T-shape mesh: {mesh_path}")
181
- print(f"Using query images from: {reference_dir}")
182
 
183
- # Load reference images (will be used as query images)
184
- reference_images, image_files = load_reference_images(reference_dir)
185
- print(f"✓ Loaded {len(reference_images)} query images")
 
 
 
 
 
186
 
187
  # Test 1: Initialize API client
188
  client = test_client_initialization()
@@ -200,12 +219,8 @@ def main():
200
  print("=" * 60)
201
  return
202
 
203
- # Test 3: Estimate pose on a random query image
204
- random_idx = random.randint(0, len(reference_images) - 1)
205
- query_image = reference_images[random_idx]
206
- query_name = image_files[random_idx].name
207
-
208
- success = test_pose_estimation(client, query_image, query_name)
209
 
210
  # Print final results
211
  print("\n" + "=" * 60)
 
8
 
9
  import sys
10
  from pathlib import Path
11
+ import numpy as np
12
  import cv2
13
  from gradio_client import Client, handle_file
14
 
 
18
  from client import FoundationPoseClient
19
 
20
 
21
+ def load_test_data(reference_dir: Path):
22
+ """Load RGB and depth test images from t_shape directory."""
23
+ rgb_path = reference_dir / "rgb_001.jpg"
24
+ depth_path = reference_dir / "depth_001.png"
 
 
 
 
25
 
26
+ # Load RGB image
27
+ print(f"Loading RGB: {rgb_path}")
28
+ rgb = cv2.imread(str(rgb_path))
29
+ if rgb is None:
30
+ raise FileNotFoundError(f"Could not load RGB image: {rgb_path}")
31
+ rgb = cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB)
 
32
 
33
+ # Load depth image (16-bit PNG)
34
+ print(f"Loading depth: {depth_path}")
35
+ depth = cv2.imread(str(depth_path), cv2.IMREAD_ANYDEPTH)
36
+ if depth is None:
37
+ raise FileNotFoundError(f"Could not load depth image: {depth_path}")
38
+
39
+ # Check if depth needs resizing to match RGB
40
+ if depth.shape[:2] != rgb.shape[:2]:
41
+ print(f"⚠ Warning: Depth ({depth.shape[:2]}) and RGB ({rgb.shape[:2]}) sizes don't match")
42
+ print(f" Resizing depth to match RGB...")
43
+ depth = cv2.resize(depth, (rgb.shape[1], rgb.shape[0]), interpolation=cv2.INTER_NEAREST)
44
+
45
+ # Convert depth to meters (assuming it's in mm or similar)
46
+ # Depth should be float32 in meters for FoundationPose
47
+ depth = depth.astype(np.float32) / 1000.0 # Convert mm to meters
48
+
49
+ print(f"✓ RGB loaded: shape={rgb.shape}, dtype={rgb.dtype}")
50
+ print(f"✓ Depth loaded: shape={depth.shape}, dtype={depth.dtype}, range=[{depth.min():.3f}, {depth.max():.3f}]m")
51
+
52
+ return rgb, depth
53
 
54
 
55
  def test_client_initialization():
 
192
  return
193
 
194
  print(f"\nUsing T-shape mesh: {mesh_path}")
195
+ print(f"Using test data from: {reference_dir}")
196
 
197
+ # Load test RGB and depth images
198
+ try:
199
+ rgb_image, depth_image = load_test_data(reference_dir)
200
+ except FileNotFoundError as e:
201
+ print(f"✗ {e}")
202
+ return
203
+
204
+ print(f"\n⚠ Note: API currently only supports RGB (depth support coming soon)")
205
 
206
  # Test 1: Initialize API client
207
  client = test_client_initialization()
 
219
  print("=" * 60)
220
  return
221
 
222
+ # Test 3: Estimate pose using RGB image
223
+ success = test_pose_estimation(client, rgb_image, "rgb_001.jpg")
 
 
 
 
224
 
225
  # Print final results
226
  print("\n" + "=" * 60)