| """ |
| FoundationPose 6D Object Pose Estimation Demo |
| |
| A polished Gradio interface for NVIDIA FoundationPose — the #1 method on the |
| BOP Challenge 2024 benchmark for model-based 6D object localization of unseen objects. |
| |
| This app connects to a FoundationPose inference backend and provides: |
| - CAD-based (model-based) initialization with a 3D mesh |
| - Automatic object masking via SlimSAM |
| - 6D pose estimation (position + orientation) |
| - 3D pose visualization overlaid on the image |
| """ |
|
|
| import io |
| import logging |
| import math |
| import tempfile |
| import time |
| from pathlib import Path |
| from typing import Dict, List, Optional, Tuple |
|
|
| import cv2 |
| import gradio as gr |
| import numpy as np |
| from PIL import Image |
|
|
| logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s: %(message)s") |
| logger = logging.getLogger(__name__) |
|
|
| |
| BACKEND_URL = "https://gpue-foundationpose.hf.space" |
| _gradio_client = None |
|
|
|
|
| def _get_client(): |
| """Lazy-load Gradio client to the FoundationPose backend.""" |
| global _gradio_client |
| if _gradio_client is None: |
| from gradio_client import Client |
| logger.info(f"Connecting to FoundationPose backend at {BACKEND_URL}...") |
| _gradio_client = Client(BACKEND_URL) |
| logger.info("Connected.") |
| return _gradio_client |
|
|
|
|
| |
|
|
| def draw_pose_axes( |
| image: np.ndarray, |
| pose_matrix: np.ndarray, |
| K: np.ndarray, |
| axis_length: float = 0.05, |
| thickness: int = 3, |
| ) -> np.ndarray: |
| """Draw 3D coordinate axes on the image from a 4x4 pose matrix. |
| |
| Red = X, Green = Y, Blue = Z. |
| """ |
| vis = image.copy() |
| R = pose_matrix[:3, :3] |
| t = pose_matrix[:3, 3] |
|
|
| |
| origin = t.reshape(3, 1) |
| axes_3d = origin + R @ (np.eye(3) * axis_length) |
|
|
| |
| def project(pt3d): |
| p = (K @ pt3d).flatten() |
| if abs(p[2]) < 1e-6: |
| return None |
| return int(p[0] / p[2]), int(p[1] / p[2]) |
|
|
| o2d = project(origin) |
| if o2d is None: |
| return vis |
|
|
| colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)] |
| labels = ["X", "Y", "Z"] |
|
|
| for i in range(3): |
| end = project(axes_3d[:, i:i + 1]) |
| if end is None: |
| continue |
| cv2.arrowedLine(vis, o2d, end, colors[i], thickness, tipLength=0.2) |
| cv2.putText(vis, labels[i], (end[0] + 5, end[1] - 5), |
| cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[i], 2) |
|
|
| |
| cv2.circle(vis, o2d, 5, (255, 255, 255), -1) |
| cv2.circle(vis, o2d, 5, (0, 0, 0), 2) |
|
|
| return vis |
|
|
|
|
| def draw_bounding_box_from_pose( |
| image: np.ndarray, |
| pose_matrix: np.ndarray, |
| K: np.ndarray, |
| size: float = 0.03, |
| ) -> np.ndarray: |
| """Draw a projected 3D bounding box around the object.""" |
| vis = image.copy() |
| R = pose_matrix[:3, :3] |
| t = pose_matrix[:3, 3] |
|
|
| |
| corners = np.array([ |
| [-1, -1, -1], [1, -1, -1], [1, 1, -1], [-1, 1, -1], |
| [-1, -1, 1], [1, -1, 1], [1, 1, 1], [-1, 1, 1], |
| ], dtype=np.float64) * size |
|
|
| |
| corners_cam = (R @ corners.T + t.reshape(3, 1)).T |
|
|
| |
| def project(pt3d): |
| p = (K @ pt3d.reshape(3, 1)).flatten() |
| if abs(p[2]) < 1e-6: |
| return None |
| return int(p[0] / p[2]), int(p[1] / p[2]) |
|
|
| pts_2d = [project(c) for c in corners_cam] |
| if any(p is None for p in pts_2d): |
| return vis |
|
|
| |
| edges = [ |
| (0, 1), (1, 2), (2, 3), (3, 0), |
| (4, 5), (5, 6), (6, 7), (7, 4), |
| (0, 4), (1, 5), (2, 6), (3, 7), |
| ] |
| for i, j in edges: |
| cv2.line(vis, pts_2d[i], pts_2d[j], (0, 255, 255), 2) |
|
|
| return vis |
|
|
|
|
| def quat_to_euler(w, x, y, z) -> Tuple[float, float, float]: |
| """Convert quaternion to Euler angles (roll, pitch, yaw) in degrees.""" |
| |
| sinr_cosp = 2 * (w * x + y * z) |
| cosr_cosp = 1 - 2 * (x * x + y * y) |
| roll = math.atan2(sinr_cosp, cosr_cosp) |
|
|
| |
| sinp = 2 * (w * y - z * x) |
| if abs(sinp) >= 1: |
| pitch = math.copysign(math.pi / 2, sinp) |
| else: |
| pitch = math.asin(sinp) |
|
|
| |
| siny_cosp = 2 * (w * z + x * y) |
| cosy_cosp = 1 - 2 * (y * y + z * z) |
| yaw = math.atan2(siny_cosp, cosy_cosp) |
|
|
| return math.degrees(roll), math.degrees(pitch), math.degrees(yaw) |
|
|
|
|
| def format_pose_result(pose: Dict) -> str: |
| """Format a pose result into a readable string.""" |
| lines = [] |
| pos = pose.get("position", {}) |
| ori = pose.get("orientation", {}) |
|
|
| lines.append("━━━ Position (meters) ━━━") |
| lines.append(f" X: {pos.get('x', 0):+.4f}") |
| lines.append(f" Y: {pos.get('y', 0):+.4f}") |
| lines.append(f" Z: {pos.get('z', 0):+.4f}") |
|
|
| lines.append("") |
| lines.append("━━━ Orientation (quaternion) ━━━") |
| lines.append(f" W: {ori.get('w', 0):+.6f}") |
| lines.append(f" X: {ori.get('x', 0):+.6f}") |
| lines.append(f" Y: {ori.get('y', 0):+.6f}") |
| lines.append(f" Z: {ori.get('z', 0):+.6f}") |
|
|
| |
| roll, pitch, yaw = quat_to_euler( |
| ori.get('w', 1), ori.get('x', 0), |
| ori.get('y', 0), ori.get('z', 0) |
| ) |
| lines.append("") |
| lines.append("━━━ Euler Angles (degrees) ━━━") |
| lines.append(f" Roll: {roll:+.2f}°") |
| lines.append(f" Pitch: {pitch:+.2f}°") |
| lines.append(f" Yaw: {yaw:+.2f}°") |
|
|
| if "confidence" in pose: |
| lines.append("") |
| lines.append(f"━━━ Confidence: {pose['confidence']:.2%} ━━━") |
|
|
| return "\n".join(lines) |
|
|
|
|
| |
|
|
| def initialize_object( |
| object_id: str, |
| mesh_file, |
| reference_files: List, |
| fx: float, |
| fy: float, |
| cx: float, |
| cy: float, |
| ): |
| """Initialize an object with a CAD mesh + optional reference images.""" |
| if not object_id: |
| return "❌ Please provide an Object ID" |
| if not mesh_file: |
| return "❌ Please upload a 3D mesh file (.obj, .stl, .ply)" |
|
|
| try: |
| from gradio_client import handle_file |
|
|
| client = _get_client() |
|
|
| |
| ref_handles = [] |
| if reference_files: |
| for f in reference_files: |
| if hasattr(f, 'name'): |
| ref_handles.append(handle_file(f.name)) |
| elif isinstance(f, str): |
| ref_handles.append(handle_file(f)) |
|
|
| mesh_handle = handle_file(mesh_file.name) if hasattr(mesh_file, 'name') else handle_file(mesh_file) |
|
|
| result = client.predict( |
| object_id, |
| mesh_handle, |
| ref_handles if ref_handles else None, |
| fx, fy, cx, cy, |
| api_name="/gradio_initialize_cad", |
| ) |
|
|
| return f"✅ {result}" |
|
|
| except Exception as e: |
| logger.error(f"Initialization error: {e}", exc_info=True) |
| return f"❌ Error: {str(e)}" |
|
|
|
|
| def estimate_pose( |
| object_id: str, |
| query_image: np.ndarray, |
| depth_image: Optional[np.ndarray], |
| fx: float, |
| fy: float, |
| cx: float, |
| cy: float, |
| mask_method: str, |
| ): |
| """Estimate 6D pose and return visualization.""" |
| if query_image is None: |
| return "❌ Please upload a query image", None |
|
|
| if not object_id: |
| return "❌ Please provide the Object ID (must match initialization)", None |
|
|
| try: |
| from gradio_client import handle_file |
|
|
| client = _get_client() |
|
|
| |
| with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: |
| Image.fromarray(query_image).save(f.name) |
| query_path = f.name |
|
|
| |
| depth_path = None |
| if depth_image is not None: |
| with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: |
| Image.fromarray(depth_image).save(f.name) |
| depth_path = f.name |
|
|
| |
| result = client.predict( |
| object_id, |
| handle_file(query_path), |
| handle_file(depth_path) if depth_path else None, |
| fx, fy, cx, cy, |
| mask_method, |
| None, |
| api_name="/gradio_estimate", |
| ) |
|
|
| |
| if isinstance(result, (list, tuple)): |
| text_result = result[0] if len(result) > 0 else "" |
| viz_path = result[1] if len(result) > 1 else None |
| mask_path = result[2] if len(result) > 2 else None |
| else: |
| text_result = str(result) |
| viz_path = None |
| mask_path = None |
|
|
| |
| vis_image = query_image.copy() |
|
|
| |
| pose_info = _parse_pose_text(text_result) |
|
|
| if pose_info: |
| K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float64) |
|
|
| |
| if "pose_matrix" in pose_info: |
| pose_mat = np.array(pose_info["pose_matrix"]) |
| vis_image = draw_pose_axes(vis_image, pose_mat, K, axis_length=0.05) |
| vis_image = draw_bounding_box_from_pose(vis_image, pose_mat, K, size=0.03) |
| elif "position" in pose_info: |
| |
| pos = pose_info["position"] |
| ori = pose_info.get("orientation", {"w": 1, "x": 0, "y": 0, "z": 0}) |
| pose_mat = _quat_pos_to_matrix(pos, ori) |
| vis_image = draw_pose_axes(vis_image, pose_mat, K, axis_length=0.05) |
| vis_image = draw_bounding_box_from_pose(vis_image, pose_mat, K, size=0.03) |
|
|
| formatted = format_pose_result(pose_info) |
| return f"✅ Pose Estimated Successfully\n\n{formatted}", vis_image |
| else: |
| |
| return text_result, vis_image |
|
|
| except Exception as e: |
| logger.error(f"Pose estimation error: {e}", exc_info=True) |
| return f"❌ Error: {str(e)}", query_image |
|
|
|
|
| def _parse_pose_text(text: str) -> Optional[Dict]: |
| """Parse pose information from the backend's text output.""" |
| if not text or "No poses" in text or "failed" in text.lower() or "error" in text.lower(): |
| return None |
|
|
| pose = {} |
| lines = text.strip().split("\n") |
|
|
| position = {} |
| orientation = {} |
| in_position = False |
| in_orientation = False |
|
|
| for line in lines: |
| line = line.strip() |
| if "Position:" in line: |
| in_position = True |
| in_orientation = False |
| continue |
| if "Orientation" in line: |
| in_position = False |
| in_orientation = True |
| continue |
| if "Confidence" in line: |
| in_position = False |
| in_orientation = False |
| try: |
| val = line.split(":")[-1].strip().rstrip("%") |
| pose["confidence"] = float(val) / 100 if "%" in line else float(val) |
| except (ValueError, IndexError): |
| pass |
| continue |
|
|
| if in_position: |
| if "x:" in line: |
| try: |
| position["x"] = float(line.split(":")[-1].strip().split()[0]) |
| except (ValueError, IndexError): |
| pass |
| elif "y:" in line: |
| try: |
| position["y"] = float(line.split(":")[-1].strip().split()[0]) |
| except (ValueError, IndexError): |
| pass |
| elif "z:" in line: |
| try: |
| position["z"] = float(line.split(":")[-1].strip().split()[0]) |
| except (ValueError, IndexError): |
| pass |
|
|
| if in_orientation: |
| if "w:" in line: |
| try: |
| orientation["w"] = float(line.split(":")[-1].strip()) |
| except (ValueError, IndexError): |
| pass |
| elif "x:" in line: |
| try: |
| orientation["x"] = float(line.split(":")[-1].strip()) |
| except (ValueError, IndexError): |
| pass |
| elif "y:" in line: |
| try: |
| orientation["y"] = float(line.split(":")[-1].strip()) |
| except (ValueError, IndexError): |
| pass |
| elif "z:" in line: |
| try: |
| orientation["z"] = float(line.split(":")[-1].strip()) |
| except (ValueError, IndexError): |
| pass |
|
|
| if position: |
| pose["position"] = position |
| if orientation: |
| pose["orientation"] = orientation |
|
|
| return pose if pose else None |
|
|
|
|
| def _quat_pos_to_matrix(pos: Dict, ori: Dict) -> np.ndarray: |
| """Convert position + quaternion to a 4x4 transformation matrix.""" |
| w, x, y, z = ori.get("w", 1), ori.get("x", 0), ori.get("y", 0), ori.get("z", 0) |
|
|
| |
| R = np.array([ |
| [1 - 2*(y*y + z*z), 2*(x*y - w*z), 2*(x*z + w*y)], |
| [2*(x*y + w*z), 1 - 2*(x*x + z*z), 2*(y*z - w*x)], |
| [2*(x*z - w*y), 2*(y*z + w*x), 1 - 2*(x*x + y*y)], |
| ], dtype=np.float64) |
|
|
| T = np.eye(4, dtype=np.float64) |
| T[:3, :3] = R |
| T[0, 3] = pos.get("x", 0) |
| T[1, 3] = pos.get("y", 0) |
| T[2, 3] = pos.get("z", 0) |
| return T |
|
|
|
|
| |
|
|
| DESCRIPTION = """ |
| # 🎯 FoundationPose — 6D Object Pose Estimation |
| |
| **[FoundationPose](https://nvlabs.github.io/FoundationPose/)** by NVIDIA is the **#1 method** on the |
| [BOP Challenge 2024](https://bop.felk.cvut.cz/) benchmark for model-based 6D localization of unseen objects, |
| achieving an **AR score of 73.4** across 7 core datasets (LM-O, T-LESS, TUD-L, IC-BIN, ITODD, HB, YCB-V). |
| |
| ### How it works |
| 1. **Initialize**: Upload a 3D mesh (.obj/.stl/.ply) of your object and optionally reference RGB images |
| 2. **Estimate**: Upload a query RGB image (+ optional depth) and the model estimates the full 6D pose |
| 3. **Visualize**: See the projected 3D axes and bounding box overlaid on the image |
| |
| The pose output is a 4×4 transformation matrix (rotation + translation) from object frame to camera frame. |
| |
| | Metric | Value | |
| |--------|-------| |
| | BOP AR Score | **73.4** | |
| | BOP Rank (2024) | **#1** (model-based unseen) | |
| | Paper | [CVPR 2024](https://arxiv.org/abs/2312.08344) | |
| | Input | RGB-D + CAD mesh | |
| """ |
|
|
| INIT_HELP = """ |
| ### 📋 Initialization Guide |
| |
| **Required:** |
| - **Object ID**: A unique name for your object (e.g., "mug", "wrench") |
| - **3D Mesh**: Upload an `.obj`, `.stl`, or `.ply` file of the object |
| |
| **Optional but recommended:** |
| - **Reference Images**: 1+ RGB images of the object from known viewpoints |
| - **Camera Intrinsics**: Focal lengths (fx, fy) and principal point (cx, cy) |
| |
| > 💡 **Tip**: The default intrinsics work for the bundled test data. For your own images, |
| > use the calibration values from your camera. |
| """ |
|
|
| ESTIMATE_HELP = """ |
| ### 📋 Estimation Guide |
| |
| - **Query Image**: An RGB image containing the initialized object |
| - **Depth Image**: Optional 16-bit depth map (improves accuracy significantly) |
| - **Mask Method**: |
| - `SlimSAM` — automatic segmentation (recommended) |
| - `Otsu` — simple brightness-based thresholding |
| |
| > ⚠️ **Important**: Camera intrinsics must match the query image resolution. |
| > If you resize the image, scale fx/fy/cx/cy proportionally. |
| """ |
|
|
|
|
| def build_ui(): |
| with gr.Blocks( |
| title="FoundationPose 6D Pose Estimation", |
| theme=gr.themes.Soft(), |
| css=""" |
| .pose-output { font-family: monospace; } |
| .gr-button-primary { background: #6366f1 !important; } |
| """, |
| ) as demo: |
| gr.Markdown(DESCRIPTION) |
|
|
| with gr.Tabs(): |
| |
| with gr.Tab("① Initialize Object", id="init"): |
| gr.Markdown(INIT_HELP) |
|
|
| with gr.Row(): |
| with gr.Column(scale=1): |
| init_object_id = gr.Textbox( |
| label="Object ID", |
| placeholder="e.g., target_cube", |
| value="target_cube", |
| ) |
| init_mesh = gr.File( |
| label="3D Mesh (.obj / .stl / .ply)", |
| file_count="single", |
| file_types=[".obj", ".stl", ".ply", ".mesh"], |
| ) |
| init_refs = gr.File( |
| label="Reference Images (optional)", |
| file_count="multiple", |
| file_types=["image"], |
| ) |
|
|
| gr.Markdown("#### Camera Intrinsics") |
| with gr.Row(): |
| init_fx = gr.Number(label="fx", value=193.137, precision=3) |
| init_fy = gr.Number(label="fy", value=193.137, precision=3) |
| with gr.Row(): |
| init_cx = gr.Number(label="cx", value=120.0, precision=1) |
| init_cy = gr.Number(label="cy", value=80.0, precision=1) |
|
|
| init_btn = gr.Button("🚀 Initialize Object", variant="primary", size="lg") |
|
|
| with gr.Column(scale=1): |
| init_result = gr.Textbox( |
| label="Result", |
| lines=6, |
| interactive=False, |
| elem_classes=["pose-output"], |
| ) |
|
|
| init_btn.click( |
| fn=initialize_object, |
| inputs=[init_object_id, init_mesh, init_refs, init_fx, init_fy, init_cx, init_cy], |
| outputs=init_result, |
| ) |
|
|
| |
| with gr.Tab("② Estimate Pose", id="estimate"): |
| gr.Markdown(ESTIMATE_HELP) |
|
|
| with gr.Row(): |
| with gr.Column(scale=1): |
| est_object_id = gr.Textbox( |
| label="Object ID", |
| placeholder="Must match initialization", |
| value="target_cube", |
| ) |
| est_query = gr.Image( |
| label="Query Image (RGB)", |
| type="numpy", |
| ) |
| est_depth = gr.Image( |
| label="Depth Image (optional, 16-bit PNG)", |
| type="numpy", |
| ) |
| est_mask = gr.Radio( |
| choices=["SlimSAM", "Otsu"], |
| value="SlimSAM", |
| label="Mask Method", |
| ) |
|
|
| gr.Markdown("#### Camera Intrinsics") |
| with gr.Row(): |
| est_fx = gr.Number(label="fx", value=193.137, precision=3) |
| est_fy = gr.Number(label="fy", value=193.137, precision=3) |
| with gr.Row(): |
| est_cx = gr.Number(label="cx", value=120.0, precision=1) |
| est_cy = gr.Number(label="cy", value=80.0, precision=1) |
|
|
| est_btn = gr.Button("🎯 Estimate Pose", variant="primary", size="lg") |
|
|
| with gr.Column(scale=1): |
| est_viz = gr.Image( |
| label="Pose Visualization (axes + bounding box)", |
| type="numpy", |
| ) |
| est_result = gr.Textbox( |
| label="Pose Output", |
| lines=18, |
| interactive=False, |
| elem_classes=["pose-output"], |
| ) |
|
|
| est_btn.click( |
| fn=estimate_pose, |
| inputs=[est_object_id, est_query, est_depth, est_fx, est_fy, est_cx, est_cy, est_mask], |
| outputs=[est_result, est_viz], |
| ) |
|
|
| |
| with gr.Tab("ℹ️ About"): |
| gr.Markdown(""" |
| ## Architecture |
| |
| FoundationPose uses a **two-stage pipeline**: |
| |
| 1. **Pose Hypothesis Generation**: Generates 42 coarse pose hypotheses from uniformly sampled viewpoints |
| 2. **Transformer-based Refinement**: A ResNet-34 backbone with 4-head attention refines each hypothesis |
| 3. **Contrastive Ranking**: InfoNCE loss ranks hypotheses, selecting the best pose |
| |
| ### Training Data |
| - **600K synthetic scenes** rendered on Objaverse objects with LLM-aided texture augmentation |
| - **1.2M training images** — no real-world training data needed |
| |
| ### Two Modes |
| - **Model-Based**: Uses a CAD mesh for precise render-and-compare |
| - **Model-Free**: Reconstructs a NeRF from 16-20 reference images |
| |
| ## BOP Challenge 2024 Results |
| |
| | Dataset | AR Score | |
| |---------|----------| |
| | LM-O | 75.6 | |
| | T-LESS | 64.6 | |
| | TUD-L | 92.3 | |
| | IC-BIN | 50.8 | |
| | ITODD | 58.0 | |
| | HB | 83.5 | |
| | YCB-V | 88.9 | |
| | **Average** | **73.4** | |
| |
| ## Citation |
| |
| ```bibtex |
| @inproceedings{wen2024foundationpose, |
| title={FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects}, |
| author={Wen, Bowen and Yang, Wei and Kautz, Jan and Birchfield, Stan}, |
| booktitle={CVPR}, |
| year={2024} |
| } |
| ``` |
| |
| ## Links |
| - [Paper (arXiv)](https://arxiv.org/abs/2312.08344) |
| - [Project Page](https://nvlabs.github.io/FoundationPose/) |
| - [GitHub](https://github.com/NVlabs/FoundationPose) |
| - [Model Weights (HF Hub)](https://huggingface.co/gpue/foundationpose-weights) |
| - [Backend Space](https://huggingface.co/spaces/gpue/foundationpose) |
| - [BOP Challenge](https://bop.felk.cvut.cz/) |
| """) |
|
|
| gr.Markdown(""" |
| --- |
| <center> |
| |
| Built with ❤️ using [FoundationPose](https://nvlabs.github.io/FoundationPose/) by NVIDIA |
| and [Gradio](https://gradio.app) — Powered by the |
| [FoundationPose backend Space](https://huggingface.co/spaces/gpue/foundationpose) |
| |
| </center> |
| """) |
|
|
| return demo |
|
|
|
|
| if __name__ == "__main__": |
| demo = build_ui() |
| demo.launch(server_name="0.0.0.0", server_port=7860) |
|
|