""" FoundationPose 6D Object Pose Estimation Demo A polished Gradio interface for NVIDIA FoundationPose — the #1 method on the BOP Challenge 2024 benchmark for model-based 6D object localization of unseen objects. This app connects to a FoundationPose inference backend and provides: - CAD-based (model-based) initialization with a 3D mesh - Automatic object masking via SlimSAM - 6D pose estimation (position + orientation) - 3D pose visualization overlaid on the image """ import io import logging import math import tempfile import time from pathlib import Path from typing import Dict, List, Optional, Tuple import cv2 import gradio as gr import numpy as np from PIL import Image logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s: %(message)s") logger = logging.getLogger(__name__) # ── Backend connection ────────────────────────────────────────────────────── BACKEND_URL = "https://gpue-foundationpose.hf.space" _gradio_client = None def _get_client(): """Lazy-load Gradio client to the FoundationPose backend.""" global _gradio_client if _gradio_client is None: from gradio_client import Client logger.info(f"Connecting to FoundationPose backend at {BACKEND_URL}...") _gradio_client = Client(BACKEND_URL) logger.info("Connected.") return _gradio_client # ── Pose visualization ────────────────────────────────────────────────────── def draw_pose_axes( image: np.ndarray, pose_matrix: np.ndarray, K: np.ndarray, axis_length: float = 0.05, thickness: int = 3, ) -> np.ndarray: """Draw 3D coordinate axes on the image from a 4x4 pose matrix. Red = X, Green = Y, Blue = Z. """ vis = image.copy() R = pose_matrix[:3, :3] t = pose_matrix[:3, 3] # Origin and axis endpoints in 3D origin = t.reshape(3, 1) axes_3d = origin + R @ (np.eye(3) * axis_length) # (3, 3) # Project to 2D def project(pt3d): p = (K @ pt3d).flatten() if abs(p[2]) < 1e-6: return None return int(p[0] / p[2]), int(p[1] / p[2]) o2d = project(origin) if o2d is None: return vis colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)] # R, G, B for X, Y, Z labels = ["X", "Y", "Z"] for i in range(3): end = project(axes_3d[:, i:i + 1]) if end is None: continue cv2.arrowedLine(vis, o2d, end, colors[i], thickness, tipLength=0.2) cv2.putText(vis, labels[i], (end[0] + 5, end[1] - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[i], 2) # Draw origin circle cv2.circle(vis, o2d, 5, (255, 255, 255), -1) cv2.circle(vis, o2d, 5, (0, 0, 0), 2) return vis def draw_bounding_box_from_pose( image: np.ndarray, pose_matrix: np.ndarray, K: np.ndarray, size: float = 0.03, ) -> np.ndarray: """Draw a projected 3D bounding box around the object.""" vis = image.copy() R = pose_matrix[:3, :3] t = pose_matrix[:3, 3] # 8 corners of a cube centered at origin corners = np.array([ [-1, -1, -1], [1, -1, -1], [1, 1, -1], [-1, 1, -1], [-1, -1, 1], [1, -1, 1], [1, 1, 1], [-1, 1, 1], ], dtype=np.float64) * size # Transform to camera frame corners_cam = (R @ corners.T + t.reshape(3, 1)).T # (8, 3) # Project def project(pt3d): p = (K @ pt3d.reshape(3, 1)).flatten() if abs(p[2]) < 1e-6: return None return int(p[0] / p[2]), int(p[1] / p[2]) pts_2d = [project(c) for c in corners_cam] if any(p is None for p in pts_2d): return vis # Draw edges edges = [ (0, 1), (1, 2), (2, 3), (3, 0), # back face (4, 5), (5, 6), (6, 7), (7, 4), # front face (0, 4), (1, 5), (2, 6), (3, 7), # connecting edges ] for i, j in edges: cv2.line(vis, pts_2d[i], pts_2d[j], (0, 255, 255), 2) return vis def quat_to_euler(w, x, y, z) -> Tuple[float, float, float]: """Convert quaternion to Euler angles (roll, pitch, yaw) in degrees.""" # Roll (X-axis rotation) sinr_cosp = 2 * (w * x + y * z) cosr_cosp = 1 - 2 * (x * x + y * y) roll = math.atan2(sinr_cosp, cosr_cosp) # Pitch (Y-axis rotation) sinp = 2 * (w * y - z * x) if abs(sinp) >= 1: pitch = math.copysign(math.pi / 2, sinp) else: pitch = math.asin(sinp) # Yaw (Z-axis rotation) siny_cosp = 2 * (w * z + x * y) cosy_cosp = 1 - 2 * (y * y + z * z) yaw = math.atan2(siny_cosp, cosy_cosp) return math.degrees(roll), math.degrees(pitch), math.degrees(yaw) def format_pose_result(pose: Dict) -> str: """Format a pose result into a readable string.""" lines = [] pos = pose.get("position", {}) ori = pose.get("orientation", {}) lines.append("━━━ Position (meters) ━━━") lines.append(f" X: {pos.get('x', 0):+.4f}") lines.append(f" Y: {pos.get('y', 0):+.4f}") lines.append(f" Z: {pos.get('z', 0):+.4f}") lines.append("") lines.append("━━━ Orientation (quaternion) ━━━") lines.append(f" W: {ori.get('w', 0):+.6f}") lines.append(f" X: {ori.get('x', 0):+.6f}") lines.append(f" Y: {ori.get('y', 0):+.6f}") lines.append(f" Z: {ori.get('z', 0):+.6f}") # Euler angles roll, pitch, yaw = quat_to_euler( ori.get('w', 1), ori.get('x', 0), ori.get('y', 0), ori.get('z', 0) ) lines.append("") lines.append("━━━ Euler Angles (degrees) ━━━") lines.append(f" Roll: {roll:+.2f}°") lines.append(f" Pitch: {pitch:+.2f}°") lines.append(f" Yaw: {yaw:+.2f}°") if "confidence" in pose: lines.append("") lines.append(f"━━━ Confidence: {pose['confidence']:.2%} ━━━") return "\n".join(lines) # ── Core API functions ─────────────────────────────────────────────────────── def initialize_object( object_id: str, mesh_file, reference_files: List, fx: float, fy: float, cx: float, cy: float, ): """Initialize an object with a CAD mesh + optional reference images.""" if not object_id: return "❌ Please provide an Object ID" if not mesh_file: return "❌ Please upload a 3D mesh file (.obj, .stl, .ply)" try: from gradio_client import handle_file client = _get_client() # Prepare reference files ref_handles = [] if reference_files: for f in reference_files: if hasattr(f, 'name'): ref_handles.append(handle_file(f.name)) elif isinstance(f, str): ref_handles.append(handle_file(f)) mesh_handle = handle_file(mesh_file.name) if hasattr(mesh_file, 'name') else handle_file(mesh_file) result = client.predict( object_id, mesh_handle, ref_handles if ref_handles else None, fx, fy, cx, cy, api_name="/gradio_initialize_cad", ) return f"✅ {result}" except Exception as e: logger.error(f"Initialization error: {e}", exc_info=True) return f"❌ Error: {str(e)}" def estimate_pose( object_id: str, query_image: np.ndarray, depth_image: Optional[np.ndarray], fx: float, fy: float, cx: float, cy: float, mask_method: str, ): """Estimate 6D pose and return visualization.""" if query_image is None: return "❌ Please upload a query image", None if not object_id: return "❌ Please provide the Object ID (must match initialization)", None try: from gradio_client import handle_file client = _get_client() # Save query image to temp file with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: Image.fromarray(query_image).save(f.name) query_path = f.name # Save depth image if provided depth_path = None if depth_image is not None: with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: Image.fromarray(depth_image).save(f.name) depth_path = f.name # Call the backend result = client.predict( object_id, handle_file(query_path), handle_file(depth_path) if depth_path else None, fx, fy, cx, cy, mask_method, None, # mask_editor_data api_name="/gradio_estimate", ) # Parse result — the backend returns (text, image_path, mask_path) if isinstance(result, (list, tuple)): text_result = result[0] if len(result) > 0 else "" viz_path = result[1] if len(result) > 1 else None mask_path = result[2] if len(result) > 2 else None else: text_result = str(result) viz_path = None mask_path = None # Build visualization vis_image = query_image.copy() # Try to parse pose from the text result pose_info = _parse_pose_text(text_result) if pose_info: K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float64) # If we have the full pose matrix, visualize it if "pose_matrix" in pose_info: pose_mat = np.array(pose_info["pose_matrix"]) vis_image = draw_pose_axes(vis_image, pose_mat, K, axis_length=0.05) vis_image = draw_bounding_box_from_pose(vis_image, pose_mat, K, size=0.03) elif "position" in pose_info: # Build pose matrix from position + quaternion pos = pose_info["position"] ori = pose_info.get("orientation", {"w": 1, "x": 0, "y": 0, "z": 0}) pose_mat = _quat_pos_to_matrix(pos, ori) vis_image = draw_pose_axes(vis_image, pose_mat, K, axis_length=0.05) vis_image = draw_bounding_box_from_pose(vis_image, pose_mat, K, size=0.03) formatted = format_pose_result(pose_info) return f"✅ Pose Estimated Successfully\n\n{formatted}", vis_image else: # Return raw result from backend return text_result, vis_image except Exception as e: logger.error(f"Pose estimation error: {e}", exc_info=True) return f"❌ Error: {str(e)}", query_image def _parse_pose_text(text: str) -> Optional[Dict]: """Parse pose information from the backend's text output.""" if not text or "No poses" in text or "failed" in text.lower() or "error" in text.lower(): return None pose = {} lines = text.strip().split("\n") position = {} orientation = {} in_position = False in_orientation = False for line in lines: line = line.strip() if "Position:" in line: in_position = True in_orientation = False continue if "Orientation" in line: in_position = False in_orientation = True continue if "Confidence" in line: in_position = False in_orientation = False try: val = line.split(":")[-1].strip().rstrip("%") pose["confidence"] = float(val) / 100 if "%" in line else float(val) except (ValueError, IndexError): pass continue if in_position: if "x:" in line: try: position["x"] = float(line.split(":")[-1].strip().split()[0]) except (ValueError, IndexError): pass elif "y:" in line: try: position["y"] = float(line.split(":")[-1].strip().split()[0]) except (ValueError, IndexError): pass elif "z:" in line: try: position["z"] = float(line.split(":")[-1].strip().split()[0]) except (ValueError, IndexError): pass if in_orientation: if "w:" in line: try: orientation["w"] = float(line.split(":")[-1].strip()) except (ValueError, IndexError): pass elif "x:" in line: try: orientation["x"] = float(line.split(":")[-1].strip()) except (ValueError, IndexError): pass elif "y:" in line: try: orientation["y"] = float(line.split(":")[-1].strip()) except (ValueError, IndexError): pass elif "z:" in line: try: orientation["z"] = float(line.split(":")[-1].strip()) except (ValueError, IndexError): pass if position: pose["position"] = position if orientation: pose["orientation"] = orientation return pose if pose else None def _quat_pos_to_matrix(pos: Dict, ori: Dict) -> np.ndarray: """Convert position + quaternion to a 4x4 transformation matrix.""" w, x, y, z = ori.get("w", 1), ori.get("x", 0), ori.get("y", 0), ori.get("z", 0) # Rotation matrix from quaternion R = np.array([ [1 - 2*(y*y + z*z), 2*(x*y - w*z), 2*(x*z + w*y)], [2*(x*y + w*z), 1 - 2*(x*x + z*z), 2*(y*z - w*x)], [2*(x*z - w*y), 2*(y*z + w*x), 1 - 2*(x*x + y*y)], ], dtype=np.float64) T = np.eye(4, dtype=np.float64) T[:3, :3] = R T[0, 3] = pos.get("x", 0) T[1, 3] = pos.get("y", 0) T[2, 3] = pos.get("z", 0) return T # ── Gradio UI ──────────────────────────────────────────────────────────────── DESCRIPTION = """ # 🎯 FoundationPose — 6D Object Pose Estimation **[FoundationPose](https://nvlabs.github.io/FoundationPose/)** by NVIDIA is the **#1 method** on the [BOP Challenge 2024](https://bop.felk.cvut.cz/) benchmark for model-based 6D localization of unseen objects, achieving an **AR score of 73.4** across 7 core datasets (LM-O, T-LESS, TUD-L, IC-BIN, ITODD, HB, YCB-V). ### How it works 1. **Initialize**: Upload a 3D mesh (.obj/.stl/.ply) of your object and optionally reference RGB images 2. **Estimate**: Upload a query RGB image (+ optional depth) and the model estimates the full 6D pose 3. **Visualize**: See the projected 3D axes and bounding box overlaid on the image The pose output is a 4×4 transformation matrix (rotation + translation) from object frame to camera frame. | Metric | Value | |--------|-------| | BOP AR Score | **73.4** | | BOP Rank (2024) | **#1** (model-based unseen) | | Paper | [CVPR 2024](https://arxiv.org/abs/2312.08344) | | Input | RGB-D + CAD mesh | """ INIT_HELP = """ ### 📋 Initialization Guide **Required:** - **Object ID**: A unique name for your object (e.g., "mug", "wrench") - **3D Mesh**: Upload an `.obj`, `.stl`, or `.ply` file of the object **Optional but recommended:** - **Reference Images**: 1+ RGB images of the object from known viewpoints - **Camera Intrinsics**: Focal lengths (fx, fy) and principal point (cx, cy) > 💡 **Tip**: The default intrinsics work for the bundled test data. For your own images, > use the calibration values from your camera. """ ESTIMATE_HELP = """ ### 📋 Estimation Guide - **Query Image**: An RGB image containing the initialized object - **Depth Image**: Optional 16-bit depth map (improves accuracy significantly) - **Mask Method**: - `SlimSAM` — automatic segmentation (recommended) - `Otsu` — simple brightness-based thresholding > ⚠️ **Important**: Camera intrinsics must match the query image resolution. > If you resize the image, scale fx/fy/cx/cy proportionally. """ def build_ui(): with gr.Blocks( title="FoundationPose 6D Pose Estimation", theme=gr.themes.Soft(), css=""" .pose-output { font-family: monospace; } .gr-button-primary { background: #6366f1 !important; } """, ) as demo: gr.Markdown(DESCRIPTION) with gr.Tabs(): # ── Tab 1: Initialize ───────────────────────────────── with gr.Tab("① Initialize Object", id="init"): gr.Markdown(INIT_HELP) with gr.Row(): with gr.Column(scale=1): init_object_id = gr.Textbox( label="Object ID", placeholder="e.g., target_cube", value="target_cube", ) init_mesh = gr.File( label="3D Mesh (.obj / .stl / .ply)", file_count="single", file_types=[".obj", ".stl", ".ply", ".mesh"], ) init_refs = gr.File( label="Reference Images (optional)", file_count="multiple", file_types=["image"], ) gr.Markdown("#### Camera Intrinsics") with gr.Row(): init_fx = gr.Number(label="fx", value=193.137, precision=3) init_fy = gr.Number(label="fy", value=193.137, precision=3) with gr.Row(): init_cx = gr.Number(label="cx", value=120.0, precision=1) init_cy = gr.Number(label="cy", value=80.0, precision=1) init_btn = gr.Button("🚀 Initialize Object", variant="primary", size="lg") with gr.Column(scale=1): init_result = gr.Textbox( label="Result", lines=6, interactive=False, elem_classes=["pose-output"], ) init_btn.click( fn=initialize_object, inputs=[init_object_id, init_mesh, init_refs, init_fx, init_fy, init_cx, init_cy], outputs=init_result, ) # ── Tab 2: Estimate Pose ────────────────────────────── with gr.Tab("② Estimate Pose", id="estimate"): gr.Markdown(ESTIMATE_HELP) with gr.Row(): with gr.Column(scale=1): est_object_id = gr.Textbox( label="Object ID", placeholder="Must match initialization", value="target_cube", ) est_query = gr.Image( label="Query Image (RGB)", type="numpy", ) est_depth = gr.Image( label="Depth Image (optional, 16-bit PNG)", type="numpy", ) est_mask = gr.Radio( choices=["SlimSAM", "Otsu"], value="SlimSAM", label="Mask Method", ) gr.Markdown("#### Camera Intrinsics") with gr.Row(): est_fx = gr.Number(label="fx", value=193.137, precision=3) est_fy = gr.Number(label="fy", value=193.137, precision=3) with gr.Row(): est_cx = gr.Number(label="cx", value=120.0, precision=1) est_cy = gr.Number(label="cy", value=80.0, precision=1) est_btn = gr.Button("🎯 Estimate Pose", variant="primary", size="lg") with gr.Column(scale=1): est_viz = gr.Image( label="Pose Visualization (axes + bounding box)", type="numpy", ) est_result = gr.Textbox( label="Pose Output", lines=18, interactive=False, elem_classes=["pose-output"], ) est_btn.click( fn=estimate_pose, inputs=[est_object_id, est_query, est_depth, est_fx, est_fy, est_cx, est_cy, est_mask], outputs=[est_result, est_viz], ) # ── Tab 3: About ────────────────────────────────────── with gr.Tab("ℹ️ About"): gr.Markdown(""" ## Architecture FoundationPose uses a **two-stage pipeline**: 1. **Pose Hypothesis Generation**: Generates 42 coarse pose hypotheses from uniformly sampled viewpoints 2. **Transformer-based Refinement**: A ResNet-34 backbone with 4-head attention refines each hypothesis 3. **Contrastive Ranking**: InfoNCE loss ranks hypotheses, selecting the best pose ### Training Data - **600K synthetic scenes** rendered on Objaverse objects with LLM-aided texture augmentation - **1.2M training images** — no real-world training data needed ### Two Modes - **Model-Based**: Uses a CAD mesh for precise render-and-compare - **Model-Free**: Reconstructs a NeRF from 16-20 reference images ## BOP Challenge 2024 Results | Dataset | AR Score | |---------|----------| | LM-O | 75.6 | | T-LESS | 64.6 | | TUD-L | 92.3 | | IC-BIN | 50.8 | | ITODD | 58.0 | | HB | 83.5 | | YCB-V | 88.9 | | **Average** | **73.4** | ## Citation ```bibtex @inproceedings{wen2024foundationpose, title={FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects}, author={Wen, Bowen and Yang, Wei and Kautz, Jan and Birchfield, Stan}, booktitle={CVPR}, year={2024} } ``` ## Links - [Paper (arXiv)](https://arxiv.org/abs/2312.08344) - [Project Page](https://nvlabs.github.io/FoundationPose/) - [GitHub](https://github.com/NVlabs/FoundationPose) - [Model Weights (HF Hub)](https://huggingface.co/gpue/foundationpose-weights) - [Backend Space](https://huggingface.co/spaces/gpue/foundationpose) - [BOP Challenge](https://bop.felk.cvut.cz/) """) gr.Markdown(""" ---
Built with ❤️ using [FoundationPose](https://nvlabs.github.io/FoundationPose/) by NVIDIA and [Gradio](https://gradio.app) — Powered by the [FoundationPose backend Space](https://huggingface.co/spaces/gpue/foundationpose)
""") return demo if __name__ == "__main__": demo = build_ui() demo.launch(server_name="0.0.0.0", server_port=7860)