Spaces:

hedonismbot24
/

foundationpose-6d-pose-estimation

Running

App Files Files Community

foundationpose-6d-pose-estimation / app.py

hedonismbot24

Add FoundationPose 6D pose estimation demo app

69016ef verified about 1 month ago

raw

history blame contribute delete

23.4 kB

	"""
	FoundationPose 6D Object Pose Estimation Demo

	A polished Gradio interface for NVIDIA FoundationPose — the #1 method on the
	BOP Challenge 2024 benchmark for model-based 6D object localization of unseen objects.

	This app connects to a FoundationPose inference backend and provides:
	- CAD-based (model-based) initialization with a 3D mesh
	- Automatic object masking via SlimSAM
	- 6D pose estimation (position + orientation)
	- 3D pose visualization overlaid on the image
	"""

	import io
	import logging
	import math
	import tempfile
	import time
	from pathlib import Path
	from typing import Dict, List, Optional, Tuple

	import cv2
	import gradio as gr
	import numpy as np
	from PIL import Image

	# Root logging setup: INFO level, each record prefixed with timestamp and level.
	logging.basicConfig(
	    format="[%(asctime)s] %(levelname)s: %(message)s",
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# ── Backend connection ──────────────────────────────────────────────────────
	# URL handed to the Gradio client when the backend connection is first needed.
	BACKEND_URL = "https://gpue-foundationpose.hf.space"
	# Cached client instance; stays None until _get_client() creates it.
	_gradio_client = None


	def _get_client():
	    """Return the shared Gradio client for the backend, creating it on first use.

	    The client is stored in the module-level ``_gradio_client`` so later calls
	    reuse the same object. ``gradio_client`` is imported only when a client
	    actually has to be created.
	    """
	    global _gradio_client

	    # Fast path: a client was already created by an earlier call.
	    if _gradio_client is not None:
	        return _gradio_client

	    from gradio_client import Client

	    logger.info(f"Connecting to FoundationPose backend at {BACKEND_URL}...")
	    _gradio_client = Client(BACKEND_URL)
	    logger.info("Connected.")
	    return _gradio_client


	# ── Pose visualization ──────────────────────────────────────────────────────

	def draw_pose_axes(
	    image: np.ndarray,
	    pose_matrix: np.ndarray,
	    K: np.ndarray,
	    axis_length: float = 0.05,
	    thickness: int = 3,
	) -> np.ndarray:
	    """Draw the object's 3D coordinate axes on a copy of the image.

	    Red = X, Green = Y, Blue = Z. Colors are written in RGB channel order
	    because the caller in this file passes arrays obtained from
	    ``gr.Image(type="numpy")`` (assumed RGB — confirm if another caller
	    supplies BGR frames).

	    Args:
	        image: Image to annotate; it is copied, never modified in place.
	        pose_matrix: 4x4 pose matrix. The upper-left 3x3 block is used as the
	            rotation and the first three rows of the last column as the
	            translation.
	        K: 3x3 camera intrinsics matrix used for the projection.
	        axis_length: Length of each drawn axis, in the same units as the
	            translation.
	        thickness: Arrow line thickness in pixels.

	    Returns:
	        The annotated copy. When the origin cannot be projected (non-finite,
	        or not in front of the camera) the copy is returned without any
	        drawing; axis tips that cannot be projected are skipped individually.
	    """
	    vis = image.copy()
	    pose = np.asarray(pose_matrix, dtype=np.float64)
	    intrinsics = np.asarray(K, dtype=np.float64)
	    rotation = pose[:3, :3]
	    origin = pose[:3, 3].reshape(3, 1)

	    # Column i is the tip of axis i in the camera frame
	    # (R @ (I * axis_length) is simply R * axis_length).
	    tips = origin + rotation * axis_length  # (3, 3)

	    def project(pt3d):
	        """Project a (3, 1) camera-frame point to integer pixel coordinates."""
	        p = (intrinsics @ pt3d).flatten()
	        # A point on or behind the image plane has no valid pinhole
	        # projection; dividing by a negative depth would draw it mirrored.
	        if not np.all(np.isfinite(p)) or p[2] < 1e-6:
	            return None
	        return int(p[0] / p[2]), int(p[1] / p[2])

	    o2d = project(origin)
	    if o2d is None:
	        return vis

	    # (label, RGB color) for the X, Y and Z axes respectively.
	    axes = (
	        ("X", (255, 0, 0)),
	        ("Y", (0, 255, 0)),
	        ("Z", (0, 0, 255)),
	    )
	    for i, (label, color) in enumerate(axes):
	        end = project(tips[:, i:i + 1])
	        if end is None:
	            continue
	        cv2.arrowedLine(vis, o2d, end, color, thickness, tipLength=0.2)
	        cv2.putText(
	            vis, label, (end[0] + 5, end[1] - 5),
	            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2,
	        )

	    # Origin marker: filled white disc with a black outline.
	    cv2.circle(vis, o2d, 5, (255, 255, 255), -1)
	    cv2.circle(vis, o2d, 5, (0, 0, 0), 2)

	    return vis


	def draw_bounding_box_from_pose(
	    image: np.ndarray,
	    pose_matrix: np.ndarray,
	    K: np.ndarray,
	    size: float = 0.03,
	) -> np.ndarray:
	    """Draw a projected 3D cube around the posed object on a copy of the image.

	    The cube is centered on the object origin and is not derived from the
	    mesh extents: its half-edge length is ``size``.

	    Args:
	        image: Image to annotate; it is copied, never modified in place.
	        pose_matrix: 4x4 pose matrix (rotation in the upper-left 3x3 block,
	            translation in the first three rows of the last column).
	        K: 3x3 camera intrinsics matrix used for the projection.
	        size: Half the cube edge length, in the same units as the translation.

	    Returns:
	        The annotated copy. If any corner cannot be projected (non-finite, or
	        not in front of the camera) the copy is returned without drawing.
	    """
	    vis = image.copy()
	    pose = np.asarray(pose_matrix, dtype=np.float64)
	    intrinsics = np.asarray(K, dtype=np.float64)

	    # 8 corners of a cube centered at the object origin.
	    corners = np.array([
	        [-1, -1, -1], [1, -1, -1], [1, 1, -1], [-1, 1, -1],
	        [-1, -1, 1], [1, -1, 1], [1, 1, 1], [-1, 1, 1],
	    ], dtype=np.float64) * size

	    # Object frame -> camera frame -> homogeneous pixel coordinates.
	    corners_cam = pose[:3, :3] @ corners.T + pose[:3, 3].reshape(3, 1)  # (3, 8)
	    projected = intrinsics @ corners_cam  # (3, 8)
	    depth = projected[2]

	    # A corner on or behind the image plane has no valid pinhole projection;
	    # drawing it would produce a mirrored or exploding box, so draw nothing.
	    if not np.all(np.isfinite(projected)) or np.any(depth < 1e-6):
	        return vis

	    pts_2d = [(int(u), int(v)) for u, v in (projected[:2] / depth).T]

	    edges = [
	        (0, 1), (1, 2), (2, 3), (3, 0),  # back face
	        (4, 5), (5, 6), (6, 7), (7, 4),  # front face
	        (0, 4), (1, 5), (2, 6), (3, 7),  # connecting edges
	    ]
	    for start, stop in edges:
	        cv2.line(vis, pts_2d[start], pts_2d[stop], (0, 255, 255), 2)

	    return vis


	def quat_to_euler(w, x, y, z) -> Tuple[float, float, float]:
	    """Convert a quaternion to Euler angles (roll, pitch, yaw) in degrees.

	    The quaternion is normalized first, so values that are only approximately
	    unit length (for example after rounding) still give correct angles.

	    Args:
	        w: Scalar component of the quaternion.
	        x: First vector component.
	        y: Second vector component.
	        z: Third vector component.

	    Returns:
	        ``(roll, pitch, yaw)`` in degrees: rotations about the X, Y and Z axes
	        respectively. A zero-length quaternion yields ``(0.0, 0.0, 0.0)``.
	    """
	    norm = math.sqrt(w * w + x * x + y * y + z * z)
	    if norm < 1e-12:
	        # No direction to normalize; report no rotation.
	        return 0.0, 0.0, 0.0
	    w, x, y, z = w / norm, x / norm, y / norm, z / norm

	    # Roll (X-axis rotation)
	    sinr_cosp = 2 * (w * x + y * z)
	    cosr_cosp = 1 - 2 * (x * x + y * y)
	    roll = math.atan2(sinr_cosp, cosr_cosp)

	    # Pitch (Y-axis rotation); clamp to +/-90 degrees when rounding pushes the
	    # sine outside asin's domain.
	    sinp = 2 * (w * y - z * x)
	    if abs(sinp) >= 1:
	        pitch = math.copysign(math.pi / 2, sinp)
	    else:
	        pitch = math.asin(sinp)

	    # Yaw (Z-axis rotation)
	    siny_cosp = 2 * (w * z + x * y)
	    cosy_cosp = 1 - 2 * (y * y + z * z)
	    yaw = math.atan2(siny_cosp, cosy_cosp)

	    return math.degrees(roll), math.degrees(pitch), math.degrees(yaw)


	def format_pose_result(pose: Dict) -> str:
	    """Format a pose dictionary as a multi-line, human-readable report.

	    Args:
	        pose: Mapping that may contain ``"position"`` (keys ``x``/``y``/``z``),
	            ``"orientation"`` (keys ``w``/``x``/``y``/``z``) and
	            ``"confidence"`` (a fraction, rendered as a percentage).

	    Returns:
	        The report text. Missing position components are shown as 0 and a
	        missing orientation is shown as the identity quaternion, the same
	        default that is used for the Euler-angle conversion.
	    """
	    pos = pose.get("position") or {}
	    ori = pose.get("orientation") or {}

	    # Use one set of defaults for both the printed quaternion and the Euler
	    # conversion so the two sections can never disagree.
	    qw = ori.get("w", 1)
	    qx = ori.get("x", 0)
	    qy = ori.get("y", 0)
	    qz = ori.get("z", 0)
	    roll, pitch, yaw = quat_to_euler(qw, qx, qy, qz)

	    lines = [
	        "━━━ Position (meters) ━━━",
	        f" X: {pos.get('x', 0):+.4f}",
	        f" Y: {pos.get('y', 0):+.4f}",
	        f" Z: {pos.get('z', 0):+.4f}",
	        "",
	        "━━━ Orientation (quaternion) ━━━",
	        f" W: {qw:+.6f}",
	        f" X: {qx:+.6f}",
	        f" Y: {qy:+.6f}",
	        f" Z: {qz:+.6f}",
	        "",
	        "━━━ Euler Angles (degrees) ━━━",
	        f" Roll: {roll:+.2f}°",
	        f" Pitch: {pitch:+.2f}°",
	        f" Yaw: {yaw:+.2f}°",
	    ]

	    if "confidence" in pose:
	        lines.append("")
	        lines.append(f"━━━ Confidence: {pose['confidence']:.2%} ━━━")

	    return "\n".join(lines)


	# ── Core API functions ───────────────────────────────────────────────────────

	def initialize_object(
	    object_id: str,
	    mesh_file,
	    reference_files: List,
	    fx: float,
	    fy: float,
	    cx: float,
	    cy: float,
	):
	    """Register an object with the backend from a mesh and optional reference images.

	    Args:
	        object_id: Name under which the object is registered.
	        mesh_file: Uploaded mesh, either an object exposing ``.name`` or a path
	            string.
	        reference_files: Optional uploads; entries exposing ``.name`` or plain
	            strings are forwarded, anything else is ignored.
	        fx, fy, cx, cy: Camera intrinsics forwarded to the backend unchanged.

	    Returns:
	        A status string: the backend result prefixed with a check mark, or a
	        message prefixed with a cross when validation or the call fails.
	    """
	    if not object_id:
	        return "❌ Please provide an Object ID"
	    if not mesh_file:
	        return "❌ Please upload a 3D mesh file (.obj, .stl, .ply)"

	    try:
	        from gradio_client import handle_file

	        client = _get_client()

	        # Wrap every usable reference upload for transfer to the backend.
	        ref_handles = []
	        for ref in reference_files or []:
	            if hasattr(ref, 'name'):
	                ref_handles.append(handle_file(ref.name))
	            elif isinstance(ref, str):
	                ref_handles.append(handle_file(ref))

	        mesh_source = mesh_file.name if hasattr(mesh_file, 'name') else mesh_file
	        mesh_handle = handle_file(mesh_source)

	        # The backend receives None rather than an empty list when there are
	        # no reference images.
	        references_arg = ref_handles if ref_handles else None
	        result = client.predict(
	            object_id,
	            mesh_handle,
	            references_arg,
	            fx, fy, cx, cy,
	            api_name="/gradio_initialize_cad",
	        )
	    except Exception as e:
	        logger.error(f"Initialization error: {e}", exc_info=True)
	        return f"❌ Error: {str(e)}"

	    return f"✅ {result}"


	def _save_temp_png(array: np.ndarray) -> str:
	    """Write *array* to a new temporary PNG file and return the file's path."""
	    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
	        path = tmp.name
	    try:
	        # Save after the handle is closed so the file is not opened twice.
	        Image.fromarray(array).save(path)
	    except Exception:
	        # Do not leave an empty file behind when encoding fails.
	        Path(path).unlink(missing_ok=True)
	        raise
	    return path


	def estimate_pose(
	    object_id: str,
	    query_image: np.ndarray,
	    depth_image: Optional[np.ndarray],
	    fx: float,
	    fy: float,
	    cx: float,
	    cy: float,
	    mask_method: str,
	):
	    """Estimate the 6D pose of an initialized object and visualize it.

	    The query (and optional depth) image is written to a temporary PNG, sent
	    to the backend's ``/gradio_estimate`` endpoint, and the returned text is
	    parsed for a pose. When a pose is found, axes and a bounding cube are
	    drawn on a copy of the query image. Temporary files are always removed
	    before returning.

	    Args:
	        object_id: Name the object was registered under.
	        query_image: Image array to estimate the pose in.
	        depth_image: Optional depth array, forwarded as a PNG when given.
	        fx, fy, cx, cy: Camera intrinsics, forwarded to the backend and used
	            to build the projection matrix for the overlay.
	        mask_method: Masking method name forwarded to the backend.

	    Returns:
	        ``(text, image)``. On validation failure the image is ``None``; on a
	        backend or processing error it is the unmodified query image; if no
	        pose could be parsed, the backend text is returned as-is with an
	        unannotated copy of the query image.
	    """
	    if query_image is None:
	        return "❌ Please upload a query image", None

	    if not object_id:
	        return "❌ Please provide the Object ID (must match initialization)", None

	    temp_paths: List[str] = []
	    try:
	        from gradio_client import handle_file

	        client = _get_client()

	        query_path = _save_temp_png(query_image)
	        temp_paths.append(query_path)

	        depth_path = None
	        if depth_image is not None:
	            depth_path = _save_temp_png(depth_image)
	            temp_paths.append(depth_path)

	        result = client.predict(
	            object_id,
	            handle_file(query_path),
	            handle_file(depth_path) if depth_path else None,
	            fx, fy, cx, cy,
	            mask_method,
	            None,  # mask_editor_data
	            api_name="/gradio_estimate",
	        )

	        # Only the first element of a sequence result (the text) is used here.
	        if isinstance(result, (list, tuple)):
	            text_result = result[0] if len(result) > 0 else ""
	        else:
	            text_result = str(result)

	        vis_image = query_image.copy()
	        pose_info = _parse_pose_text(text_result)
	        if not pose_info:
	            # Nothing parseable: hand the raw backend text to the user.
	            return text_result, vis_image

	        # Prefer a full matrix if one is present, otherwise rebuild it from
	        # position + quaternion (identity rotation when orientation is absent).
	        pose_mat = None
	        if "pose_matrix" in pose_info:
	            pose_mat = np.array(pose_info["pose_matrix"])
	        elif "position" in pose_info:
	            pose_mat = _quat_pos_to_matrix(
	                pose_info["position"],
	                pose_info.get("orientation", {"w": 1, "x": 0, "y": 0, "z": 0}),
	            )

	        if pose_mat is not None:
	            K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float64)
	            vis_image = draw_pose_axes(vis_image, pose_mat, K, axis_length=0.05)
	            vis_image = draw_bounding_box_from_pose(vis_image, pose_mat, K, size=0.03)

	        formatted = format_pose_result(pose_info)
	        return f"✅ Pose Estimated Successfully\n\n{formatted}", vis_image

	    except Exception as e:
	        logger.error(f"Pose estimation error: {e}", exc_info=True)
	        return f"❌ Error: {str(e)}", query_image

	    finally:
	        # The files were created with delete=False, so remove them explicitly;
	        # otherwise every request would leave PNGs in the temp directory.
	        for path in temp_paths:
	            try:
	                Path(path).unlink(missing_ok=True)
	            except OSError as cleanup_error:
	                logger.warning(f"Could not remove temp file {path}: {cleanup_error}")


	def _store_component(line: str, keys: Tuple[str, ...], target: Dict) -> None:
	    """Store the number from *line* under the first of *keys* whose ``key:`` tag it contains."""
	    for key in keys:
	        if f"{key}:" not in line:
	            continue
	        try:
	            # Text after the last colon; only the first token is read so a
	            # trailing unit such as "0.12 m" does not break the conversion.
	            target[key] = float(line.split(":")[-1].strip().split()[0])
	        except (ValueError, IndexError):
	            # Best effort: an unreadable value leaves the component unset.
	            pass
	        return


	def _parse_pose_text(text: str) -> Optional[Dict]:
	    """Parse pose information from the backend's text output.

	    The text is scanned line by line. A line containing ``Position:`` or
	    ``Orientation`` starts the corresponding section; inside a section, lines
	    containing ``x:``/``y:``/``z:`` (plus ``w:`` for orientation) supply the
	    components. A line containing ``Confidence`` ends the current section and
	    supplies the confidence, divided by 100 when the line contains ``%``.
	    Tags are matched as case-sensitive substrings.

	    Args:
	        text: Text returned by the backend.

	    Returns:
	        A dict with any of the keys ``"position"``, ``"orientation"`` and
	        ``"confidence"`` that could be read, or ``None`` when the text is
	        empty, contains "No poses", "failed" or "error" (the latter two
	        case-insensitively), or yields nothing.
	    """
	    if not text:
	        return None
	    lowered = text.lower()
	    if "No poses" in text or "failed" in lowered or "error" in lowered:
	        return None

	    pose: Dict = {}
	    position: Dict = {}
	    orientation: Dict = {}
	    section = None  # "position", "orientation" or None

	    for raw_line in text.strip().split("\n"):
	        line = raw_line.strip()

	        if "Position:" in line:
	            section = "position"
	            continue
	        if "Orientation" in line:
	            section = "orientation"
	            continue
	        if "Confidence" in line:
	            section = None
	            try:
	                val = line.split(":")[-1].strip().rstrip("%")
	                pose["confidence"] = float(val) / 100 if "%" in line else float(val)
	            except (ValueError, IndexError):
	                # Best effort: keep going without a confidence value.
	                pass
	            continue

	        if section == "position":
	            _store_component(line, ("x", "y", "z"), position)
	        elif section == "orientation":
	            _store_component(line, ("w", "x", "y", "z"), orientation)

	    if position:
	        pose["position"] = position
	    if orientation:
	        pose["orientation"] = orientation

	    return pose if pose else None


	def _quat_pos_to_matrix(pos: Dict, ori: Dict) -> np.ndarray:
	    """Convert position + quaternion to a 4x4 transformation matrix.

	    Args:
	        pos: Mapping with optional ``x``/``y``/``z`` keys (missing ones count
	            as 0).
	        ori: Mapping with optional ``w``/``x``/``y``/``z`` keys (missing ``w``
	            counts as 1, the others as 0).

	    Returns:
	        A float64 ``(4, 4)`` matrix with the rotation in the upper-left 3x3
	        block and the translation in the last column. The quaternion is
	        normalized before conversion; a zero-length quaternion is treated as
	        the identity rotation.
	    """
	    w = float(ori.get("w", 1))
	    x = float(ori.get("x", 0))
	    y = float(ori.get("y", 0))
	    z = float(ori.get("z", 0))

	    # Normalize so rounded or slightly off-unit input still yields an
	    # orthonormal rotation block.
	    norm = math.sqrt(w * w + x * x + y * y + z * z)
	    if norm < 1e-12:
	        w, x, y, z = 1.0, 0.0, 0.0, 0.0
	    else:
	        w, x, y, z = w / norm, x / norm, y / norm, z / norm

	    # Rotation matrix from a unit quaternion.
	    R = np.array([
	        [1 - 2 * (y * y + z * z), 2 * (x * y - w * z), 2 * (x * z + w * y)],
	        [2 * (x * y + w * z), 1 - 2 * (x * x + z * z), 2 * (y * z - w * x)],
	        [2 * (x * z - w * y), 2 * (y * z + w * x), 1 - 2 * (x * x + y * y)],
	    ], dtype=np.float64)

	    T = np.eye(4, dtype=np.float64)
	    T[:3, :3] = R
	    T[0, 3] = pos.get("x", 0)
	    T[1, 3] = pos.get("y", 0)
	    T[2, 3] = pos.get("z", 0)
	    return T


	# ── Gradio UI ────────────────────────────────────────────────────────────────

	# Markdown rendered at the top of the UI by build_ui().
	# Table rows use plain "|" separators: a backslash-escaped pipe is an invalid
	# Python escape sequence and makes Markdown render the row as literal text.
	DESCRIPTION = """
	# 🎯 FoundationPose — 6D Object Pose Estimation

	[FoundationPose](https://nvlabs.github.io/FoundationPose/) by NVIDIA is the #1 method on the
	[BOP Challenge 2024](https://bop.felk.cvut.cz/) benchmark for model-based 6D localization of unseen objects,
	achieving an AR score of 73.4 across 7 core datasets (LM-O, T-LESS, TUD-L, IC-BIN, ITODD, HB, YCB-V).

	### How it works
	1. Initialize: Upload a 3D mesh (.obj/.stl/.ply) of your object and optionally reference RGB images
	2. Estimate: Upload a query RGB image (+ optional depth) and the model estimates the full 6D pose
	3. Visualize: See the projected 3D axes and bounding box overlaid on the image

	The pose output is a 4×4 transformation matrix (rotation + translation) from object frame to camera frame.

	| Metric | Value |
	|--------|-------|
	| BOP AR Score | 73.4 |
	| BOP Rank (2024) | #1 (model-based unseen) |
	| Paper | [CVPR 2024](https://arxiv.org/abs/2312.08344) |
	| Input | RGB-D + CAD mesh |
	"""

	# Markdown help text rendered at the top of the "Initialize Object" tab by build_ui().
	INIT_HELP = """
	### 📋 Initialization Guide

	Required:
	- Object ID: A unique name for your object (e.g., "mug", "wrench")
	- 3D Mesh: Upload an `.obj`, `.stl`, or `.ply` file of the object

	Optional but recommended:
	- Reference Images: 1+ RGB images of the object from known viewpoints
	- Camera Intrinsics: Focal lengths (fx, fy) and principal point (cx, cy)

	> 💡 Tip: The default intrinsics work for the bundled test data. For your own images,
	> use the calibration values from your camera.
	"""

	# Markdown help text rendered at the top of the "Estimate Pose" tab by build_ui().
	ESTIMATE_HELP = """
	### 📋 Estimation Guide

	- Query Image: An RGB image containing the initialized object
	- Depth Image: Optional 16-bit depth map (improves accuracy significantly)
	- Mask Method:
	- `SlimSAM` — automatic segmentation (recommended)
	- `Otsu` — simple brightness-based thresholding

	> ⚠️ Important: Camera intrinsics must match the query image resolution.
	> If you resize the image, scale fx/fy/cx/cy proportionally.
	"""


	# Custom CSS passed to gr.Blocks by build_ui().
	_CUSTOM_CSS = """
	.pose-output { font-family: monospace; }
	.gr-button-primary { background: #6366f1 !important; }
	"""

	# Markdown body of the "About" tab. Table rows use plain "|" separators: a
	# backslash-escaped pipe is an invalid Python escape sequence and makes
	# Markdown render the row as literal text.
	_ABOUT_MARKDOWN = """
	## Architecture

	FoundationPose uses a two-stage pipeline:

	1. Pose Hypothesis Generation: Generates 42 coarse pose hypotheses from uniformly sampled viewpoints
	2. Transformer-based Refinement: A ResNet-34 backbone with 4-head attention refines each hypothesis
	3. Contrastive Ranking: InfoNCE loss ranks hypotheses, selecting the best pose

	### Training Data
	- 600K synthetic scenes rendered on Objaverse objects with LLM-aided texture augmentation
	- 1.2M training images — no real-world training data needed

	### Two Modes
	- Model-Based: Uses a CAD mesh for precise render-and-compare
	- Model-Free: Reconstructs a NeRF from 16-20 reference images

	## BOP Challenge 2024 Results

	| Dataset | AR Score |
	|---------|----------|
	| LM-O | 75.6 |
	| T-LESS | 64.6 |
	| TUD-L | 92.3 |
	| IC-BIN | 50.8 |
	| ITODD | 58.0 |
	| HB | 83.5 |
	| YCB-V | 88.9 |
	| Average | 73.4 |

	## Citation

	```bibtex
	@inproceedings{wen2024foundationpose,
	title={FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects},
	author={Wen, Bowen and Yang, Wei and Kautz, Jan and Birchfield, Stan},
	booktitle={CVPR},
	year={2024}
	}
	```

	## Links
	- [Paper (arXiv)](https://arxiv.org/abs/2312.08344)
	- [Project Page](https://nvlabs.github.io/FoundationPose/)
	- [GitHub](https://github.com/NVlabs/FoundationPose)
	- [Model Weights (HF Hub)](https://huggingface.co/gpue/foundationpose-weights)
	- [Backend Space](https://huggingface.co/spaces/gpue/foundationpose)
	- [BOP Challenge](https://bop.felk.cvut.cz/)
	"""

	# Markdown footer rendered below the tabs.
	_FOOTER_MARKDOWN = """
	---
	<center>

	Built with ❤️ using [FoundationPose](https://nvlabs.github.io/FoundationPose/) by NVIDIA
	and [Gradio](https://gradio.app) — Powered by the
	[FoundationPose backend Space](https://huggingface.co/spaces/gpue/foundationpose)

	</center>
	"""


	def _intrinsics_inputs():
	    """Create the "Camera Intrinsics" heading plus fx/fy/cx/cy inputs in the current layout."""
	    gr.Markdown("#### Camera Intrinsics")
	    with gr.Row():
	        fx = gr.Number(label="fx", value=193.137, precision=3)
	        fy = gr.Number(label="fy", value=193.137, precision=3)
	    with gr.Row():
	        cx = gr.Number(label="cx", value=120.0, precision=1)
	        cy = gr.Number(label="cy", value=80.0, precision=1)
	    return fx, fy, cx, cy


	def build_ui():
	    """Build the Gradio Blocks app and return it without launching.

	    The layout has three tabs: "Initialize Object" (wired to
	    ``initialize_object``), "Estimate Pose" (wired to ``estimate_pose``) and a
	    static "About" page, followed by a footer.

	    Returns:
	        The constructed ``gr.Blocks`` instance.
	    """
	    with gr.Blocks(
	        title="FoundationPose 6D Pose Estimation",
	        theme=gr.themes.Soft(),
	        css=_CUSTOM_CSS,
	    ) as demo:
	        gr.Markdown(DESCRIPTION)

	        with gr.Tabs():
	            # ── Tab 1: Initialize ─────────────────────────────────
	            with gr.Tab("① Initialize Object", id="init"):
	                gr.Markdown(INIT_HELP)

	                with gr.Row():
	                    with gr.Column(scale=1):
	                        init_object_id = gr.Textbox(
	                            label="Object ID",
	                            placeholder="e.g., target_cube",
	                            value="target_cube",
	                        )
	                        init_mesh = gr.File(
	                            label="3D Mesh (.obj / .stl / .ply)",
	                            file_count="single",
	                            file_types=[".obj", ".stl", ".ply", ".mesh"],
	                        )
	                        init_refs = gr.File(
	                            label="Reference Images (optional)",
	                            file_count="multiple",
	                            file_types=["image"],
	                        )

	                        init_fx, init_fy, init_cx, init_cy = _intrinsics_inputs()

	                        init_btn = gr.Button("🚀 Initialize Object", variant="primary", size="lg")

	                    with gr.Column(scale=1):
	                        init_result = gr.Textbox(
	                            label="Result",
	                            lines=6,
	                            interactive=False,
	                            elem_classes=["pose-output"],
	                        )

	                init_btn.click(
	                    fn=initialize_object,
	                    inputs=[init_object_id, init_mesh, init_refs, init_fx, init_fy, init_cx, init_cy],
	                    outputs=init_result,
	                )

	            # ── Tab 2: Estimate Pose ──────────────────────────────
	            with gr.Tab("② Estimate Pose", id="estimate"):
	                gr.Markdown(ESTIMATE_HELP)

	                with gr.Row():
	                    with gr.Column(scale=1):
	                        est_object_id = gr.Textbox(
	                            label="Object ID",
	                            placeholder="Must match initialization",
	                            value="target_cube",
	                        )
	                        est_query = gr.Image(
	                            label="Query Image (RGB)",
	                            type="numpy",
	                        )
	                        est_depth = gr.Image(
	                            label="Depth Image (optional, 16-bit PNG)",
	                            type="numpy",
	                        )
	                        est_mask = gr.Radio(
	                            choices=["SlimSAM", "Otsu"],
	                            value="SlimSAM",
	                            label="Mask Method",
	                        )

	                        est_fx, est_fy, est_cx, est_cy = _intrinsics_inputs()

	                        est_btn = gr.Button("🎯 Estimate Pose", variant="primary", size="lg")

	                    with gr.Column(scale=1):
	                        est_viz = gr.Image(
	                            label="Pose Visualization (axes + bounding box)",
	                            type="numpy",
	                        )
	                        est_result = gr.Textbox(
	                            label="Pose Output",
	                            lines=18,
	                            interactive=False,
	                            elem_classes=["pose-output"],
	                        )

	                # estimate_pose returns (text, image), hence this output order.
	                est_btn.click(
	                    fn=estimate_pose,
	                    inputs=[est_object_id, est_query, est_depth, est_fx, est_fy, est_cx, est_cy, est_mask],
	                    outputs=[est_result, est_viz],
	                )

	            # ── Tab 3: About ──────────────────────────────────────
	            with gr.Tab("ℹ️ About"):
	                gr.Markdown(_ABOUT_MARKDOWN)

	        gr.Markdown(_FOOTER_MARKDOWN)

	    return demo


	if __name__ == "__main__":
	    # Script entry point: build the interface, then serve it on all network
	    # interfaces at port 7860.
	    demo = build_ui()
	    demo.launch(
	        server_port=7860,
	        server_name="0.0.0.0",
	    )