Georg committed on
Commit
703d3c2
·
1 Parent(s): e15abf5

Fix API endpoints with FastAPI integration

Browse files

- Replace Gradio-only app with FastAPI + Gradio hybrid
- Add /api/initialize and /api/estimate REST endpoints using FastAPI
- Add FastAPI, uvicorn, pydantic dependencies
- Keep Gradio UI for web interface
- Properly expose REST API for robot-ml training integration

Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +60 -338
  3. app_simple.py +290 -0
  4. requirements.txt +5 -0
.gitignore CHANGED
@@ -44,3 +44,4 @@ flagged/
44
  # Test images
45
  test_images/
46
  reference_images/
 
 
44
  # Test images
45
  test_images/
46
  reference_images/
47
+ app_old.py
app.py CHANGED
@@ -1,28 +1,22 @@
1
  """
2
- FoundationPose Inference Server with ZeroGPU Support
3
 
4
- This Gradio app provides an API for 6D object pose estimation using FoundationPose.
5
- It's designed to be called from the robot-ml training pipeline via HTTP requests.
6
-
7
- API Endpoints:
8
- - /api/initialize: Set up tracking for an object with reference images
9
- - /api/estimate: Estimate 6D pose from a query image
10
  """
11
 
12
  import base64
13
- import io
14
  import json
15
  import logging
16
  import os
17
- from pathlib import Path
18
- from typing import Dict, List, Optional
19
 
20
  import cv2
21
  import gradio as gr
22
  import numpy as np
23
  import spaces
24
  import torch
25
- from PIL import Image
 
26
 
27
  logging.basicConfig(
28
  level=logging.INFO,
@@ -31,7 +25,7 @@ logging.basicConfig(
31
  logger = logging.getLogger(__name__)
32
 
33
  # Check if running in real FoundationPose mode or placeholder mode
34
- USE_REAL_MODEL = os.environ.get("USE_REAL_MODEL", "false").lower() == "true"
35
 
36
 
37
  class FoundationPoseInference:
@@ -82,27 +76,16 @@ class FoundationPoseInference:
82
  self,
83
  object_id: str,
84
  reference_images: List[np.ndarray],
85
- camera_intrinsics: Optional[Dict] = None,
86
- mesh_path: Optional[str] = None
87
  ) -> bool:
88
- """Register an object for tracking with reference images.
89
-
90
- Args:
91
- object_id: Unique identifier for the object
92
- reference_images: List of RGB images (numpy arrays) showing the object from different angles
93
- camera_intrinsics: Camera parameters (fx, fy, cx, cy)
94
- mesh_path: Optional path to CAD mesh file
95
-
96
- Returns:
97
- True if registration successful
98
- """
99
  if not self.initialized:
100
  self.initialize_model()
101
 
102
  logger.info(f"Registering object '{object_id}' with {len(reference_images)} reference images")
103
 
104
  if self.use_real_model and self.model is not None:
105
- # Use real FoundationPose model
106
  try:
107
  success = self.model.register_object(
108
  object_id=object_id,
@@ -121,7 +104,6 @@ class FoundationPoseInference:
121
  logger.error(f"Registration failed: {e}", exc_info=True)
122
  return False
123
  else:
124
- # Placeholder mode
125
  self.tracked_objects[object_id] = {
126
  "num_references": len(reference_images),
127
  "camera_intrinsics": camera_intrinsics,
@@ -130,39 +112,16 @@ class FoundationPoseInference:
130
  logger.info(f"✓ Object '{object_id}' registered (placeholder mode)")
131
  return True
132
 
133
- @spaces.GPU(duration=10) # Allocate GPU for 10 seconds per inference
134
  def estimate_pose(
135
  self,
136
  object_id: str,
137
  query_image: np.ndarray,
138
- camera_intrinsics: Optional[Dict] = None,
139
- depth_image: Optional[np.ndarray] = None,
140
- mask: Optional[np.ndarray] = None
141
  ) -> Dict:
142
- """Estimate 6D pose of an object in a query image.
143
-
144
- Args:
145
- object_id: ID of object to detect
146
- query_image: RGB query image as numpy array
147
- camera_intrinsics: Optional camera parameters
148
- depth_image: Optional depth map
149
- mask: Optional object segmentation mask
150
-
151
- Returns:
152
- Dictionary with pose estimation results:
153
- {
154
- "success": bool,
155
- "poses": [
156
- {
157
- "object_id": str,
158
- "position": {"x": float, "y": float, "z": float},
159
- "orientation": {"w": float, "x": float, "y": float, "z": float},
160
- "confidence": float,
161
- "dimensions": [float, float, float]
162
- }
163
- ]
164
- }
165
- """
166
  if not self.initialized:
167
  return {"success": False, "error": "Model not initialized"}
168
 
@@ -172,7 +131,6 @@ class FoundationPoseInference:
172
  logger.info(f"Estimating pose for object '{object_id}'")
173
 
174
  if self.use_real_model and self.model is not None:
175
- # Use real FoundationPose model
176
  try:
177
  pose_result = self.model.estimate_pose(
178
  object_id=object_id,
@@ -198,7 +156,6 @@ class FoundationPoseInference:
198
  logger.error(f"Pose estimation error: {e}", exc_info=True)
199
  return {"success": False, "error": str(e), "poses": []}
200
  else:
201
- # Placeholder mode - return empty poses
202
  logger.info("Placeholder mode: returning empty pose result")
203
  return {
204
  "success": True,
@@ -211,37 +168,33 @@ class FoundationPoseInference:
211
  pose_estimator = FoundationPoseInference()
212
 
213
 
214
- def initialize_api(request: gr.Request) -> Dict:
215
- """API endpoint for initializing object tracking.
 
 
 
 
216
 
217
- Request body:
218
- {
219
- "object_id": str,
220
- "reference_images_b64": [str, ...],
221
- "camera_intrinsics": str (JSON),
222
- "mesh_path": str (optional)
223
- }
224
 
225
- Returns:
226
- {"success": bool, "message": str}
227
- """
228
- try:
229
- data = request.json() if hasattr(request, 'json') else {}
 
230
 
231
- object_id = data.get("object_id")
232
- reference_images_b64 = data.get("reference_images_b64", [])
233
- camera_intrinsics_str = data.get("camera_intrinsics")
234
- mesh_path = data.get("mesh_path")
235
 
236
- if not object_id:
237
- return {"success": False, "error": "Missing object_id"}
238
 
239
- if not reference_images_b64:
240
- return {"success": False, "error": "Missing reference_images_b64"}
241
 
 
 
 
 
242
  # Decode reference images
243
  reference_images = []
244
- for img_b64 in reference_images_b64:
245
  img_bytes = base64.b64decode(img_b64)
246
  img_array = np.frombuffer(img_bytes, dtype=np.uint8)
247
  img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
@@ -249,81 +202,55 @@ def initialize_api(request: gr.Request) -> Dict:
249
  reference_images.append(img)
250
 
251
  # Parse camera intrinsics
252
- intrinsics = json.loads(camera_intrinsics_str) if camera_intrinsics_str else None
253
 
254
  # Register object
255
  success = pose_estimator.register_object(
256
- object_id=object_id,
257
  reference_images=reference_images,
258
  camera_intrinsics=intrinsics,
259
- mesh_path=mesh_path
260
  )
261
 
262
  return {
263
  "success": success,
264
- "message": f"Object '{object_id}' registered with {len(reference_images)} reference images"
265
  }
266
 
267
  except Exception as e:
268
  logger.error(f"Initialization error: {e}", exc_info=True)
269
- return {"success": False, "error": str(e)}
270
-
271
 
272
- def estimate_api(request: gr.Request) -> Dict:
273
- """API endpoint for pose estimation.
274
 
275
- Request body:
276
- {
277
- "object_id": str,
278
- "query_image_b64": str,
279
- "camera_intrinsics": str (JSON),
280
- "depth_image_b64": str (optional),
281
- "mask_b64": str (optional)
282
- }
283
-
284
- Returns:
285
- Pose estimation results
286
- """
287
  try:
288
- data = request.json() if hasattr(request, 'json') else {}
289
-
290
- object_id = data.get("object_id")
291
- query_image_b64 = data.get("query_image_b64")
292
- camera_intrinsics_str = data.get("camera_intrinsics")
293
- depth_image_b64 = data.get("depth_image_b64")
294
- mask_b64 = data.get("mask_b64")
295
-
296
- if not object_id:
297
- return {"success": False, "error": "Missing object_id"}
298
-
299
- if not query_image_b64:
300
- return {"success": False, "error": "Missing query_image_b64"}
301
-
302
  # Decode query image
303
- img_bytes = base64.b64decode(query_image_b64)
304
  img_array = np.frombuffer(img_bytes, dtype=np.uint8)
305
  img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
306
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
307
 
308
  # Decode optional depth image
309
  depth = None
310
- if depth_image_b64:
311
- depth_bytes = base64.b64decode(depth_image_b64)
312
  depth = np.frombuffer(depth_bytes, dtype=np.float32)
313
 
314
  # Decode optional mask
315
  mask = None
316
- if mask_b64:
317
- mask_bytes = base64.b64decode(mask_b64)
318
  mask_array = np.frombuffer(mask_bytes, dtype=np.uint8)
319
  mask = cv2.imdecode(mask_array, cv2.IMREAD_GRAYSCALE)
320
 
321
  # Parse camera intrinsics
322
- intrinsics = json.loads(camera_intrinsics_str) if camera_intrinsics_str else None
323
 
324
  # Estimate pose
325
  result = pose_estimator.estimate_pose(
326
- object_id=object_id,
327
  query_image=img,
328
  camera_intrinsics=intrinsics,
329
  depth_image=depth,
@@ -334,235 +261,30 @@ def estimate_api(request: gr.Request) -> Dict:
334
 
335
  except Exception as e:
336
  logger.error(f"Estimation error: {e}", exc_info=True)
337
- return {"success": False, "error": str(e)}
338
-
339
-
340
- # Gradio UI for testing
341
- def test_initialization(object_id: str, reference_images: List):
342
- """Test UI for initialization."""
343
- if not object_id:
344
- return "❌ Please enter an object ID"
345
-
346
- if not reference_images:
347
- return "❌ Please upload reference images"
348
-
349
- try:
350
- # Convert PIL images to numpy arrays
351
- ref_imgs = []
352
- for img in reference_images:
353
- ref_imgs.append(np.array(img))
354
-
355
- success = pose_estimator.register_object(object_id, ref_imgs, None)
356
-
357
- if success:
358
- return f"✅ Object '{object_id}' registered with {len(ref_imgs)} images"
359
- else:
360
- return "❌ Registration failed"
361
-
362
- except Exception as e:
363
- logger.error(f"Test initialization error: {e}", exc_info=True)
364
- return f"❌ Error: {str(e)}"
365
-
366
-
367
- def test_estimation(object_id: str, query_image):
368
- """Test UI for pose estimation."""
369
- if not object_id:
370
- return "❌ Please enter an object ID", None
371
-
372
- if query_image is None:
373
- return "❌ Please upload a query image", None
374
-
375
- try:
376
- query_img = np.array(query_image)
377
- result = pose_estimator.estimate_pose(object_id, query_img, None)
378
-
379
- if result["success"]:
380
- num_poses = len(result["poses"])
381
- output_text = f"✅ Detection complete: {num_poses} pose(s) detected\n\n"
382
 
383
- if num_poses == 0:
384
- output_text += "Note: " + result.get("note", "No poses detected")
385
- else:
386
- for i, pose in enumerate(result["poses"]):
387
- output_text += f"Pose {i+1}:\n"
388
- output_text += f" Position: ({pose['position']['x']:.3f}, {pose['position']['y']:.3f}, {pose['position']['z']:.3f})\n"
389
- output_text += f" Confidence: {pose['confidence']:.3f}\n\n"
390
 
391
- # TODO: Visualize detected pose on image
392
- output_image = query_image
393
-
394
- return output_text, output_image
395
- else:
396
- return f"❌ Detection failed: {result.get('error', 'Unknown error')}", None
397
-
398
- except Exception as e:
399
- logger.error(f"Test estimation error: {e}", exc_info=True)
400
- return f"❌ Error: {str(e)}", None
401
-
402
-
403
- # Build Gradio interface
404
- with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as demo:
405
  gr.Markdown("# 🎯 FoundationPose 6D Object Pose Estimation")
406
 
407
  mode_indicator = gr.Markdown(
408
- f"**Mode:** {'🟢 Real FoundationPose' if USE_REAL_MODEL else '🟡 Placeholder (set USE_REAL_MODEL=true)'}",
409
  elem_id="mode"
410
  )
411
 
412
  gr.Markdown("""
413
- This service provides 6D object pose estimation using FoundationPose.
414
-
415
- **Usage:**
416
- 1. Register an object with reference images using the Initialize tab
417
- 2. Estimate poses in query images using the Estimate tab
418
 
419
- **API Endpoints:**
420
- - POST `/api/initialize` - Register object with reference images
421
- - POST `/api/estimate` - Estimate 6D pose from query image
422
  """)
423
 
424
- with gr.Tab("🔧 Initialize Object"):
425
- gr.Markdown("### Register an object for tracking")
426
- with gr.Row():
427
- with gr.Column():
428
- init_object_id = gr.Textbox(
429
- label="Object ID",
430
- placeholder="e.g., target_cube",
431
- info="Unique identifier for the object"
432
- )
433
- init_ref_images = gr.File(
434
- label="Reference Images (16-20 recommended)",
435
- file_count="multiple",
436
- file_types=["image"],
437
- type="filepath"
438
- )
439
- init_button = gr.Button("Register Object", variant="primary", size="lg")
440
- with gr.Column():
441
- init_output = gr.Textbox(label="Result", lines=8)
442
-
443
- gr.Markdown("""
444
- **Tips:**
445
- - Capture 16-20 images from different viewpoints
446
- - Include various angles and distances
447
- - Ensure good lighting and sharp focus
448
- """)
449
-
450
- init_button.click(
451
- fn=test_initialization,
452
- inputs=[init_object_id, init_ref_images],
453
- outputs=init_output
454
- )
455
-
456
- with gr.Tab("🔍 Estimate Pose"):
457
- gr.Markdown("### Detect object pose in a query image")
458
- with gr.Row():
459
- with gr.Column():
460
- est_object_id = gr.Textbox(
461
- label="Object ID",
462
- placeholder="e.g., target_cube",
463
- info="Must match an initialized object"
464
- )
465
- est_query_image = gr.Image(
466
- label="Query Image",
467
- type="pil",
468
- sources=["upload", "webcam"]
469
- )
470
- est_button = gr.Button("Estimate Pose", variant="primary", size="lg")
471
- with gr.Column():
472
- est_output_text = gr.Textbox(label="Detection Results", lines=15)
473
- est_output_image = gr.Image(label="Visualization (coming soon)")
474
-
475
- est_button.click(
476
- fn=test_estimation,
477
- inputs=[est_object_id, est_query_image],
478
- outputs=[est_output_text, est_output_image]
479
- )
480
-
481
- with gr.Tab("📖 API Documentation"):
482
- gr.Markdown("""
483
- ### HTTP API
484
-
485
- #### Initialize Object
486
- ```bash
487
- curl -X POST https://gpue-foundationpose.hf.space/api/initialize \\
488
- -H "Content-Type: application/json" \\
489
- -d '{
490
- "object_id": "target_cube",
491
- "reference_images_b64": ["<base64-encoded-jpeg>", ...],
492
- "camera_intrinsics": "{\\"fx\\": 500, \\"fy\\": 500, \\"cx\\": 320, \\"cy\\": 240}"
493
- }'
494
- ```
495
-
496
- #### Estimate Pose
497
- ```bash
498
- curl -X POST https://gpue-foundationpose.hf.space/api/estimate \\
499
- -H "Content-Type: application/json" \\
500
- -d '{
501
- "object_id": "target_cube",
502
- "query_image_b64": "<base64-encoded-jpeg>",
503
- "camera_intrinsics": "{\\"fx\\": 500, \\"fy\\": 500, \\"cx\\": 320, \\"cy\\": 240}"
504
- }'
505
- ```
506
-
507
- **Response Format:**
508
- ```json
509
- {
510
- "success": true,
511
- "poses": [
512
- {
513
- "object_id": "target_cube",
514
- "position": {"x": 0.5, "y": 0.3, "z": 0.1},
515
- "orientation": {"w": 1.0, "x": 0.0, "y": 0.0, "z": 0.0},
516
- "confidence": 0.95,
517
- "dimensions": [0.1, 0.1, 0.1]
518
- }
519
- ]
520
- }
521
- ```
522
-
523
- ### Integration with robot-ml
524
-
525
- ```python
526
- from foundationpose.client import FoundationPoseClient
527
-
528
- client = FoundationPoseClient("https://gpue-foundationpose.hf.space")
529
-
530
- # Load reference images
531
- ref_images = load_reference_images("./perception/reference/target_cube")
532
 
533
- # Initialize object
534
- client.initialize("target_cube", ref_images)
535
 
536
- # Estimate pose
537
- poses = client.estimate_pose("target_cube", query_image)
538
- ```
539
- """)
540
-
541
- gr.Markdown("""
542
- ---
543
- **Citation:**
544
- ```bibtex
545
- @inproceedings{wen2023foundationpose,
546
- title={FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects},
547
- author={Wen, Bowen and Yang, Wei and Kautz, Jan and Birchfield, Stan},
548
- booktitle={CVPR},
549
- year={2024}
550
- }
551
- ```
552
-
553
- [GitHub](https://github.com/NVlabs/FoundationPose) | [Paper](https://arxiv.org/abs/2312.08344)
554
- """)
555
-
556
-
557
- # Launch app
558
  if __name__ == "__main__":
559
- logger.info("=" * 60)
560
- logger.info("FoundationPose Inference Server Starting")
561
- logger.info(f"Mode: {'Real Model' if USE_REAL_MODEL else 'Placeholder'}")
562
- logger.info("=" * 60)
563
-
564
- demo.launch(
565
- server_name="0.0.0.0",
566
- server_port=7860,
567
- share=False
568
- )
 
1
  """
2
+ Simple FoundationPose API server using FastAPI + Gradio
3
 
4
+ This version uses FastAPI for clean REST API endpoints alongside Gradio UI.
 
 
 
 
 
5
  """
6
 
7
  import base64
 
8
  import json
9
  import logging
10
  import os
11
+ from typing import Dict, List
 
12
 
13
  import cv2
14
  import gradio as gr
15
  import numpy as np
16
  import spaces
17
  import torch
18
+ from fastapi import FastAPI, HTTPException
19
+ from pydantic import BaseModel
20
 
21
  logging.basicConfig(
22
  level=logging.INFO,
 
25
  logger = logging.getLogger(__name__)
26
 
27
  # Check if running in real FoundationPose mode or placeholder mode
28
+ USE_REAL_MODEL = os.environ.get("USE_REAL_MODEL", "false").lower() == "true"
29
 
30
 
31
  class FoundationPoseInference:
 
76
  self,
77
  object_id: str,
78
  reference_images: List[np.ndarray],
79
+ camera_intrinsics: Dict = None,
80
+ mesh_path: str = None
81
  ) -> bool:
82
+ """Register an object for tracking with reference images."""
 
 
 
 
 
 
 
 
 
 
83
  if not self.initialized:
84
  self.initialize_model()
85
 
86
  logger.info(f"Registering object '{object_id}' with {len(reference_images)} reference images")
87
 
88
  if self.use_real_model and self.model is not None:
 
89
  try:
90
  success = self.model.register_object(
91
  object_id=object_id,
 
104
  logger.error(f"Registration failed: {e}", exc_info=True)
105
  return False
106
  else:
 
107
  self.tracked_objects[object_id] = {
108
  "num_references": len(reference_images),
109
  "camera_intrinsics": camera_intrinsics,
 
112
  logger.info(f"✓ Object '{object_id}' registered (placeholder mode)")
113
  return True
114
 
115
+ @spaces.GPU(duration=10)
116
  def estimate_pose(
117
  self,
118
  object_id: str,
119
  query_image: np.ndarray,
120
+ camera_intrinsics: Dict = None,
121
+ depth_image: np.ndarray = None,
122
+ mask: np.ndarray = None
123
  ) -> Dict:
124
+ """Estimate 6D pose of an object in a query image."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  if not self.initialized:
126
  return {"success": False, "error": "Model not initialized"}
127
 
 
131
  logger.info(f"Estimating pose for object '{object_id}'")
132
 
133
  if self.use_real_model and self.model is not None:
 
134
  try:
135
  pose_result = self.model.estimate_pose(
136
  object_id=object_id,
 
156
  logger.error(f"Pose estimation error: {e}", exc_info=True)
157
  return {"success": False, "error": str(e), "poses": []}
158
  else:
 
159
  logger.info("Placeholder mode: returning empty pose result")
160
  return {
161
  "success": True,
 
168
  pose_estimator = FoundationPoseInference()
169
 
170
 
171
+ # Pydantic models for API
172
+ class InitializeRequest(BaseModel):
173
+ object_id: str
174
+ reference_images_b64: List[str]
175
+ camera_intrinsics: str = None
176
+ mesh_path: str = None
177
 
 
 
 
 
 
 
 
178
 
179
+ class EstimateRequest(BaseModel):
180
+ object_id: str
181
+ query_image_b64: str
182
+ camera_intrinsics: str = None
183
+ depth_image_b64: str = None
184
+ mask_b64: str = None
185
 
 
 
 
 
186
 
187
+ # Create FastAPI app
188
+ app = FastAPI()
189
 
 
 
190
 
191
+ @app.post("/api/initialize")
192
+ async def api_initialize(request: InitializeRequest):
193
+ """Initialize object tracking with reference images."""
194
+ try:
195
  # Decode reference images
196
  reference_images = []
197
+ for img_b64 in request.reference_images_b64:
198
  img_bytes = base64.b64decode(img_b64)
199
  img_array = np.frombuffer(img_bytes, dtype=np.uint8)
200
  img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
 
202
  reference_images.append(img)
203
 
204
  # Parse camera intrinsics
205
+ intrinsics = json.loads(request.camera_intrinsics) if request.camera_intrinsics else None
206
 
207
  # Register object
208
  success = pose_estimator.register_object(
209
+ object_id=request.object_id,
210
  reference_images=reference_images,
211
  camera_intrinsics=intrinsics,
212
+ mesh_path=request.mesh_path
213
  )
214
 
215
  return {
216
  "success": success,
217
+ "message": f"Object '{request.object_id}' registered with {len(reference_images)} reference images"
218
  }
219
 
220
  except Exception as e:
221
  logger.error(f"Initialization error: {e}", exc_info=True)
222
+ raise HTTPException(status_code=500, detail=str(e))
 
223
 
 
 
224
 
225
+ @app.post("/api/estimate")
226
+ async def api_estimate(request: EstimateRequest):
227
+ """Estimate 6D pose from query image."""
 
 
 
 
 
 
 
 
 
228
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  # Decode query image
230
+ img_bytes = base64.b64decode(request.query_image_b64)
231
  img_array = np.frombuffer(img_bytes, dtype=np.uint8)
232
  img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
233
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
234
 
235
  # Decode optional depth image
236
  depth = None
237
+ if request.depth_image_b64:
238
+ depth_bytes = base64.b64decode(request.depth_image_b64)
239
  depth = np.frombuffer(depth_bytes, dtype=np.float32)
240
 
241
  # Decode optional mask
242
  mask = None
243
+ if request.mask_b64:
244
+ mask_bytes = base64.b64decode(request.mask_b64)
245
  mask_array = np.frombuffer(mask_bytes, dtype=np.uint8)
246
  mask = cv2.imdecode(mask_array, cv2.IMREAD_GRAYSCALE)
247
 
248
  # Parse camera intrinsics
249
+ intrinsics = json.loads(request.camera_intrinsics) if request.camera_intrinsics else None
250
 
251
  # Estimate pose
252
  result = pose_estimator.estimate_pose(
253
+ object_id=request.object_id,
254
  query_image=img,
255
  camera_intrinsics=intrinsics,
256
  depth_image=depth,
 
261
 
262
  except Exception as e:
263
  logger.error(f"Estimation error: {e}", exc_info=True)
264
+ raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
 
 
 
 
 
 
 
266
 
267
+ # Gradio UI (simplified)
268
+ with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as gradio_app:
 
 
 
 
 
 
 
 
 
 
 
 
269
  gr.Markdown("# 🎯 FoundationPose 6D Object Pose Estimation")
270
 
271
  mode_indicator = gr.Markdown(
272
+ f"**Mode:** {'🟢 Real FoundationPose' if USE_REAL_MODEL else '🟡 Placeholder'}",
273
  elem_id="mode"
274
  )
275
 
276
  gr.Markdown("""
277
+ API Endpoints:
278
+ - POST `/api/initialize` - Register object
279
+ - POST `/api/estimate` - Estimate pose
 
 
280
 
281
+ See documentation for usage examples.
 
 
282
  """)
283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
+ # Mount Gradio to FastAPI
286
+ app = gr.mount_gradio_app(app, gradio_app, path="/")
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  if __name__ == "__main__":
289
+ import uvicorn
290
+ uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
app_simple.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple FoundationPose API server using FastAPI + Gradio
3
+
4
+ This version uses FastAPI for clean REST API endpoints alongside Gradio UI.
5
+ """
6
+
7
+ import base64
8
+ import json
9
+ import logging
10
+ import os
11
+ from typing import Dict, List
12
+
13
+ import cv2
14
+ import gradio as gr
15
+ import numpy as np
16
+ import spaces
17
+ import torch
18
+ from fastapi import FastAPI, HTTPException
19
+ from pydantic import BaseModel
20
+
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format="[%(asctime)s] %(levelname)s: %(message)s"
24
+ )
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # Check if running in real FoundationPose mode or placeholder mode
28
+ USE_REAL_MODEL = os.environ.get("USE_REAL_MODEL", "false").lower() == "true"
29
+
30
+
31
+ class FoundationPoseInference:
32
+ """Wrapper for FoundationPose model inference."""
33
+
34
+ def __init__(self):
35
+ self.model = None
36
+ self.device = None
37
+ self.initialized = False
38
+ self.tracked_objects = {}
39
+ self.use_real_model = USE_REAL_MODEL
40
+
41
+ @spaces.GPU(duration=120) # Allocate GPU for 120 seconds (includes model loading)
42
+ def initialize_model(self):
43
+ """Initialize the FoundationPose model on GPU."""
44
+ if self.initialized:
45
+ logger.info("Model already initialized")
46
+ return
47
+
48
+ logger.info("Initializing FoundationPose model...")
49
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
50
+ logger.info(f"Using device: {self.device}")
51
+
52
+ if self.use_real_model:
53
+ try:
54
+ logger.info("Loading real FoundationPose model...")
55
+ from estimator import FoundationPoseEstimator
56
+
57
+ self.model = FoundationPoseEstimator(
58
+ device=str(self.device),
59
+ weights_dir="weights"
60
+ )
61
+ logger.info("✓ Real FoundationPose model initialized successfully")
62
+
63
+ except Exception as e:
64
+ logger.error(f"Failed to initialize real model: {e}", exc_info=True)
65
+ logger.warning("Falling back to placeholder mode")
66
+ self.use_real_model = False
67
+ self.model = None
68
+ else:
69
+ logger.info("Using placeholder mode (set USE_REAL_MODEL=true for real inference)")
70
+ self.model = None
71
+
72
+ self.initialized = True
73
+ logger.info("FoundationPose inference ready")
74
+
75
+ def register_object(
76
+ self,
77
+ object_id: str,
78
+ reference_images: List[np.ndarray],
79
+ camera_intrinsics: Dict = None,
80
+ mesh_path: str = None
81
+ ) -> bool:
82
+ """Register an object for tracking with reference images."""
83
+ if not self.initialized:
84
+ self.initialize_model()
85
+
86
+ logger.info(f"Registering object '{object_id}' with {len(reference_images)} reference images")
87
+
88
+ if self.use_real_model and self.model is not None:
89
+ try:
90
+ success = self.model.register_object(
91
+ object_id=object_id,
92
+ reference_images=reference_images,
93
+ camera_intrinsics=camera_intrinsics,
94
+ mesh_path=mesh_path
95
+ )
96
+ if success:
97
+ self.tracked_objects[object_id] = {
98
+ "num_references": len(reference_images),
99
+ "camera_intrinsics": camera_intrinsics,
100
+ "mesh_path": mesh_path
101
+ }
102
+ return success
103
+ except Exception as e:
104
+ logger.error(f"Registration failed: {e}", exc_info=True)
105
+ return False
106
+ else:
107
+ self.tracked_objects[object_id] = {
108
+ "num_references": len(reference_images),
109
+ "camera_intrinsics": camera_intrinsics,
110
+ "mesh_path": mesh_path
111
+ }
112
+ logger.info(f"✓ Object '{object_id}' registered (placeholder mode)")
113
+ return True
114
+
115
+ @spaces.GPU(duration=10)
116
+ def estimate_pose(
117
+ self,
118
+ object_id: str,
119
+ query_image: np.ndarray,
120
+ camera_intrinsics: Dict = None,
121
+ depth_image: np.ndarray = None,
122
+ mask: np.ndarray = None
123
+ ) -> Dict:
124
+ """Estimate 6D pose of an object in a query image."""
125
+ if not self.initialized:
126
+ return {"success": False, "error": "Model not initialized"}
127
+
128
+ if object_id not in self.tracked_objects:
129
+ return {"success": False, "error": f"Object '{object_id}' not registered"}
130
+
131
+ logger.info(f"Estimating pose for object '{object_id}'")
132
+
133
+ if self.use_real_model and self.model is not None:
134
+ try:
135
+ pose_result = self.model.estimate_pose(
136
+ object_id=object_id,
137
+ rgb_image=query_image,
138
+ depth_image=depth_image,
139
+ mask=mask,
140
+ camera_intrinsics=camera_intrinsics
141
+ )
142
+
143
+ if pose_result is None:
144
+ return {
145
+ "success": False,
146
+ "error": "Pose estimation returned None",
147
+ "poses": []
148
+ }
149
+
150
+ return {
151
+ "success": True,
152
+ "poses": [pose_result]
153
+ }
154
+
155
+ except Exception as e:
156
+ logger.error(f"Pose estimation error: {e}", exc_info=True)
157
+ return {"success": False, "error": str(e), "poses": []}
158
+ else:
159
+ logger.info("Placeholder mode: returning empty pose result")
160
+ return {
161
+ "success": True,
162
+ "poses": [],
163
+ "note": "Placeholder mode - set USE_REAL_MODEL=true for real inference"
164
+ }
165
+
166
+
167
+ # Global model instance
168
+ pose_estimator = FoundationPoseInference()
169
+
170
+
171
+ # Pydantic models for API
172
+ class InitializeRequest(BaseModel):
173
+ object_id: str
174
+ reference_images_b64: List[str]
175
+ camera_intrinsics: str = None
176
+ mesh_path: str = None
177
+
178
+
179
+ class EstimateRequest(BaseModel):
180
+ object_id: str
181
+ query_image_b64: str
182
+ camera_intrinsics: str = None
183
+ depth_image_b64: str = None
184
+ mask_b64: str = None
185
+
186
+
187
+ # Create FastAPI app
188
+ app = FastAPI()
189
+
190
+
191
+ @app.post("/api/initialize")
192
+ async def api_initialize(request: InitializeRequest):
193
+ """Initialize object tracking with reference images."""
194
+ try:
195
+ # Decode reference images
196
+ reference_images = []
197
+ for img_b64 in request.reference_images_b64:
198
+ img_bytes = base64.b64decode(img_b64)
199
+ img_array = np.frombuffer(img_bytes, dtype=np.uint8)
200
+ img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
201
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
202
+ reference_images.append(img)
203
+
204
+ # Parse camera intrinsics
205
+ intrinsics = json.loads(request.camera_intrinsics) if request.camera_intrinsics else None
206
+
207
+ # Register object
208
+ success = pose_estimator.register_object(
209
+ object_id=request.object_id,
210
+ reference_images=reference_images,
211
+ camera_intrinsics=intrinsics,
212
+ mesh_path=request.mesh_path
213
+ )
214
+
215
+ return {
216
+ "success": success,
217
+ "message": f"Object '{request.object_id}' registered with {len(reference_images)} reference images"
218
+ }
219
+
220
+ except Exception as e:
221
+ logger.error(f"Initialization error: {e}", exc_info=True)
222
+ raise HTTPException(status_code=500, detail=str(e))
223
+
224
+
225
+ @app.post("/api/estimate")
226
+ async def api_estimate(request: EstimateRequest):
227
+ """Estimate 6D pose from query image."""
228
+ try:
229
+ # Decode query image
230
+ img_bytes = base64.b64decode(request.query_image_b64)
231
+ img_array = np.frombuffer(img_bytes, dtype=np.uint8)
232
+ img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
233
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
234
+
235
+ # Decode optional depth image
236
+ depth = None
237
+ if request.depth_image_b64:
238
+ depth_bytes = base64.b64decode(request.depth_image_b64)
239
+ depth = np.frombuffer(depth_bytes, dtype=np.float32)
240
+
241
+ # Decode optional mask
242
+ mask = None
243
+ if request.mask_b64:
244
+ mask_bytes = base64.b64decode(request.mask_b64)
245
+ mask_array = np.frombuffer(mask_bytes, dtype=np.uint8)
246
+ mask = cv2.imdecode(mask_array, cv2.IMREAD_GRAYSCALE)
247
+
248
+ # Parse camera intrinsics
249
+ intrinsics = json.loads(request.camera_intrinsics) if request.camera_intrinsics else None
250
+
251
+ # Estimate pose
252
+ result = pose_estimator.estimate_pose(
253
+ object_id=request.object_id,
254
+ query_image=img,
255
+ camera_intrinsics=intrinsics,
256
+ depth_image=depth,
257
+ mask=mask
258
+ )
259
+
260
+ return result
261
+
262
+ except Exception as e:
263
+ logger.error(f"Estimation error: {e}", exc_info=True)
264
+ raise HTTPException(status_code=500, detail=str(e))
265
+
266
+
267
+ # Gradio UI (simplified)
268
+ with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as gradio_app:
269
+ gr.Markdown("# 🎯 FoundationPose 6D Object Pose Estimation")
270
+
271
+ mode_indicator = gr.Markdown(
272
+ f"**Mode:** {'🟢 Real FoundationPose' if USE_REAL_MODEL else '🟡 Placeholder'}",
273
+ elem_id="mode"
274
+ )
275
+
276
+ gr.Markdown("""
277
+ API Endpoints:
278
+ - POST `/api/initialize` - Register object
279
+ - POST `/api/estimate` - Estimate pose
280
+
281
+ See documentation for usage examples.
282
+ """)
283
+
284
+
285
+ # Mount Gradio to FastAPI
286
+ app = gr.mount_gradio_app(app, gradio_app, path="/")
287
+
288
+ if __name__ == "__main__":
289
+ import uvicorn
290
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -5,6 +5,11 @@ numpy>=1.24.0
5
  opencv-python>=4.8.0
6
  Pillow>=10.0.0
7
 
 
 
 
 
 
8
  # Hugging Face
9
  huggingface_hub>=0.20.0
10
 
 
5
  opencv-python>=4.8.0
6
  Pillow>=10.0.0
7
 
8
+ # FastAPI for REST API endpoints
9
+ fastapi>=0.109.0
10
+ uvicorn>=0.27.0
11
+ pydantic>=2.0.0
12
+
13
  # Hugging Face
14
  huggingface_hub>=0.20.0
15