Spaces:

gpue
/

foundationpose

Sleeping

App Files Files Community

Georg committed about 1 month ago

Commit

9550d40

1 Parent(s): f592ee6

Implement real FoundationPose inference with model-based pose estimation

Browse files

Files changed (1) hide show

estimator.py +201 -9

estimator.py CHANGED Viewed

@@ -11,6 +11,7 @@ from typing import Dict, List, Optional
 import numpy as np
 import torch
 logger = logging.getLogger(__name__)
@@ -19,6 +20,18 @@ FOUNDATIONPOSE_ROOT = Path("/app/FoundationPose")
 if FOUNDATIONPOSE_ROOT.exists():
     sys.path.insert(0, str(FOUNDATIONPOSE_ROOT))
 class FoundationPoseEstimator:
     """Wrapper for FoundationPose model."""
@@ -32,8 +45,10 @@ class FoundationPoseEstimator:
         """
         self.device = device
         self.weights_dir = Path(weights_dir)
-        self.model = None
         self.registered_objects = {}
         # Check if FoundationPose is available
         if not FOUNDATIONPOSE_ROOT.exists():
@@ -42,11 +57,16 @@ class FoundationPoseEstimator:
                 "Clone it with: git clone https://github.com/NVlabs/FoundationPose.git"
             )
         # Check if weights exist
         if not self.weights_dir.exists() or not any(self.weights_dir.glob("**/*.pth")):
             logger.warning(f"No model weights found in {self.weights_dir}")
             logger.warning("Model will not work without weights")
         logger.info(f"FoundationPose estimator initialized (device: {device})")
     def register_object(
@@ -68,12 +88,24 @@ class FoundationPoseEstimator:
             True if registration successful
         """
         try:
             # Store object registration
             self.registered_objects[object_id] = {
                 "num_references": len(reference_images),
                 "camera_intrinsics": camera_intrinsics,
                 "mesh_path": mesh_path,
-                "reference_images": reference_images  # Keep for now
             }
             logger.info(f"✓ Registered object '{object_id}' with {len(reference_images)} reference images")
@@ -107,16 +139,176 @@ class FoundationPoseEstimator:
             logger.error(f"Object '{object_id}' not registered")
             return None
         try:
-            # TODO: Implement actual FoundationPose inference
-            # This is a placeholder that would need to:
-            # 1. Load the FoundationPose model if not loaded
-            # 2. Run pose estimation on the query image
-            # 3. Return the estimated pose
-            logger.warning("FoundationPose inference not yet implemented - returning None")
-            return None
         except Exception as e:
             logger.error(f"Pose estimation failed: {e}", exc_info=True)
             return None

 import numpy as np
 import torch
+import cv2
 logger = logging.getLogger(__name__)
 if FOUNDATIONPOSE_ROOT.exists():
     sys.path.insert(0, str(FOUNDATIONPOSE_ROOT))
+# Try to import FoundationPose modules
+try:
+    from estimater import FoundationPose
+    from learning.training.predict_score import ScorePredictor
+    from learning.training.predict_pose_refine import PoseRefinePredictor
+    import nvdiffrast.torch as dr
+    import trimesh
+    FOUNDATIONPOSE_AVAILABLE = True
+except ImportError as e:
+    logger.warning(f"FoundationPose modules not available: {e}")
+    FOUNDATIONPOSE_AVAILABLE = False
 class FoundationPoseEstimator:
     """Wrapper for FoundationPose model."""
         """
         self.device = device
         self.weights_dir = Path(weights_dir)
         self.registered_objects = {}
+        self.scorer = None
+        self.refiner = None
+        self.glctx = None
         # Check if FoundationPose is available
         if not FOUNDATIONPOSE_ROOT.exists():
                 "Clone it with: git clone https://github.com/NVlabs/FoundationPose.git"
             )
+        if not FOUNDATIONPOSE_AVAILABLE:
+            logger.warning("FoundationPose modules not loaded - inference will not work")
+            return
         # Check if weights exist
         if not self.weights_dir.exists() or not any(self.weights_dir.glob("**/*.pth")):
             logger.warning(f"No model weights found in {self.weights_dir}")
             logger.warning("Model will not work without weights")
+        # Initialize predictors (lazy loading - only when needed)
         logger.info(f"FoundationPose estimator initialized (device: {device})")
     def register_object(
             True if registration successful
         """
         try:
+            # Load mesh if provided
+            mesh = None
+            if mesh_path and Path(mesh_path).exists():
+                try:
+                    mesh = trimesh.load(mesh_path)
+                    logger.info(f"Loaded mesh for '{object_id}' from {mesh_path}")
+                except Exception as e:
+                    logger.warning(f"Failed to load mesh: {e}")
             # Store object registration
             self.registered_objects[object_id] = {
                 "num_references": len(reference_images),
                 "camera_intrinsics": camera_intrinsics,
                 "mesh_path": mesh_path,
+                "mesh": mesh,
+                "reference_images": reference_images,
+                "estimator": None,  # Will be created lazily
+                "pose_last": None   # Track last pose for temporal tracking
             }
             logger.info(f"✓ Registered object '{object_id}' with {len(reference_images)} reference images")
             logger.error(f"Object '{object_id}' not registered")
             return None
+        if not FOUNDATIONPOSE_AVAILABLE:
+            logger.error("FoundationPose not available")
+            return None
         try:
+            obj_data = self.registered_objects[object_id]
+            # Initialize predictors if not done yet
+            if self.scorer is None:
+                logger.info("Initializing score predictor...")
+                self.scorer = ScorePredictor()
+                logger.info("Initializing pose refiner...")
+                self.refiner = PoseRefinePredictor()
+                logger.info("Initializing CUDA rasterizer...")
+                self.glctx = dr.RasterizeCudaContext()
+            # Initialize object-specific estimator if not done yet
+            if obj_data["estimator"] is None:
+                logger.info(f"Creating FoundationPose estimator for '{object_id}'...")
+                mesh = obj_data["mesh"]
+                if mesh is not None:
+                    # Model-based mode: use mesh
+                    logger.info("Using model-based mode with mesh")
+                    obj_data["estimator"] = FoundationPose(
+                        model_pts=mesh.vertices,
+                        model_normals=mesh.vertex_normals,
+                        mesh=mesh,
+                        scorer=self.scorer,
+                        refiner=self.refiner,
+                        glctx=self.glctx,
+                        debug=0
+                    )
+                else:
+                    # Model-free mode: would need reference-based initialization
+                    # For now, return error
+                    logger.error("Model-free mode not yet implemented - mesh required")
+                    return None
+            estimator = obj_data["estimator"]
+            # Prepare camera intrinsics matrix
+            K = self._get_camera_matrix(camera_intrinsics or obj_data["camera_intrinsics"])
+            if K is None:
+                logger.error("Camera intrinsics required")
+                return None
+            # Generate or use depth if not provided
+            if depth_image is None:
+                # Create dummy depth for model-based case
+                depth_image = np.ones((rgb_image.shape[0], rgb_image.shape[1]), dtype=np.float32) * 0.5
+            # Generate mask if not provided
+            if mask is None:
+                # Use simple foreground detection or full image
+                mask = np.ones((rgb_image.shape[0], rgb_image.shape[1]), dtype=bool)
+            # First frame or lost tracking: register
+            if obj_data["pose_last"] is None:
+                logger.info("Running registration (first frame)...")
+                pose = estimator.register(
+                    K=K,
+                    rgb=rgb_image,
+                    depth=depth_image,
+                    ob_mask=mask,
+                    iteration=5  # Number of refinement iterations
+                )
+            else:
+                # Subsequent frames: track
+                pose = estimator.track_one(
+                    rgb=rgb_image,
+                    depth=depth_image,
+                    K=K,
+                    iteration=2  # Fewer iterations for tracking
+                )
+            # Store pose for next frame
+            obj_data["pose_last"] = pose
+            if pose is None:
+                logger.warning("Pose estimation returned None")
+                return None
+            # Convert pose to our format
+            # pose is a 4x4 transformation matrix
+            return self._format_pose_output(pose)
         except Exception as e:
             logger.error(f"Pose estimation failed: {e}", exc_info=True)
+            import traceback
+            traceback.print_exc()
             return None
+    def _get_camera_matrix(self, intrinsics: Optional[Dict]) -> Optional[np.ndarray]:
+        """Convert intrinsics dict to camera matrix.
+
+        Args:
+            intrinsics: Mapping read for the keys "fx", "fy", "cx" and "cy",
+                or None.
+
+        Returns:
+            3x3 float32 array laid out as [[fx, 0, cx], [0, fy, cy], [0, 0, 1]],
+            or None when `intrinsics` is None or any of the four keys is
+            absent or maps to None.
+        """
+        if intrinsics is None:
+            return None
+        fx = intrinsics.get("fx")  # .get() yields None for a missing key
+        fy = intrinsics.get("fy")
+        cx = intrinsics.get("cx")
+        cy = intrinsics.get("cy")
+        if None in [fx, fy, cx, cy]:  # any missing/None entry means no matrix can be built
+            return None
+        K = np.array([
+            [fx, 0, cx],
+            [0, fy, cy],
+            [0, 0, 1]
+        ], dtype=np.float32)
+        return K
+    def _format_pose_output(self, pose_matrix: np.ndarray) -> Dict:
+        """Convert 4x4 pose matrix to output format.
+
+        The translation is read from the last column and the rotation from the
+        upper-left 3x3 block, which is converted to a quaternion. The rotation
+        block is not validated or re-normalised here (assumes it is a proper
+        rotation matrix - TODO confirm against the estimator's output).
+
+        Args:
+            pose_matrix: 4x4 transformation matrix
+        Returns:
+            Dictionary with keys:
+              "position": {"x", "y", "z"} floats from the translation column
+              "orientation": {"w", "x", "y", "z"} quaternion components as floats
+              "confidence": always 1.0
+              "pose_matrix": the input matrix converted with .tolist()
+        """
+        # Translation: first three rows of the last column
+        translation = pose_matrix[:3, 3]
+        # Rotation: upper-left 3x3 block
+        rotation_matrix = pose_matrix[:3, :3]
+        # Convert rotation matrix to quaternion
+        # Using Shepperd's method for numerical stability: the branch is chosen from the trace / largest diagonal entry, and s is 4x the component computed directly in that branch
+        trace = np.trace(rotation_matrix)
+        if trace > 0:  # s = 4 * w
+            s = np.sqrt(trace + 1.0) * 2
+            w = 0.25 * s
+            x = (rotation_matrix[2, 1] - rotation_matrix[1, 2]) / s
+            y = (rotation_matrix[0, 2] - rotation_matrix[2, 0]) / s
+            z = (rotation_matrix[1, 0] - rotation_matrix[0, 1]) / s
+        elif rotation_matrix[0, 0] > rotation_matrix[1, 1] and rotation_matrix[0, 0] > rotation_matrix[2, 2]:  # R[0, 0] is the largest diagonal entry: s = 4 * x
+            s = np.sqrt(1.0 + rotation_matrix[0, 0] - rotation_matrix[1, 1] - rotation_matrix[2, 2]) * 2
+            w = (rotation_matrix[2, 1] - rotation_matrix[1, 2]) / s
+            x = 0.25 * s
+            y = (rotation_matrix[0, 1] + rotation_matrix[1, 0]) / s
+            z = (rotation_matrix[0, 2] + rotation_matrix[2, 0]) / s
+        elif rotation_matrix[1, 1] > rotation_matrix[2, 2]:  # R[1, 1] exceeds R[2, 2] (R[0, 0] case already ruled out): s = 4 * y
+            s = np.sqrt(1.0 + rotation_matrix[1, 1] - rotation_matrix[0, 0] - rotation_matrix[2, 2]) * 2
+            w = (rotation_matrix[0, 2] - rotation_matrix[2, 0]) / s
+            x = (rotation_matrix[0, 1] + rotation_matrix[1, 0]) / s
+            y = 0.25 * s
+            z = (rotation_matrix[1, 2] + rotation_matrix[2, 1]) / s
+        else:  # remaining case, computed from R[2, 2]: s = 4 * z
+            s = np.sqrt(1.0 + rotation_matrix[2, 2] - rotation_matrix[0, 0] - rotation_matrix[1, 1]) * 2
+            w = (rotation_matrix[1, 0] - rotation_matrix[0, 1]) / s
+            x = (rotation_matrix[0, 2] + rotation_matrix[2, 0]) / s
+            y = (rotation_matrix[1, 2] + rotation_matrix[2, 1]) / s
+            z = 0.25 * s
+        return {
+            "position": {
+                "x": float(translation[0]),
+                "y": float(translation[1]),
+                "z": float(translation[2])
+            },
+            "orientation": {
+                "w": float(w),
+                "x": float(x),
+                "y": float(y),
+                "z": float(z)
+            },
+            "confidence": 1.0,  # constant placeholder: FoundationPose doesn't provide explicit confidence
+            "pose_matrix": pose_matrix.tolist()  # full input matrix as nested Python lists
+        }