Spaces:
Sleeping
Sleeping
Georg committed on
Commit ·
053c7f6
1
Parent(s): f7e2564
Optimized Docker build to fix OOM errors
Browse files- app.py +90 -4
- estimator.py +38 -24
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -16,6 +16,59 @@ import gradio as gr
|
|
| 16 |
import numpy as np
|
| 17 |
import torch
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
logging.basicConfig(
|
| 20 |
level=logging.INFO,
|
| 21 |
format="[%(asctime)s] %(levelname)s: %(message)s"
|
|
@@ -262,7 +315,16 @@ def gradio_initialize_model_free(object_id: str, reference_files: List, fx: floa
|
|
| 262 |
return f"Error: {str(e)}"
|
| 263 |
|
| 264 |
|
| 265 |
-
def gradio_estimate(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
"""Gradio wrapper for pose estimation."""
|
| 267 |
try:
|
| 268 |
if query_image is None:
|
|
@@ -304,12 +366,28 @@ def gradio_estimate(object_id: str, query_image: np.ndarray, depth_image: np.nda
|
|
| 304 |
"cy": cy
|
| 305 |
}
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
# Estimate pose
|
| 308 |
result = pose_estimator.estimate_pose(
|
| 309 |
object_id=object_id,
|
| 310 |
query_image=query_image,
|
| 311 |
depth_image=depth,
|
| 312 |
-
camera_intrinsics=camera_intrinsics
|
|
|
|
| 313 |
)
|
| 314 |
|
| 315 |
if not result.get("success"):
|
|
@@ -318,7 +396,8 @@ def gradio_estimate(object_id: str, query_image: np.ndarray, depth_image: np.nda
|
|
| 318 |
|
| 319 |
poses = result.get("poses", [])
|
| 320 |
note = result.get("note", "")
|
| 321 |
-
|
|
|
|
| 322 |
|
| 323 |
# Create mask visualization
|
| 324 |
mask_vis = None
|
|
@@ -524,6 +603,12 @@ with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as demo
|
|
| 524 |
type="numpy"
|
| 525 |
)
|
| 526 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
gr.Markdown("### Camera Intrinsics")
|
| 528 |
with gr.Row():
|
| 529 |
est_fx = gr.Number(label="fx (focal length x)", value=500.0)
|
|
@@ -545,7 +630,7 @@ with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as demo
|
|
| 545 |
|
| 546 |
est_button.click(
|
| 547 |
fn=gradio_estimate,
|
| 548 |
-
inputs=[est_object_id, est_query_image, est_depth_image, est_fx, est_fy, est_cx, est_cy],
|
| 549 |
outputs=[est_output, est_viz, est_mask]
|
| 550 |
)
|
| 551 |
|
|
@@ -573,6 +658,7 @@ with gr.Blocks(title="FoundationPose Inference", theme=gr.themes.Soft()) as demo
|
|
| 573 |
object_id="target_cube",
|
| 574 |
query_image=image,
|
| 575 |
fx=500.0, fy=500.0, cx=320.0, cy=240.0,
|
|
|
|
| 576 |
api_name="/gradio_estimate"
|
| 577 |
)
|
| 578 |
```
|
|
|
|
| 16 |
import numpy as np
|
| 17 |
import torch
|
| 18 |
|
| 19 |
+
from estimator import generate_naive_mask
|
| 20 |
+
|
| 21 |
+
_slimsam_model = None
|
| 22 |
+
_slimsam_processor = None
|
| 23 |
+
_slimsam_device = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _get_slimsam():
    """Lazy initializer for the SlimSAM segmentation model.

    Returns the cached ``(model, processor, device)`` triple. The checkpoint
    is downloaded and moved to GPU (when available) only on the first call,
    keeping module import and app startup cheap.
    """
    global _slimsam_model, _slimsam_processor, _slimsam_device

    # Fast path: everything already initialized.
    if _slimsam_model is not None and _slimsam_processor is not None:
        return _slimsam_model, _slimsam_processor, _slimsam_device

    # Deferred import so transformers is only paid for when SlimSAM is used.
    from transformers import SamModel, SamProcessor

    checkpoint = "nielsr/slimsam-50-uniform"
    _slimsam_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _slimsam_model = SamModel.from_pretrained(checkpoint).to(_slimsam_device)
    _slimsam_processor = SamProcessor.from_pretrained(checkpoint)
    logger.info("SlimSAM loaded on %s", _slimsam_device)

    return _slimsam_model, _slimsam_processor, _slimsam_device
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _box_from_mask(mask_bool: np.ndarray) -> List[int]:
|
| 41 |
+
ys, xs = np.where(mask_bool)
|
| 42 |
+
if len(xs) == 0:
|
| 43 |
+
return [0, 0, mask_bool.shape[1] - 1, mask_bool.shape[0] - 1]
|
| 44 |
+
x0, x1 = int(xs.min()), int(xs.max())
|
| 45 |
+
y0, y1 = int(ys.min()), int(ys.max())
|
| 46 |
+
return [x0, y0, x1, y1]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def generate_slimsam_mask(rgb_image: np.ndarray, box_prompt: List[int]) -> tuple[np.ndarray, np.ndarray, float]:
    """Generate a SlimSAM segmentation mask guided by a box prompt.

    Args:
        rgb_image: Query image as an (H, W, 3) uint8 RGB array.
        box_prompt: [x0, y0, x1, y1] pixel-coordinate box hinting at the object.

    Returns:
        mask_bool: Boolean mask (H, W) for the highest-IoU candidate.
        debug_mask: uint8 (0/255) version of the mask for visualization.
        best_score: Predicted IoU score of the selected mask.
    """
    from PIL import Image

    model, processor, device = _get_slimsam()
    raw_image = Image.fromarray(rgb_image).convert("RGB")
    inputs = processor(raw_image, input_boxes=[[box_prompt]], return_tensors="pt").to(device)

    # Inference only: disable autograd so activations are not retained for a
    # backward pass — this reduces peak memory per request (the original code
    # ran the forward pass with gradient tracking enabled).
    with torch.no_grad():
        outputs = model(**inputs)

    masks = processor.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu(),
    )[0]

    # SAM emits several candidate masks; keep the one with the best IoU score.
    scores = outputs.iou_scores.squeeze().cpu()
    best_idx = int(scores.argmax().item())
    best_mask = masks[0, best_idx].numpy()
    best_score = float(scores[best_idx].item())

    mask_bool = best_mask.astype(bool)
    debug_mask = mask_bool.astype(np.uint8) * 255
    return mask_bool, debug_mask, best_score
|
| 71 |
+
|
| 72 |
logging.basicConfig(
|
| 73 |
level=logging.INFO,
|
| 74 |
format="[%(asctime)s] %(levelname)s: %(message)s"
|
|
|
|
| 315 |
return f"Error: {str(e)}"
|
| 316 |
|
| 317 |
|
| 318 |
+
def gradio_estimate(
|
| 319 |
+
object_id: str,
|
| 320 |
+
query_image: np.ndarray,
|
| 321 |
+
depth_image: np.ndarray,
|
| 322 |
+
fx: float,
|
| 323 |
+
fy: float,
|
| 324 |
+
cx: float,
|
| 325 |
+
cy: float,
|
| 326 |
+
mask_method: str
|
| 327 |
+
):
|
| 328 |
"""Gradio wrapper for pose estimation."""
|
| 329 |
try:
|
| 330 |
if query_image is None:
|
|
|
|
| 366 |
"cy": cy
|
| 367 |
}
|
| 368 |
|
| 369 |
+
# Choose mask method
|
| 370 |
+
mask = None
|
| 371 |
+
debug_mask = None
|
| 372 |
+
if mask_method == "SlimSAM":
|
| 373 |
+
# Use Otsu mask as a box prompt to guide SlimSAM
|
| 374 |
+
naive_mask, _, _, _ = generate_naive_mask(query_image)
|
| 375 |
+
box_prompt = _box_from_mask(naive_mask)
|
| 376 |
+
mask, debug_mask, score = generate_slimsam_mask(query_image, box_prompt)
|
| 377 |
+
logger.info("SlimSAM mask generated (score=%.3f, box=%s)", score, box_prompt)
|
| 378 |
+
elif mask_method == "Otsu":
|
| 379 |
+
mask, debug_mask, mask_percentage, fallback_full_image = generate_naive_mask(query_image)
|
| 380 |
+
logger.info("Otsu mask coverage %.1f%%", mask_percentage)
|
| 381 |
+
if fallback_full_image:
|
| 382 |
+
logger.warning("Otsu mask fallback to full image due to unrealistic coverage")
|
| 383 |
+
|
| 384 |
# Estimate pose
|
| 385 |
result = pose_estimator.estimate_pose(
|
| 386 |
object_id=object_id,
|
| 387 |
query_image=query_image,
|
| 388 |
depth_image=depth,
|
| 389 |
+
camera_intrinsics=camera_intrinsics,
|
| 390 |
+
mask=mask
|
| 391 |
)
|
| 392 |
|
| 393 |
if not result.get("success"):
|
|
|
|
| 396 |
|
| 397 |
poses = result.get("poses", [])
|
| 398 |
note = result.get("note", "")
|
| 399 |
+
if debug_mask is None:
|
| 400 |
+
debug_mask = result.get("debug_mask", None)
|
| 401 |
|
| 402 |
# Create mask visualization
|
| 403 |
mask_vis = None
|
|
|
|
| 603 |
type="numpy"
|
| 604 |
)
|
| 605 |
|
| 606 |
+
est_mask_method = gr.Radio(
|
| 607 |
+
choices=["SlimSAM", "Otsu"],
|
| 608 |
+
value="SlimSAM",
|
| 609 |
+
label="Mask Method"
|
| 610 |
+
)
|
| 611 |
+
|
| 612 |
gr.Markdown("### Camera Intrinsics")
|
| 613 |
with gr.Row():
|
| 614 |
est_fx = gr.Number(label="fx (focal length x)", value=500.0)
|
|
|
|
| 630 |
|
| 631 |
est_button.click(
|
| 632 |
fn=gradio_estimate,
|
| 633 |
+
inputs=[est_object_id, est_query_image, est_depth_image, est_fx, est_fy, est_cx, est_cy, est_mask_method],
|
| 634 |
outputs=[est_output, est_viz, est_mask]
|
| 635 |
)
|
| 636 |
|
|
|
|
| 658 |
object_id="target_cube",
|
| 659 |
query_image=image,
|
| 660 |
fx=500.0, fy=500.0, cx=320.0, cy=240.0,
|
| 661 |
+
mask_method="SlimSAM",
|
| 662 |
api_name="/gradio_estimate"
|
| 663 |
)
|
| 664 |
```
|
estimator.py
CHANGED
|
@@ -33,6 +33,39 @@ except ImportError as e:
|
|
| 33 |
FOUNDATIONPOSE_AVAILABLE = False
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
class FoundationPoseEstimator:
|
| 37 |
"""Wrapper for FoundationPose model."""
|
| 38 |
|
|
@@ -206,31 +239,12 @@ class FoundationPoseEstimator:
|
|
| 206 |
# Use automatic foreground segmentation based on brightness
|
| 207 |
# This works well for light objects on dark backgrounds
|
| 208 |
logger.info("Generating automatic object mask from image")
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
# Use Otsu's thresholding for automatic threshold selection
|
| 212 |
-
_, mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
| 213 |
-
|
| 214 |
-
# Clean up mask with morphological operations
|
| 215 |
-
kernel = np.ones((5, 5), np.uint8)
|
| 216 |
-
mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel) # Fill holes
|
| 217 |
-
mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel) # Remove noise
|
| 218 |
-
|
| 219 |
-
# Store visualization version (uint8) before converting to boolean
|
| 220 |
-
debug_mask = mask.copy()
|
| 221 |
-
|
| 222 |
-
# Convert to boolean
|
| 223 |
-
mask = mask.astype(bool)
|
| 224 |
-
|
| 225 |
-
# Log mask statistics
|
| 226 |
-
mask_percentage = (mask.sum() / mask.size) * 100
|
| 227 |
logger.info(f"Auto-generated mask covers {mask_percentage:.1f}% of image")
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
mask = np.ones((rgb_image.shape[0], rgb_image.shape[1]), dtype=bool)
|
| 233 |
-
debug_mask = np.ones((rgb_image.shape[0], rgb_image.shape[1]), dtype=np.uint8) * 255
|
| 234 |
|
| 235 |
mask_was_generated = True
|
| 236 |
|
|
|
|
| 33 |
FOUNDATIONPOSE_AVAILABLE = False
|
| 34 |
|
| 35 |
|
| 36 |
+
def generate_naive_mask(
    rgb_image: np.ndarray,
    min_percentage: float = 1.0,
    max_percentage: float = 90.0
) -> tuple[np.ndarray, np.ndarray, float, bool]:
    """Naive brightness-based foreground mask via Otsu thresholding.

    Args:
        rgb_image: (H, W, 3) RGB image.
        min_percentage: Coverage below this triggers the full-image fallback.
        max_percentage: Coverage above this triggers the full-image fallback.

    Returns:
        mask_bool: Boolean foreground mask (H, W).
        debug_mask: uint8 mask for visualization (H, W).
        mask_percentage: Percent of pixels active in the thresholded mask.
        fallback_full_image: True when coverage looked unrealistic and the
            mask was replaced by an all-ones full-image mask.
    """
    gray = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)
    _, raw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological close fills small holes; open removes speckle noise.
    kernel = np.ones((5, 5), np.uint8)
    cleaned = cv2.morphologyEx(raw, cv2.MORPH_CLOSE, kernel)
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

    debug_mask = cleaned.copy()
    mask_bool = cleaned.astype(bool)
    mask_percentage = (mask_bool.sum() / mask_bool.size) * 100

    # Coverage outside the plausible band means the threshold picked up the
    # whole frame or almost nothing — fall back to a full-image mask.
    fallback_full_image = not (min_percentage <= mask_percentage <= max_percentage)
    if fallback_full_image:
        height, width = rgb_image.shape[0], rgb_image.shape[1]
        mask_bool = np.ones((height, width), dtype=bool)
        debug_mask = np.full((height, width), 255, dtype=np.uint8)

    return mask_bool, debug_mask, mask_percentage, fallback_full_image
|
| 67 |
+
|
| 68 |
+
|
| 69 |
class FoundationPoseEstimator:
|
| 70 |
"""Wrapper for FoundationPose model."""
|
| 71 |
|
|
|
|
| 239 |
# Use automatic foreground segmentation based on brightness
|
| 240 |
# This works well for light objects on dark backgrounds
|
| 241 |
logger.info("Generating automatic object mask from image")
|
| 242 |
+
mask, debug_mask, mask_percentage, fallback_full_image = generate_naive_mask(rgb_image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
logger.info(f"Auto-generated mask covers {mask_percentage:.1f}% of image")
|
| 244 |
+
if fallback_full_image:
|
| 245 |
+
logger.warning(
|
| 246 |
+
f"Mask coverage ({mask_percentage:.1f}%) seems unrealistic, using full image"
|
| 247 |
+
)
|
|
|
|
|
|
|
| 248 |
|
| 249 |
mask_was_generated = True
|
| 250 |
|
requirements.txt
CHANGED
|
@@ -4,6 +4,8 @@ numpy>=1.24.0
|
|
| 4 |
opencv-python-headless>=4.8.0 # Headless version saves ~400MB
|
| 5 |
Pillow>=10.0.0
|
| 6 |
huggingface-hub>=0.20.0
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Note: torch and torchvision are installed separately with CUDA support
|
| 9 |
# Note: FoundationPose C++ extensions built at runtime
|
|
|
|
| 4 |
opencv-python-headless>=4.8.0 # Headless version saves ~400MB
|
| 5 |
Pillow>=10.0.0
|
| 6 |
huggingface-hub>=0.20.0
|
| 7 |
+
matplotlib>=3.8.0
|
| 8 |
+
transformers>=4.38.0
|
| 9 |
|
| 10 |
# Note: torch and torchvision are installed separately with CUDA support
|
| 11 |
# Note: FoundationPose C++ extensions built at runtime
|