Spaces:

dev-bjoern
/

sam3d-objects-mcp

Running on Zero

App Files Community

dev-bjoern committed on Dec 8, 2025

Commit

1ed02fd

1 Parent(s): af69327

SAM3 text/click segmentation + SAM 3D Objects reconstruction

Browse files

Files changed (1) hide show

app.py +110 -72

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 """
 SAM 3D Objects MCP Server
-Image + Click → 3D Object (GLB)
 """
 import os
 import sys
@@ -30,39 +32,38 @@ if not SAM3D_PATH.exists():
     ], check=True)
     sys.path.insert(0, str(SAM3D_PATH))
-# Add to path
 sys.path.insert(0, str(SAM3D_PATH))
 # Global models
 SAM3D_MODEL = None
-SAM_PREDICTOR = None
-def load_sam_model():
-    """Load SAM3 model for segmentation"""
-    global SAM_PREDICTOR
-    if SAM_PREDICTOR is not None:
-        return SAM_PREDICTOR
     import torch
-    from sam3 import SAM3ImagePredictor
     print("Loading SAM3 model...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    SAM_PREDICTOR = SAM3ImagePredictor.from_pretrained(
         "facebook/sam3-hiera-large",
         device=device,
         token=os.environ.get("HF_TOKEN")
     )
-    print("✓ SAM3 model loaded")
-    return SAM_PREDICTOR
-def load_sam3d_model():
     """Load SAM 3D Objects model"""
     global SAM3D_MODEL
@@ -72,7 +73,6 @@ def load_sam3d_model():
     import torch
     print("Loading SAM 3D Objects model...")
-    # Download checkpoint
     checkpoint_dir = snapshot_download(
         repo_id="facebook/sam-3d-objects",
         token=os.environ.get("HF_TOKEN")
@@ -81,39 +81,60 @@ def load_sam3d_model():
     from sam_3d_objects import Sam3dObjects
     device = "cuda" if torch.cuda.is_available() else "cpu"
     SAM3D_MODEL = Sam3dObjects.from_pretrained(checkpoint_dir, device=device)
-    print("✓ SAM 3D Objects model loaded")
     return SAM3D_MODEL
 @spaces.GPU(duration=60)
-def segment_object(image: np.ndarray, evt: gr.SelectData) -> np.ndarray:
-    """
-    Segment object at clicked point using SAM2.
-    Args:
-        image: Input RGB image
-        evt: Click event with coordinates
-    Returns:
-        Image with mask overlay
-    """
     if image is None:
-        return None
     try:
-        predictor = load_sam_model()
         # Get click coordinates
         point = np.array([[evt.index[0], evt.index[1]]])
-        label = np.array([1])  # 1 = foreground
-        # Set image
         predictor.set_image(image)
-        # Predict mask
         masks, scores, _ = predictor.predict(
             point_coords=point,
             point_labels=label,
@@ -121,28 +142,29 @@ def segment_object(image: np.ndarray, evt: gr.SelectData) -> np.ndarray:
         )
         # Use best mask
-        best_mask = masks[np.argmax(scores)]
         # Create overlay
         overlay = image.copy()
-        overlay[best_mask] = overlay[best_mask] * 0.5 + np.array([0, 255, 0]) * 0.5
-        return overlay, best_mask.astype(np.uint8) * 255
     except Exception as e:
         import traceback
         traceback.print_exc()
-        return image, None
 @spaces.GPU(duration=120)
-def reconstruct_object(image: np.ndarray, mask: np.ndarray) -> tuple:
     """
     Reconstruct 3D object from image and mask.
     Args:
         image: Input RGB image
-        mask: Binary mask indicating object region
     Returns:
         tuple: (glb_path, status)
@@ -150,46 +172,37 @@ def reconstruct_object(image: np.ndarray, mask: np.ndarray) -> tuple:
     if image is None:
         return None, "❌ No image provided"
     if mask is None:
-        return None, "❌ No mask provided - click on object first"
     try:
         import torch
         import trimesh
-        model = load_sam3d_model()
-        # Process image
-        if isinstance(image, Image.Image):
-            image = np.array(image)
-        # Process mask
-        if isinstance(mask, Image.Image):
-            mask = np.array(mask)
-        # Convert mask to binary if needed
         if len(mask.shape) == 3:
             mask = mask[:, :, 0]
         mask = (mask > 127).astype(np.uint8)
-        # Run inference
         outputs = model.predict(image, mask)
         if outputs is None:
             return None, "⚠️ Reconstruction failed"
-        # Export as GLB via trimesh
         output_dir = tempfile.mkdtemp()
         glb_path = f"{output_dir}/object_{uuid.uuid4().hex[:8]}.glb"
-        # Get vertices and faces from gaussian splat
-        # Convert to mesh and export as GLB
         vertices = outputs.get_xyz().cpu().numpy()
-        # Create point cloud mesh (gaussian splats don't have faces directly)
-        # We'll export as a point cloud GLB
         cloud = trimesh.PointCloud(vertices)
         cloud.export(glb_path, file_type='glb')
-        return glb_path, f"✓ Object reconstructed ({len(vertices)} points)"
     except Exception as e:
         import traceback
@@ -199,39 +212,57 @@ def reconstruct_object(image: np.ndarray, mask: np.ndarray) -> tuple:
 # Gradio Interface
 with gr.Blocks(title="SAM 3D Objects MCP") as demo:
-    gr.Markdown("# 📦 SAM 3D Objects MCP Server\n**Click on object → 3D Reconstruction (GLB)**")
-    # State for mask
     mask_state = gr.State(None)
     with gr.Row():
         with gr.Column():
-            input_image = gr.Image(label="Input Image (click on object)", type="numpy")
-            gr.Markdown("*Click on the object you want to reconstruct*")
         with gr.Column():
-            preview_image = gr.Image(label="Segmentation Preview", type="numpy", interactive=False)
     with gr.Row():
-        btn = gr.Button("🎯 Reconstruct 3D", variant="primary", size="lg")
     with gr.Row():
-        with gr.Column():
-            output_model = gr.Model3D(label="3D Object")
-            output_file = gr.File(label="Download GLB")
-        with gr.Column():
-            status = gr.Textbox(label="Status")
-    # Click to segment
     input_image.select(
-        segment_object,
         inputs=[input_image],
-        outputs=[preview_image, mask_state]
     )
-    # Reconstruct
-    btn.click(
-        reconstruct_object,
         inputs=[input_image, mask_state],
         outputs=[output_file, status]
     )
@@ -240,7 +271,14 @@ with gr.Blocks(title="SAM 3D Objects MCP") as demo:
     ---
     ### MCP Server
     ```json
-    {"mcpServers": {"sam3d-objects": {"command": "npx", "args": ["mcp-remote", "URL/gradio_api/mcp/sse"]}}}
     ```
     """)

 """
 SAM 3D Objects MCP Server
+Image + Text/Click → 3D Object (GLB)
+Uses SAM3 for segmentation and SAM 3D Objects for 3D reconstruction.
 """
 import os
 import sys
     ], check=True)
     sys.path.insert(0, str(SAM3D_PATH))
 sys.path.insert(0, str(SAM3D_PATH))
 # Global models
 SAM3D_MODEL = None
+SAM3_PREDICTOR = None
+def load_sam3():
+    """Load SAM3 for segmentation"""
+    global SAM3_PREDICTOR
+    if SAM3_PREDICTOR is not None:
+        return SAM3_PREDICTOR
     import torch
+    from sam3 import SAM3Predictor
     print("Loading SAM3 model...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    SAM3_PREDICTOR = SAM3Predictor.from_pretrained(
         "facebook/sam3-hiera-large",
         device=device,
         token=os.environ.get("HF_TOKEN")
     )
+    print("✓ SAM3 loaded")
+    return SAM3_PREDICTOR
+def load_sam3d():
     """Load SAM 3D Objects model"""
     global SAM3D_MODEL
     import torch
     print("Loading SAM 3D Objects model...")
     checkpoint_dir = snapshot_download(
         repo_id="facebook/sam-3d-objects",
         token=os.environ.get("HF_TOKEN")
     from sam_3d_objects import Sam3dObjects
     device = "cuda" if torch.cuda.is_available() else "cpu"
     SAM3D_MODEL = Sam3dObjects.from_pretrained(checkpoint_dir, device=device)
+    print("✓ SAM 3D Objects loaded")
     return SAM3D_MODEL
 @spaces.GPU(duration=60)
+def segment_with_text(image: np.ndarray, text_prompt: str):
+    """Segment object using text prompt with SAM3"""
+    if image is None:
+        return None, None, "❌ No image provided"
+    if not text_prompt:
+        return None, None, "❌ No text prompt provided"
+    try:
+        predictor = load_sam3()
+        # Run SAM3 with text prompt
+        predictor.set_image(image)
+        masks, scores, _ = predictor.predict(text=text_prompt)
+        if masks is None or len(masks) == 0:
+            return image, None, "⚠️ No object found"
+        # Use best mask
+        best_idx = np.argmax(scores)
+        mask = masks[best_idx]
+        # Create overlay
+        overlay = image.copy()
+        overlay[mask] = (overlay[mask] * 0.5 + np.array([0, 255, 0]) * 0.5).astype(np.uint8)
+        return overlay, mask.astype(np.uint8) * 255, f"✓ Found: {text_prompt}"
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return image, None, f"❌ Error: {e}"
+@spaces.GPU(duration=60)
+def segment_with_click(image: np.ndarray, evt: gr.SelectData):
+    """Segment object at clicked point with SAM3"""
     if image is None:
+        return None, None, "❌ No image provided"
     try:
+        predictor = load_sam3()
         # Get click coordinates
         point = np.array([[evt.index[0], evt.index[1]]])
+        label = np.array([1])  # foreground
         predictor.set_image(image)
         masks, scores, _ = predictor.predict(
             point_coords=point,
             point_labels=label,
         )
         # Use best mask
+        best_idx = np.argmax(scores)
+        mask = masks[best_idx]
         # Create overlay
         overlay = image.copy()
+        overlay[mask] = (overlay[mask] * 0.5 + np.array([0, 255, 0]) * 0.5).astype(np.uint8)
+        return overlay, mask.astype(np.uint8) * 255, "✓ Object selected"
     except Exception as e:
         import traceback
         traceback.print_exc()
+        return image, None, f"❌ Error: {e}"
 @spaces.GPU(duration=120)
+def reconstruct_3d(image: np.ndarray, mask: np.ndarray):
     """
     Reconstruct 3D object from image and mask.
     Args:
         image: Input RGB image
+        mask: Binary mask from SAM3
     Returns:
         tuple: (glb_path, status)
     if image is None:
         return None, "❌ No image provided"
     if mask is None:
+        return None, "❌ No mask - segment object first"
     try:
         import torch
         import trimesh
+        model = load_sam3d()
+        # Ensure mask is binary
         if len(mask.shape) == 3:
             mask = mask[:, :, 0]
         mask = (mask > 127).astype(np.uint8)
+        # Run 3D reconstruction
         outputs = model.predict(image, mask)
         if outputs is None:
             return None, "⚠️ Reconstruction failed"
+        # Export as GLB
         output_dir = tempfile.mkdtemp()
         glb_path = f"{output_dir}/object_{uuid.uuid4().hex[:8]}.glb"
+        # Get vertices from gaussian splat
         vertices = outputs.get_xyz().cpu().numpy()
+        # Export as point cloud GLB
         cloud = trimesh.PointCloud(vertices)
         cloud.export(glb_path, file_type='glb')
+        return glb_path, f"✓ Reconstructed ({len(vertices)} points)"
     except Exception as e:
         import traceback
 # Gradio Interface
 with gr.Blocks(title="SAM 3D Objects MCP") as demo:
+    gr.Markdown("""
+    # 📦 SAM 3D Objects MCP Server
+    **Image → 3D Object (GLB)**
+    1. Upload image
+    2. Segment: Type what to select OR click on object
+    3. Reconstruct 3D
+    """)
     mask_state = gr.State(None)
     with gr.Row():
         with gr.Column():
+            input_image = gr.Image(label="Input Image", type="numpy")
+            with gr.Row():
+                text_prompt = gr.Textbox(
+                    label="Text Prompt",
+                    placeholder="e.g. 'the chair', 'red car', 'coffee mug'",
+                    scale=3
+                )
+                segment_btn = gr.Button("🎯 Segment", scale=1)
+            gr.Markdown("*Or click directly on the object in the image*")
         with gr.Column():
+            preview = gr.Image(label="Segmentation Preview", type="numpy", interactive=False)
+            status = gr.Textbox(label="Status")
     with gr.Row():
+        reconstruct_btn = gr.Button("🚀 Reconstruct 3D", variant="primary", size="lg")
     with gr.Row():
+        output_model = gr.Model3D(label="3D Preview")
+        output_file = gr.File(label="Download GLB")
+    # Events
+    segment_btn.click(
+        segment_with_text,
+        inputs=[input_image, text_prompt],
+        outputs=[preview, mask_state, status]
+    )
     input_image.select(
+        segment_with_click,
         inputs=[input_image],
+        outputs=[preview, mask_state, status]
     )
+    reconstruct_btn.click(
+        reconstruct_3d,
         inputs=[input_image, mask_state],
         outputs=[output_file, status]
     )
     ---
     ### MCP Server
     ```json
+    {
+      "mcpServers": {
+        "sam3d-objects": {
+          "command": "npx",
+          "args": ["mcp-remote", "https://dev-bjoern-sam3d-objects-mcp.hf.space/gradio_api/mcp/sse"]
+        }
+      }
+    }
     ```
     """)