Spaces:

dev-bjoern
/

sam3d-objects-mcp

Running on Zero

App Files Community

dev-bjoern committed on Dec 8, 2025

Commit

cd9cd46

1 Parent(s): c82fe65

Auto object detection: SAM3 finds objects automatically, no click needed

Browse files

Files changed (1) hide show

app.py +55 -173

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """
 SAM 3D Objects MCP Server
-Image + Text/Click → 3D Object (GLB)
-Uses SAM3 for segmentation and SAM 3D Objects for 3D reconstruction.
 """
 import os
 import sys
@@ -36,27 +36,29 @@ sys.path.insert(0, str(SAM3D_PATH))
 # Global models
 SAM3D_MODEL = None
-SAM3_PREDICTOR = None
 def load_sam3():
-    """Load SAM3 for segmentation"""
-    global SAM3_PREDICTOR
-    if SAM3_PREDICTOR is not None:
-        return SAM3_PREDICTOR
     import torch
-    from sam3.model_builder import build_sam3_image_model
-    from sam3.model.sam3_image_processor import Sam3Processor
     print("Loading SAM3 model...")
-    model = build_sam3_image_model()
-    SAM3_PREDICTOR = Sam3Processor(model)
     print("✓ SAM3 loaded")
-    return SAM3_PREDICTOR
 def load_sam3d():
@@ -83,142 +85,58 @@ def load_sam3d():
     return SAM3D_MODEL
-@spaces.GPU(duration=60)
-def segment_with_text(image: np.ndarray, text_prompt: str):
-    """Segment object using text prompt with SAM3"""
-    if image is None:
-        return None, None, "❌ No image provided"
-    if not text_prompt:
-        return None, None, "❌ No text prompt provided"
-    try:
-        from PIL import Image as PILImage
-        processor = load_sam3()
-        # Convert to PIL
-        if isinstance(image, np.ndarray):
-            pil_image = PILImage.fromarray(image)
-        else:
-            pil_image = image
-        # Run SAM3 with text prompt
-        state = processor.set_image(pil_image)
-        output = processor.set_text_prompt(state=state, prompt=text_prompt)
-        if output is None or "masks" not in output:
-            return image, None, "⚠️ No object found"
-        masks = output["masks"]
-        scores = output.get("scores", [1.0])
-        if len(masks) == 0:
-            return image, None, "⚠️ No object found"
-        # Use best mask
-        best_idx = np.argmax(scores) if len(scores) > 0 else 0
-        mask = np.array(masks[best_idx])
-        # Create overlay
-        overlay = image.copy()
-        overlay[mask > 0] = (overlay[mask > 0] * 0.5 + np.array([0, 255, 0]) * 0.5).astype(np.uint8)
-        return overlay, (mask > 0).astype(np.uint8) * 255, f"✓ Found: {text_prompt}"
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return image, None, f"❌ Error: {e}"
-def handle_click(image, evt: gr.SelectData):
-    """Handle click event and extract coordinates"""
-    if image is None or evt is None:
-        return None, None, None, "❌ Click on an image first"
-    # Store coordinates and pass to GPU function
-    x, y = evt.index[0], evt.index[1]
-    return image, x, y, "Processing..."
-@spaces.GPU(duration=60)
-def segment_with_point(image: np.ndarray, x: int, y: int):
-    """Segment object at point with SAM3"""
     if image is None:
         return None, None, "❌ No image provided"
-    if x is None or y is None:
-        return None, None, "❌ No point selected"
     try:
         from PIL import Image as PILImage
-        processor = load_sam3()
-        # Convert to PIL
         if isinstance(image, np.ndarray):
             pil_image = PILImage.fromarray(image)
         else:
             pil_image = image
-        # Run SAM3 with point prompt
-        state = processor.set_image(pil_image)
-        output = processor.set_point_prompt(state=state, points=[[x, y]], labels=[1])
-        if output is None or "masks" not in output:
-            return image, None, "⚠️ No object found"
-        masks = output["masks"]
-        scores = output.get("scores", [1.0])
-        if len(masks) == 0:
-            return image, None, "⚠️ No object found"
-        # Use best mask
-        best_idx = np.argmax(scores) if len(scores) > 0 else 0
-        mask = np.array(masks[best_idx])
-        # Create overlay
-        overlay = image.copy()
-        overlay[mask > 0] = (overlay[mask > 0] * 0.5 + np.array([0, 255, 0]) * 0.5).astype(np.uint8)
-        return overlay, (mask > 0).astype(np.uint8) * 255, "✓ Object selected"
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return image, None, f"❌ Error: {e}"
-@spaces.GPU(duration=120)
-def reconstruct_3d(image: np.ndarray, mask: np.ndarray):
-    """
-    Reconstruct 3D object from image and mask.
-    Args:
-        image: Input RGB image
-        mask: Binary mask from SAM3
-    Returns:
-        tuple: (glb_path, status)
-    """
-    if image is None:
-        return None, "❌ No image provided"
-    if mask is None:
-        return None, "❌ No mask - segment object first"
-    try:
-        import torch
-        import trimesh
-        model = load_sam3d()
-        # Ensure mask is binary
-        if len(mask.shape) == 3:
-            mask = mask[:, :, 0]
-        mask = (mask > 127).astype(np.uint8)
-        # Run 3D reconstruction
-        outputs = model.predict(image, mask)
         if outputs is None:
-            return None, "⚠️ Reconstruction failed"
         # Export as GLB
         output_dir = tempfile.mkdtemp()
@@ -231,12 +149,12 @@ def reconstruct_3d(image: np.ndarray, mask: np.ndarray):
         cloud = trimesh.PointCloud(vertices)
         cloud.export(glb_path, file_type='glb')
-        return glb_path, f"✓ Reconstructed ({len(vertices)} points)"
     except Exception as e:
         import traceback
         traceback.print_exc()
-        return None, f"❌ Error: {e}"
 # Gradio Interface
@@ -245,64 +163,28 @@ with gr.Blocks(title="SAM 3D Objects MCP") as demo:
     # 📦 SAM 3D Objects MCP Server
     **Image → 3D Object (GLB)**
-    1. Upload image
-    2. Segment: Type what to select OR click on object
-    3. Reconstruct 3D
     """)
-    mask_state = gr.State(None)
-    click_x = gr.State(None)
-    click_y = gr.State(None)
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(label="Input Image", type="numpy")
-            with gr.Row():
-                text_prompt = gr.Textbox(
-                    label="Text Prompt",
-                    placeholder="e.g. 'the chair', 'red car', 'coffee mug'",
-                    scale=3
-                )
-                segment_btn = gr.Button("🎯 Segment", scale=1)
-            gr.Markdown("*Or click directly on the object in the image*")
         with gr.Column():
-            preview = gr.Image(label="Segmentation Preview", type="numpy", interactive=False)
             status = gr.Textbox(label="Status")
-    with gr.Row():
-        reconstruct_btn = gr.Button("🚀 Reconstruct 3D", variant="primary", size="lg")
     with gr.Row():
         with gr.Column():
             output_model = gr.Model3D(label="3D Preview")
         with gr.Column():
             output_file = gr.File(label="Download GLB")
-    # Events
-    segment_btn.click(
-        segment_with_text,
-        inputs=[input_image, text_prompt],
-        outputs=[preview, mask_state, status]
-    )
-    # Click handler: first extract coordinates (no GPU), then segment (GPU)
-    input_image.select(
-        handle_click,
         inputs=[input_image],
-        outputs=[input_image, click_x, click_y, status]
-    ).then(
-        segment_with_point,
-        inputs=[input_image, click_x, click_y],
-        outputs=[preview, mask_state, status]
-    )
-    reconstruct_btn.click(
-        reconstruct_3d,
-        inputs=[input_image, mask_state],
-        outputs=[output_model, status]
     )
     output_model.change(lambda x: x, inputs=[output_model], outputs=[output_file])

 """
 SAM 3D Objects MCP Server
+Image → 3D Object (GLB)
+Automatic object detection with SAM3 + 3D reconstruction with SAM 3D Objects.
 """
 import os
 import sys
 # Global models
 SAM3D_MODEL = None
+SAM3_GENERATOR = None
 def load_sam3():
+    """Load SAM3 automatic mask generator"""
+    global SAM3_GENERATOR
+    if SAM3_GENERATOR is not None:
+        return SAM3_GENERATOR
     import torch
+    from sam3.automatic_mask_generator import SAM3AutomaticMaskGenerator
+    from sam3.model_builder import build_sam3
     print("Loading SAM3 model...")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    sam3_model = build_sam3(device=device)
+    SAM3_GENERATOR = SAM3AutomaticMaskGenerator(sam3_model)
     print("✓ SAM3 loaded")
+    return SAM3_GENERATOR
 def load_sam3d():
     return SAM3D_MODEL
+@spaces.GPU(duration=120)
+def reconstruct_objects(image: np.ndarray):
+    """
+    Automatically detect and reconstruct 3D objects from image.
+    Args:
+        image: Input RGB image
+    Returns:
+        tuple: (glb_path, preview_image, status)
+    """
     if image is None:
         return None, None, "❌ No image provided"
     try:
+        import torch
+        import trimesh
         from PIL import Image as PILImage
+        # Load models
+        generator = load_sam3()
+        sam3d = load_sam3d()
+        # Convert to PIL if needed
         if isinstance(image, np.ndarray):
             pil_image = PILImage.fromarray(image)
         else:
             pil_image = image
+            image = np.array(pil_image)
+        # Auto-detect all objects
+        print("Detecting objects...")
+        masks = generator.generate(pil_image)
+        if not masks or len(masks) == 0:
+            return None, image, "⚠️ No objects detected"
+        # Sort by area, take largest object
+        masks = sorted(masks, key=lambda x: x['area'], reverse=True)
+        best_mask = masks[0]['segmentation']
+        # Create preview with mask overlay
+        preview = image.copy()
+        preview[best_mask] = (preview[best_mask] * 0.5 + np.array([0, 255, 0]) * 0.5).astype(np.uint8)
+        # Run 3D reconstruction on largest object
+        print("Reconstructing 3D...")
+        mask_uint8 = best_mask.astype(np.uint8)
+        outputs = sam3d.predict(image, mask_uint8)
         if outputs is None:
+            return None, preview, "⚠️ 3D reconstruction failed"
         # Export as GLB
         output_dir = tempfile.mkdtemp()
         cloud = trimesh.PointCloud(vertices)
         cloud.export(glb_path, file_type='glb')
+        return glb_path, preview, f"✓ Detected {len(masks)} objects, reconstructed largest ({len(vertices)} points)"
     except Exception as e:
         import traceback
         traceback.print_exc()
+        return None, None, f"❌ Error: {e}"
 # Gradio Interface
     # 📦 SAM 3D Objects MCP Server
     **Image → 3D Object (GLB)**
+    Automatically detects objects and reconstructs the largest one in 3D.
     """)
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(label="Input Image", type="numpy")
+            btn = gr.Button("🚀 Detect & Reconstruct", variant="primary", size="lg")
         with gr.Column():
+            preview = gr.Image(label="Detected Object", type="numpy", interactive=False)
             status = gr.Textbox(label="Status")
     with gr.Row():
         with gr.Column():
             output_model = gr.Model3D(label="3D Preview")
         with gr.Column():
             output_file = gr.File(label="Download GLB")
+    btn.click(
+        reconstruct_objects,
         inputs=[input_image],
+        outputs=[output_model, preview, status]
     )
     output_model.change(lambda x: x, inputs=[output_model], outputs=[output_file])