Spaces:

prithivMLmods
/

SAM3-Demo

Running on Zero

App Files Files Community

prithivMLmods committed on Nov 25, 2025

Commit

5a34082

verified ·

1 Parent(s): f17e9df

Update app.py

Browse files

Files changed (1) hide show

app.py +192 -75

app.py CHANGED Viewed

@@ -1,15 +1,43 @@
 import os
 import spaces
 import gradio as gr
 import numpy as np
 import torch
-import random
-from PIL import Image, ImageDraw
 from typing import Iterable
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 from transformers import Sam3Processor, Sam3Model
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -75,119 +103,208 @@ steel_blue_theme = SteelBlueTheme()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 try:
     print("Loading SAM3 Model and Processor...")
-    model = Sam3Model.from_pretrained("facebook/sam3").to(device)
-    processor = Sam3Processor.from_pretrained("facebook/sam3")
-    print("Model loaded successfully.")
 except Exception as e:
-    print(f"Error loading model: {e}")
-    print("Ensure you have the correct libraries installed and access to the model.")
-    # Fallback/Placeholder for demonstration if model doesn't exist in environment yet
-    model = None
-    processor = None
 @spaces.GPU
 def segment_image(input_image, text_prompt, threshold=0.5):
     if input_image is None:
         raise gr.Error("Please upload an image.")
     if not text_prompt:
-        raise gr.Error("Please enter a text prompt (e.g., 'cat', 'face').")
-    if model is None or processor is None:
-        raise gr.Error("Model not loaded correctly.")
-    # Convert image to RGB
     image_pil = input_image.convert("RGB")
-    # Preprocess
-    inputs = processor(images=image_pil, text=text_prompt, return_tensors="pt").to(device)
-    # Inference
     with torch.no_grad():
-        outputs = model(**inputs)
-    # Post-process results
-    results = processor.post_process_instance_segmentation(
         outputs,
         threshold=threshold,
         mask_threshold=0.5,
         target_sizes=inputs.get("original_sizes").tolist()
     )[0]
-    masks = results['masks'] # Boolean tensor [N, H, W]
-    scores = results['scores']
-    # Prepare for Gradio AnnotatedImage
-    # Gradio expects (image, [(mask, label), ...])
     annotations = []
-    masks_np = masks.cpu().numpy()
-    scores_np = scores.cpu().numpy()
-    for i, mask in enumerate(masks_np):
-        # mask is a boolean array (True/False).
-        # AnnotatedImage handles the coloring automatically.
-        # We just pass the mask and a label.
-        score_val = scores_np[i]
-        label = f"{text_prompt} ({score_val:.2f})"
         annotations.append((mask, label))
-    # Return tuple format for AnnotatedImage
     return (image_pil, annotations)
-css="""
 #col-container {
     margin: 0 auto;
-    max-width: 980px;
 }
-#main-title h1 {font-size: 2.1em !important;}
 """
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     with gr.Column(elem_id="col-container"):
-        gr.Markdown(
-            "# **SAM3 Image Segmentation**",
-            elem_id="main-title"
-        )
-        gr.Markdown("Segment objects in images using **SAM3** (Segment Anything Model 3) with text prompts.")
-        with gr.Row():
-            with gr.Column(scale=1):
-                input_image = gr.Image(label="Input Image", type="pil", height=300)
-                text_prompt = gr.Textbox(
-                    label="Text Prompt",
-                    placeholder="e.g., cat, ear, car wheel...",
                 )
-                run_button = gr.Button("Segment", variant="primary")
-            with gr.Column(scale=1.5):
-                output_image = gr.AnnotatedImage(label="Segmented Output", height=380)
                 with gr.Row():
-                    threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.05)
-        gr.Examples(
-            examples=[
-                ["examples/player.jpg", "player in white", 0.5],
-                ["examples/goldencat.webp", "black cat", 0.4],
-                ["examples/taxi.jpg", "blue taxi", 0.5],
-            ],
-            inputs=[input_image, text_prompt, threshold],
-            outputs=[output_image],
-            fn=segment_image,
-            cache_examples="lazy",
-            label="Examples"
-        )
-    run_button.click(
-        fn=segment_image,
-        inputs=[input_image, text_prompt, threshold],
-        outputs=[output_image]
-    )
 if __name__ == "__main__":
     demo.launch(mcp_server=True, ssr_mode=False, show_error=True)

 import os
+import sys
 import spaces
 import gradio as gr
 import numpy as np
 import torch
+import cv2
+import tempfile
+import shutil
+from PIL import Image
 from typing import Iterable
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 from transformers import Sam3Processor, Sam3Model
+# ---------------------------------------------------------
+# 1. SETUP PATHS & CUSTOM IMPORTS
+# ---------------------------------------------------------
+# Attempt to import the specific utils provided in your snippet
+try:
+    # Adjust path to find utils.py (assuming it's in parent dir based on your snippet)
+    parent_dir = os.path.dirname(os.getcwd())
+    if parent_dir not in sys.path:
+        sys.path.insert(0, parent_dir)
+    from utils import (
+        setup_sam_3d_body,
+        setup_visualizer,
+        visualize_2d_results,
+        visualize_3d_mesh,
+        save_mesh_results
+    )
+    SAM3D_AVAILABLE = True
+except ImportError as e:
+    print(f"Warning: SAM 3D Body utils not found ({e}). The 3D Body tab will use placeholder logic.")
+    SAM3D_AVAILABLE = False
+# ---------------------------------------------------------
+# 2. THEME DEFINITION
+# ---------------------------------------------------------
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
+# ---------------------------------------------------------
+# 3. MODEL LOADING
+# ---------------------------------------------------------
+# --- Load SAM3 (Segmentation) ---
+# Load the segmentation model and its processor once, at import time.
+# Any failure is logged and both globals are set to None; segment_image checks
+# for None and reports the problem to the user through gr.Error.
 try:
     print("Loading SAM3 Model and Processor...")
+    sam3_model = Sam3Model.from_pretrained("facebook/sam3").to(device)
+    sam3_processor = Sam3Processor.from_pretrained("facebook/sam3")
+    print("SAM3 Model loaded successfully.")
 except Exception as e:
+    print(f"Error loading SAM3 model: {e}")
+    # Reset both names so a half-finished load is never used.
+    sam3_model = None
+    sam3_processor = None
+# --- Load SAM 3D Body ---
+sam3d_estimator = None
+sam3d_visualizer = None
+if SAM3D_AVAILABLE:
+    try:
+        print("Loading SAM 3D Body Estimator...")
+        sam3d_estimator = setup_sam_3d_body(hf_repo_id="facebook/sam-3d-body-dinov3")
+        sam3d_visualizer = setup_visualizer()
+        print("SAM 3D Body Model loaded successfully.")
+    except Exception as e:
+        print(f"Error loading SAM 3D Body model: {e}")
+# ---------------------------------------------------------
+# 4. INFERENCE FUNCTIONS
+# ---------------------------------------------------------
 @spaces.GPU
 def segment_image(input_image, text_prompt, threshold=0.5):
+    """Function for Tab 1: SAM3 Segmentation"""
     if input_image is None:
         raise gr.Error("Please upload an image.")
     if not text_prompt:
+        raise gr.Error("Please enter a text prompt.")
+    if sam3_model is None or sam3_processor is None:
+        raise gr.Error("SAM3 Model not loaded correctly.")
     image_pil = input_image.convert("RGB")
+    inputs = sam3_processor(images=image_pil, text=text_prompt, return_tensors="pt").to(device)
     with torch.no_grad():
+        outputs = sam3_model(**inputs)
+    results = sam3_processor.post_process_instance_segmentation(
         outputs,
         threshold=threshold,
         mask_threshold=0.5,
         target_sizes=inputs.get("original_sizes").tolist()
     )[0]
+    masks = results['masks'].cpu().numpy()
+    scores = results['scores'].cpu().numpy()
     annotations = []
+    for i, mask in enumerate(masks):
+        label = f"{text_prompt} ({scores[i]:.2f})"
         annotations.append((mask, label))
     return (image_pil, annotations)
+@spaces.GPU
+def process_3d_body(input_image):
+    """Function for Tab 2: SAM 3D Body"""
+    if input_image is None:
+        raise gr.Error("Please upload an image.")
+    if not SAM3D_AVAILABLE or sam3d_estimator is None:
+        raise gr.Error("SAM 3D Body libraries or model not available.")
+    # Convert PIL to CV2 BGR
+    img_np = np.array(input_image.convert("RGB"))
+    img_cv2 = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
+    # Save temp image for the process_one_image function if it requires a path
+    # (Checking the snippet provided: outputs = estimator.process_one_image(image_path))
+    # We need a physical path.
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+        tmp_path = tmp_file.name
+        cv2.imwrite(tmp_path, img_cv2)
+    try:
+        # Run Inference
+        outputs = sam3d_estimator.process_one_image(tmp_path)
+        if not outputs:
+            return None, None, None, "No people detected."
+        # 1. Generate 2D Visualization
+        vis_results_2d = visualize_2d_results(img_cv2, outputs, sam3d_visualizer)
+        # Taking the first result if multiple people, or combine them
+        # Converting the first result to RGB for display
+        res_2d_rgb = cv2.cvtColor(vis_results_2d[0], cv2.COLOR_BGR2RGB) if vis_results_2d else img_np
+        # 2. Generate 3D Visualization (Overlay Image)
+        mesh_results_img = visualize_3d_mesh(img_cv2, outputs, sam3d_estimator.faces)
+        res_3d_overlay_rgb = cv2.cvtColor(mesh_results_img[0], cv2.COLOR_BGR2RGB) if mesh_results_img else img_np
+        # 3. Save PLY Mesh to temp directory for Gradio Model3D
+        # Create a unique temp dir
+        output_dir = tempfile.mkdtemp()
+        image_name = "person_mesh"
+        # This function saves .ply files
+        ply_files = save_mesh_results(img_cv2, outputs, sam3d_estimator.faces, output_dir, image_name)
+        ply_path = None
+        if ply_files:
+            ply_path = ply_files[0] # Return the first person's mesh
+        status = f"Detected {len(outputs)} person(s)."
+        return res_2d_rgb, res_3d_overlay_rgb, ply_path, status
+    finally:
+        # Cleanup input temp file
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+# ---------------------------------------------------------
+# 5. GRADIO UI LAYOUT
+# ---------------------------------------------------------
+css = """
 #col-container {
     margin: 0 auto;
+    max-width: 1200px;
 }
+#main-title h1 {font-size: 2.1em !important; text-align: center;}
 """
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     with gr.Column(elem_id="col-container"):
+        gr.Markdown("# **SAM Integrated Vision Suite**", elem_id="main-title")
+        with gr.Tabs():
+            # ================= TAB 1: SEGMENTATION =================
+            with gr.Tab("SAM3 Segmentation"):
+                gr.Markdown("Segment objects using **SAM3** with text prompts.")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        t1_input_image = gr.Image(label="Input Image", type="pil", height=350)
+                        t1_text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., cat, ear, car wheel...")
+                        t1_threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.05)
+                        t1_run_btn = gr.Button("Segment Image", variant="primary")
+                    with gr.Column(scale=1.5):
+                        t1_output_image = gr.AnnotatedImage(label="Segmented Output", height=450)
+                t1_run_btn.click(
+                    fn=segment_image,
+                    inputs=[t1_input_image, t1_text_prompt, t1_threshold],
+                    outputs=[t1_output_image]
                 )
+                gr.Examples(
+                    examples=[
+                        ["examples/player.jpg", "player", 0.5],
+                        ["examples/goldencat.webp", "cat", 0.4],
+                    ],
+                    inputs=[t1_input_image, t1_text_prompt, t1_threshold],
+                    label="Segmentation Examples"
+                )
+            # ================= TAB 2: 3D BODY =================
+            with gr.Tab("SAM 3D Body"):
+                gr.Markdown("Detect human bodies and reconstruct **3D Meshes**.")
                 with gr.Row():
+                    with gr.Column(scale=1):
+                        t2_input_image = gr.Image(label="Input Image", type="pil", height=350)
+                        t2_run_btn = gr.Button("Generate 3D Body", variant="primary")
+                        t2_status = gr.Textbox(label="Status", interactive=False)
+                    with gr.Column(scale=2):
+                        with gr.Row():
+                            t2_output_2d = gr.Image(label="2D Keypoints", type="numpy")
+                            t2_output_overlay = gr.Image(label="Mesh Overlay", type="numpy")
+                        t2_output_3d = gr.Model3D(
+                            label="Interactive 3D Mesh",
+                            clear_color=[0.0, 0.0, 0.0, 0.0],
+                            camera_position=[0, 0, 3]
+                        )
+                t2_run_btn.click(
+                    fn=process_3d_body,
+                    inputs=[t2_input_image],
+                    outputs=[t2_output_2d, t2_output_overlay, t2_output_3d, t2_status]
+                )
+                # Assuming examples exist in the folder
+                gr.Examples(
+                    examples=[["examples/player.jpg"], ["examples/dancing.jpg"]],
+                    inputs=[t2_input_image],
+                    label="3D Body Examples"
+                )
+# Start the Gradio app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch(mcp_server=True, ssr_mode=False, show_error=True)