Update app.py
app.py CHANGED
@@ -8,8 +8,11 @@ from PIL import Image, ImageDraw
 from typing import Iterable
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
-from transformers import Sam3Processor, Sam3Model
+from transformers import Sam3Processor, Sam3Model, Sam3VideoModel, Sam3VideoProcessor
+import cv2
+import tempfile
 
+# --- Theme Definition ---
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -72,22 +75,55 @@ class SteelBlueTheme(Soft):
 
 steel_blue_theme = SteelBlueTheme()
 
+# --- Model Loading ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 
+MODELS = {}
+
+def get_model(model_type):
+    if model_type not in MODELS:
+        if model_type == "sam3_image":
+            print("Loading SAM3 Image Model and Processor...")
+            model = Sam3Model.from_pretrained("facebook/sam3").to(device)
+            processor = Sam3Processor.from_pretrained("facebook/sam3")
+            MODELS[model_type] = (model, processor)
+        elif model_type == "sam3_video_text":
+            print("Loading SAM3 Video Model and Processor...")
+            model = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
+            processor = Sam3VideoProcessor.from_pretrained("facebook/sam3")
+            MODELS[model_type] = (model, processor)
+    return MODELS[model_type]
+
 try:
-
-    model = Sam3Model.from_pretrained("facebook/sam3").to(device)
-    processor = Sam3Processor.from_pretrained("facebook/sam3")
-    print("Model loaded successfully.")
-
+    get_model("sam3_image")
+    print("Image model loaded successfully.")
 except Exception as e:
-    print(f"Error loading model: {e}")
+    print(f"Error loading image model: {e}")
     print("Ensure you have the correct libraries installed and access to the model.")
-    # Fallback/Placeholder for demonstration if model doesn't exist in environment yet
-    model = None
-    processor = None
 
+# --- Helper Functions ---
+def overlay_masks(image, masks, alpha=0.5):
+    """ Overlays masks on the image with random colors. """
+    image = image.convert("RGBA")
+    overlay = Image.new("RGBA", image.size, (0, 0, 0, 0))
+    draw = ImageDraw.Draw(overlay)
+
+    for mask in masks:
+        # Generate a random color for each mask
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), int(255 * alpha))
+
+        # Convert boolean mask to an image that can be pasted
+        mask_pil = Image.fromarray((mask * 255).astype(np.uint8), mode='L')
+
+        # Draw the colored mask
+        draw.bitmap((0, 0), mask_pil, fill=color)
+
+    # Combine the original image with the overlay
+    combined = Image.alpha_composite(image, overlay)
+    return combined.convert("RGB")
+
+# --- Core Functions ---
 @spaces.GPU
 def segment_image(input_image, text_prompt, threshold=0.5):
     if input_image is None:
@@ -95,20 +131,17 @@ def segment_image(input_image, text_prompt, threshold=0.5):
     if not text_prompt:
         raise gr.Error("Please enter a text prompt (e.g., 'cat', 'face').")
 
-
-
+    try:
+        model, processor = get_model("sam3_image")
+    except Exception as e:
+        raise gr.Error(f"Model not loaded correctly: {e}")
 
-    # Convert image to RGB
     image_pil = input_image.convert("RGB")
-
-    # Preprocess
     inputs = processor(images=image_pil, text=text_prompt, return_tensors="pt").to(device)
 
-    # Inference
     with torch.no_grad():
         outputs = model(**inputs)
 
-    # Post-process results
     results = processor.post_process_instance_segmentation(
         outputs,
         threshold=threshold,
@@ -116,27 +149,67 @@ def segment_image(input_image, text_prompt, threshold=0.5):
         target_sizes=inputs.get("original_sizes").tolist()
     )[0]
 
-    masks = results['masks']
+    masks = results['masks']
     scores = results['scores']
 
-    # Prepare for Gradio AnnotatedImage
-    # Gradio expects (image, [(mask, label), ...])
-
     annotations = []
     masks_np = masks.cpu().numpy()
     scores_np = scores.cpu().numpy()
 
     for i, mask in enumerate(masks_np):
-        # mask is a boolean array (True/False).
-        # AnnotatedImage handles the coloring automatically.
-        # We just pass the mask and a label.
        score_val = scores_np[i]
         label = f"{text_prompt} ({score_val:.2f})"
         annotations.append((mask, label))
 
-    # Return tuple format for AnnotatedImage
     return (image_pil, annotations)
 
+def process_video_text(video_path, text_prompt, max_frames, timeout_seconds):
+    if not video_path or not text_prompt:
+        return None, "Missing video or prompt."
+    try:
+        model, processor = get_model("sam3_video_text")
+        cap = cv2.VideoCapture(video_path)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = []
+        frame_count = 0
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret or (max_frames > 0 and frame_count >= max_frames):
+                break
+            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            frame_count += 1
+        cap.release()
+
+        inference_session = processor.init_video_session(video=frames, inference_device=device, dtype=torch.bfloat16)
+        inference_session = processor.add_text_prompt(inference_session=inference_session, text=text_prompt)
+
+        output_path = tempfile.mktemp(suffix=".mp4")
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+        for model_outputs in model.propagate_in_video_iterator(inference_session=inference_session, max_frame_num_to_track=len(frames)):
+            processed_outputs = processor.postprocess_outputs(inference_session, model_outputs)
+            frame_idx = model_outputs.frame_idx
+            orig_frame = Image.fromarray(frames[frame_idx])
+
+            if 'masks' in processed_outputs:
+                masks = processed_outputs['masks']
+                if masks.ndim == 4:
+                    masks = masks.squeeze(1)
+                res_frame = overlay_masks(orig_frame, masks)
+            else:
+                res_frame = orig_frame
+
+            out.write(cv2.cvtColor(np.array(res_frame), cv2.COLOR_RGB2BGR))
+
+        out.release()
+        return output_path, "Done!"
+    except Exception as e:
+        return None, f"Error: {str(e)}"
+
+# --- Gradio UI ---
 css="""
 #col-container {
     margin: 0 auto;
@@ -148,40 +221,60 @@ css="""
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(
-            "# **SAM3 Image Segmentation**",
+            "# **SAM3 Image & Video Segmentation**",
            elem_id="main-title"
         )
 
-        gr.Markdown("Segment objects in images using **SAM3** (Segment Anything Model 3) with text prompts.")
-
-        with gr.
-            with gr.
-
-
-
-
-
-
-
+        gr.Markdown("Segment objects in images or videos using **SAM3** (Segment Anything Model 3) with text prompts.")
+
+        with gr.Tabs():
+            with gr.TabItem("Image Segmentation"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        input_image = gr.Image(label="Input Image", type="pil", height=300)
+                        text_prompt = gr.Textbox(
+                            label="Text Prompt",
+                            placeholder="e.g., cat, ear, car wheel...",
+                        )
+
+                        run_button = gr.Button("Segment Image", variant="primary")
 
-
-
+                    with gr.Column(scale=1.5):
+                        output_image = gr.AnnotatedImage(label="Segmented Output", height=380)
+
+                with gr.Row():
+                    threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.05)
 
+                gr.Examples(
+                    examples=[
+                        ["examples/player.jpg", "player in white", 0.5],
+                        ["examples/goldencat.webp", "black cat", 0.4],
+                        ["examples/taxi.jpg", "blue taxi", 0.5],
+                    ],
+                    inputs=[input_image, text_prompt, threshold],
+                    outputs=[output_image],
+                    fn=segment_image,
+                    cache_examples="lazy",
+                    label="Image Examples"
+                )
+
+            with gr.TabItem("Video Segmentation"):
                 with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    with gr.Column():
+                        input_video = gr.Video(label="Input Video", format="mp4")
+                        video_text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g.: person, car")
+                        max_frames_slider = gr.Slider(10, 1000, value=50, step=10, label="Max Frames to Process")
+                        processing_duration = gr.Radio([60, 120], value=60, label="Max Processing Time (seconds)", info="Choose 60s for short clips, 120s for complex tasks")
+                        start_video_segmentation_button = gr.Button("Start Video Segmentation", variant="primary")
+                    with gr.Column():
+                        output_video = gr.Video(label="Result Video")
+                        status_textbox = gr.Textbox(label="Status")
+
+        start_video_segmentation_button.click(
+            process_video_text,
+            [input_video, video_text_prompt, max_frames_slider, processing_duration],
+            [output_video, status_textbox]
+        )
 
         run_button.click(
             fn=segment_image,
@@ -190,4 +283,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(debug=True, show_error=True)
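
For a quick sanity check, the refactored image path can be exercised outside the Gradio UI, since get_model caches the loaded weights after the first call. A minimal sketch, assuming app.py's module-level setup has run (torch, transformers with SAM3 support, device); the file name, prompt, and threshold below are illustrative stand-ins, not part of the commit:

# Hypothetical smoke test for segment_image; "test.jpg" and "cat" are placeholders.
from PIL import Image

img = Image.open("test.jpg")
base_image, annotations = segment_image(img, "cat", threshold=0.4)
for mask, label in annotations:
    # Each annotation is a boolean mask plus a "prompt (score)" label,
    # matching the (image, [(mask, label), ...]) format gr.AnnotatedImage expects.
    print(label, mask.shape)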
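The new video path can be smoke-tested the same way. A sketch under the same assumptions; "clip.mp4" is a placeholder, and note that timeout_seconds (fed from the processing_duration radio in the UI) is accepted by this version of process_video_text but never used inside it:

# Hypothetical driver for process_video_text; "clip.mp4" is a placeholder.
out_path, status = process_video_text(
    "clip.mp4",  # input video path
    "person",    # text prompt tracked across frames
    50,          # max_frames, matching the slider default
    60,          # timeout_seconds: accepted but currently unenforced
)
print(status, out_path)  # "Done!" plus a temp .mp4 with mask overlays on success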