prithivMLmods committed on
Commit bc27759 · verified · 1 Parent(s): 4b1c88a

Update app.py

Files changed (1):
  1. app.py +193 -174
app.py CHANGED
@@ -1,18 +1,23 @@
  import os
  import spaces
  import gradio as gr
  import numpy as np
  import torch
- import random
- from PIL import Image, ImageDraw
  from typing import Iterable
  from gradio.themes import Soft
  from gradio.themes.utils import colors, fonts, sizes
- from transformers import Sam3Processor, Sam3Model, Sam3VideoModel, Sam3VideoProcessor
- import cv2
- import tempfile

- # --- Theme Definition ---
  colors.steel_blue = colors.Color(
      name="steel_blue",
      c50="#EBF3F8",
@@ -28,7 +33,7 @@ colors.steel_blue = colors.Color(
      c950="#1E3450",
  )

- class SteelBlueTheme(Soft):
      def __init__(
          self,
          *,
@@ -73,215 +78,229 @@ class SteelBlueTheme(Soft):
              block_label_background_fill="*primary_200",
          )

- steel_blue_theme = SteelBlueTheme()

- # --- Model Loading ---
  device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Using device: {device}")
-
- MODELS = {}

- def get_model(model_type):
-     if model_type not in MODELS:
-         if model_type == "sam3_image":
-             print("Loading SAM3 Image Model and Processor...")
-             model = Sam3Model.from_pretrained("facebook/sam3").to(device)
-             processor = Sam3Processor.from_pretrained("facebook/sam3")
-             MODELS[model_type] = (model, processor)
-         elif model_type == "sam3_video_text":
-             print("Loading SAM3 Video Model and Processor...")
-             model = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
-             processor = Sam3VideoProcessor.from_pretrained("facebook/sam3")
-             MODELS[model_type] = (model, processor)
-     return MODELS[model_type]

- try:
-     get_model("sam3_image")
-     print("Image model loaded successfully.")
- except Exception as e:
-     print(f"Error loading image model: {e}")
-     print("Ensure you have the correct libraries installed and access to the model.")

- # --- Helper Functions ---
- def overlay_masks(image, masks, alpha=0.5):
-     """ Overlays masks on the image with random colors. """
-     image = image.convert("RGBA")
-     overlay = Image.new("RGBA", image.size, (0, 0, 0, 0))
-     draw = ImageDraw.Draw(overlay)
-
-     for mask in masks:
-         # Generate a random color for each mask
-         color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), int(255 * alpha))
-
-         # Convert boolean mask to an image that can be pasted
-         mask_pil = Image.fromarray((mask * 255).astype(np.uint8), mode='L')
-
-         # Draw the colored mask
-         draw.bitmap((0, 0), mask_pil, fill=color)
-
-     # Combine the original image with the overlay
-     combined = Image.alpha_composite(image, overlay)
-     return combined.convert("RGB")

- # --- Core Functions ---
  @spaces.GPU
- def segment_image(input_image, text_prompt, threshold=0.5):
-     if input_image is None:
-         raise gr.Error("Please upload an image.")
-     if not text_prompt:
-         raise gr.Error("Please enter a text prompt (e.g., 'cat', 'face').")

      try:
-         model, processor = get_model("sam3_image")
-     except Exception as e:
-         raise gr.Error(f"Model not loaded correctly: {e}")

-     image_pil = input_image.convert("RGB")
-     inputs = processor(images=image_pil, text=text_prompt, return_tensors="pt").to(device)

-     with torch.no_grad():
-         outputs = model(**inputs)

-     results = processor.post_process_instance_segmentation(
-         outputs,
-         threshold=threshold,
-         mask_threshold=0.5,
-         target_sizes=inputs.get("original_sizes").tolist()
-     )[0]

-     masks = results['masks']
-     scores = results['scores']
-
-     annotations = []
-     masks_np = masks.cpu().numpy()
-     scores_np = scores.cpu().numpy()
-
-     for i, mask in enumerate(masks_np):
-         score_val = scores_np[i]
-         label = f"{text_prompt} ({score_val:.2f})"
-         annotations.append((mask, label))
-
-     return (image_pil, annotations)

- @spaces.GPU
- def process_video_text(video_path, text_prompt, max_frames, timeout_seconds):
-     if not video_path or not text_prompt:
-         return None, "Missing video or prompt."
      try:
-         model, processor = get_model("sam3_video_text")
-         cap = cv2.VideoCapture(video_path)
-         fps = cap.get(cv2.CAP_PROP_FPS)
-         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-         frames = []
-         frame_count = 0
-         while cap.isOpened():
-             ret, frame = cap.read()
-             if not ret or (max_frames > 0 and frame_count >= max_frames):
-                 break
-             frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-             frame_count += 1
-         cap.release()
-
-         inference_session = processor.init_video_session(video=frames, inference_device=device, dtype=torch.bfloat16)
-         inference_session = processor.add_text_prompt(inference_session=inference_session, text=text_prompt)

-         output_path = tempfile.mktemp(suffix=".mp4")
-         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-         out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-
-         for model_outputs in model.propagate_in_video_iterator(inference_session=inference_session, max_frame_num_to_track=len(frames)):
-             processed_outputs = processor.postprocess_outputs(inference_session, model_outputs)
-             frame_idx = model_outputs.frame_idx
-             orig_frame = Image.fromarray(frames[frame_idx])

-             if 'masks' in processed_outputs:
-                 masks = processed_outputs['masks']
-                 if masks.ndim == 4:
-                     masks = masks.squeeze(1)
-                 res_frame = overlay_masks(orig_frame, masks)
-             else:
-                 res_frame = orig_frame

-             out.write(cv2.cvtColor(np.array(res_frame), cv2.COLOR_RGB2BGR))

-         out.release()
-         return output_path, "Done!"
      except Exception as e:
-         return None, f"Error: {str(e)}"

- # --- Gradio UI ---
- css="""
- #col-container {
-     margin: 0 auto;
-     max-width: 980px;
- }
- #main-title h1 {font-size: 2.1em !important;}
  """

- with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
      with gr.Column(elem_id="col-container"):
-         gr.Markdown(
-             "# **SAM3 Image & Video Segmentation**",
-             elem_id="main-title"
-         )
-
-         gr.Markdown("Segment objects in images or videos using **SAM3** (Segment Anything Model 3) with text prompts.")

          with gr.Tabs():
-             with gr.TabItem("Image Segmentation"):
                  with gr.Row():
                      with gr.Column(scale=1):
-                         input_image = gr.Image(label="Input Image", type="pil", height=300)
-                         text_prompt = gr.Textbox(
-                             label="Text Prompt",
-                             placeholder="e.g., cat, ear, car wheel...",
-                         )

-                         run_button = gr.Button("Segment Image", variant="primary")

                      with gr.Column(scale=1.5):
-                         output_image = gr.AnnotatedImage(label="Segmented Output", height=380)
-
-                 with gr.Row():
-                     threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.05)

-                 gr.Examples(
-                     examples=[
-                         ["examples/player.jpg", "player in white", 0.5],
-                         ["examples/goldencat.webp", "black cat", 0.4],
-                         ["examples/taxi.jpg", "blue taxi", 0.5],
-                     ],
-                     inputs=[input_image, text_prompt, threshold],
-                     outputs=[output_image],
-                     fn=segment_image,
-                     cache_examples="lazy",
-                     label="Image Examples"
                  )

-             with gr.TabItem("Video Segmentation"):
                  with gr.Row():
                      with gr.Column():
-                         input_video = gr.Video(label="Input Video", format="mp4")
-                         video_text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g.: person, car")
-                         max_frames_slider = gr.Slider(10, 1000, value=50, step=10, label="Max Frames to Process")
-                         processing_duration = gr.Radio([60, 120], value=60, label="Max Processing Time (seconds)", info="Choose 60s for short clips, 120s for complex tasks")
-                         start_video_segmentation_button = gr.Button("Start Video Segmentation", variant="primary")
                      with gr.Column():
-                         output_video = gr.Video(label="Result Video")
-                         status_textbox = gr.Textbox(label="Status")

-                 start_video_segmentation_button.click(
-                     process_video_text,
-                     [input_video, video_text_prompt, max_frames_slider, processing_duration],
-                     [output_video, status_textbox]
                  )

-         run_button.click(
-             fn=segment_image,
-             inputs=[input_image, text_prompt, threshold],
-             outputs=[output_image]
-         )
-
  if __name__ == "__main__":
-     demo.launch(debug=True, show_error=True)
 
@@ -1,18 +1,23 @@
  import os
+ import gc
+ import cv2
+ import tempfile
  import spaces
  import gradio as gr
  import numpy as np
  import torch
+ import matplotlib
+ import matplotlib.pyplot as plt
+ from PIL import Image
  from typing import Iterable
  from gradio.themes import Soft
  from gradio.themes.utils import colors, fonts, sizes
+ from transformers import (
+     Sam3Model, Sam3Processor,
+     Sam3VideoModel, Sam3VideoProcessor
+ )

+ # --- THEME CONFIGURATION ---
  colors.steel_blue = colors.Color(
      name="steel_blue",
      c50="#EBF3F8",
@@ -28,7 +33,7 @@ colors.steel_blue = colors.Color(
      c950="#1E3450",
  )

+ class CustomBlueTheme(Soft):
      def __init__(
          self,
          *,
@@ -73,215 +78,229 @@ class SteelBlueTheme(Soft):
              block_label_background_fill="*primary_200",
          )

+ app_theme = CustomBlueTheme()

+ # --- MODEL MANAGEMENT & UTILS ---
+ MODEL_CACHE = {}
  device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using compute device: {device}")

+ def clear_vram():
+     """Forces RAM/VRAM cleanup."""
+     if MODEL_CACHE:
+         print("🧹 Cleaning up memory...")
+         MODEL_CACHE.clear()
+     gc.collect()
+     torch.cuda.empty_cache()

+ def load_segmentation_model(model_key):
+     """Lazy loads the specific SAM3 model required."""
+     if model_key in MODEL_CACHE:
+         return MODEL_CACHE[model_key]

+     clear_vram()
+     print(f"⏳ Loading {model_key}...")

+     try:
+         if model_key == "img_seg_model":
+             # Using generic internal names
+             seg_model = Sam3Model.from_pretrained("facebook/sam3").to(device)
+             seg_processor = Sam3Processor.from_pretrained("facebook/sam3")
+             MODEL_CACHE[model_key] = (seg_model, seg_processor)
+
+         elif model_key == "vid_seg_model":
+             vid_model = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
+             vid_processor = Sam3VideoProcessor.from_pretrained("facebook/sam3")
+             MODEL_CACHE[model_key] = (vid_model, vid_processor)
+
+         print(f"✅ {model_key} loaded.")
+         return MODEL_CACHE[model_key]

+     except Exception as e:
+         print(f"❌ Error loading model: {e}")
+         clear_vram()
+         raise e
+
+ def apply_mask_overlay(base_image, mask_data, opacity=0.5):
+     """Draws segmentation masks on top of an image."""
+     if isinstance(base_image, np.ndarray):
+         base_image = Image.fromarray(base_image)
+     base_image = base_image.convert("RGBA")
+
+     if mask_data is None or len(mask_data) == 0:
+         return base_image.convert("RGB")

+     if isinstance(mask_data, torch.Tensor):
+         mask_data = mask_data.cpu().numpy()
+     mask_data = mask_data.astype(np.uint8)
+
+     # Handle dimensions
+     if mask_data.ndim == 4: mask_data = mask_data[0]
+     if mask_data.ndim == 3 and mask_data.shape[0] == 1: mask_data = mask_data[0]
+
+     num_masks = mask_data.shape[0] if mask_data.ndim == 3 else 1
+     if mask_data.ndim == 2:
+         mask_data = [mask_data]
+         num_masks = 1

+     try:
+         color_map = matplotlib.colormaps["rainbow"].resampled(max(num_masks, 1))
+     except AttributeError:
+         import matplotlib.cm as cm
+         color_map = cm.get_cmap("rainbow").resampled(max(num_masks, 1))
+
+     rgb_colors = [tuple(int(c * 255) for c in color_map(i)[:3]) for i in range(num_masks)]
+     composite_layer = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
+
+     for i, single_mask in enumerate(mask_data):
+         mask_bitmap = Image.fromarray((single_mask * 255).astype(np.uint8))
+         if mask_bitmap.size != base_image.size:
+             mask_bitmap = mask_bitmap.resize(base_image.size, resample=Image.NEAREST)
+
+         fill_color = rgb_colors[i]
+         color_fill = Image.new("RGBA", base_image.size, fill_color + (0,))
+         mask_alpha = mask_bitmap.point(lambda v: int(v * opacity) if v > 0 else 0)
+         color_fill.putalpha(mask_alpha)
+         composite_layer = Image.alpha_composite(composite_layer, color_fill)
+
+     return Image.alpha_composite(base_image, composite_layer).convert("RGB")

  @spaces.GPU
+ def run_image_segmentation(source_img, text_query, conf_thresh=0.5):
+     if source_img is None or not text_query:
+         raise gr.Error("Please provide an image and a text prompt.")

      try:
+         active_model, active_processor = load_segmentation_model("img_seg_model")
+         pil_image = source_img.convert("RGB")
+
+         model_inputs = active_processor(images=pil_image, text=text_query, return_tensors="pt").to(device)

+         with torch.no_grad():
+             inference_output = active_model(**model_inputs)

+         processed_results = active_processor.post_process_instance_segmentation(
+             inference_output,
+             threshold=conf_thresh,
+             mask_threshold=0.5,
+             target_sizes=model_inputs.get("original_sizes").tolist()
+         )[0]
+
+         annotation_list = []
+         raw_masks = processed_results['masks'].cpu().numpy()
+         raw_scores = processed_results['scores'].cpu().numpy()
+
+         for idx, mask_array in enumerate(raw_masks):
+             label_str = f"{text_query} ({raw_scores[idx]:.2f})"
+             annotation_list.append((mask_array, label_str))
+
+         return (pil_image, annotation_list)

+     except Exception as e:
+         raise gr.Error(f"Error during image processing: {e}")

+ def calc_timeout_duration(vid_file, *args):
+     return args[-1] if args else 60

+ @spaces.GPU(duration=calc_timeout_duration)
+ def run_video_segmentation(source_vid, text_query, frame_limit, time_limit):
+     if not source_vid or not text_query:
+         raise gr.Error("Missing video or prompt.")
+
      try:
+         active_model, active_processor = load_segmentation_model("vid_seg_model")

+         video_cap = cv2.VideoCapture(source_vid)
+         vid_fps = video_cap.get(cv2.CAP_PROP_FPS)
+         vid_w = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         vid_h = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+         video_frames = []
+         counter = 0
+         while video_cap.isOpened():
+             ret, frame = video_cap.read()
+             if not ret or (frame_limit > 0 and counter >= frame_limit): break
+             video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+             counter += 1
+         video_cap.release()
+
+         session = active_processor.init_video_session(video=video_frames, inference_device=device, dtype=torch.bfloat16)
+         session = active_processor.add_text_prompt(inference_session=session, text=text_query)
+
+         temp_out_path = tempfile.mktemp(suffix=".mp4")
+         video_writer = cv2.VideoWriter(temp_out_path, cv2.VideoWriter_fourcc(*'mp4v'), vid_fps, (vid_w, vid_h))
+
+         for model_out in active_model.propagate_in_video_iterator(inference_session=session, max_frame_num_to_track=len(video_frames)):
+             post_processed = active_processor.postprocess_outputs(session, model_out)
+             f_idx = model_out.frame_idx
+             original_pil = Image.fromarray(video_frames[f_idx])

+             if 'masks' in post_processed:
+                 detected_masks = post_processed['masks']
+                 if detected_masks.ndim == 4: detected_masks = detected_masks.squeeze(1)
+                 final_frame = apply_mask_overlay(original_pil, detected_masks)
+             else:
+                 final_frame = original_pil
+
+             video_writer.write(cv2.cvtColor(np.array(final_frame), cv2.COLOR_RGB2BGR))

+         video_writer.release()
+         return temp_out_path, "Video processing completed successfully."

      except Exception as e:
+         return None, f"Error during video processing: {str(e)}"

+ # --- GUI ---
+ custom_css="""
+ #col-container { margin: 0 auto; max-width: 1100px; }
+ #main-title h1 { font-size: 2.1em !important; }
  """

+ with gr.Blocks(css=custom_css, theme=app_theme) as main_interface:
      with gr.Column(elem_id="col-container"):
+         gr.Markdown("# **SAM3**", elem_id="main-title")

          with gr.Tabs():
+             with gr.Tab("Image Segmentation"):
                  with gr.Row():
                      with gr.Column(scale=1):
+                         image_input = gr.Image(label="Source Image", type="pil", height=350)
+                         txt_prompt_img = gr.Textbox(label="Text Description", placeholder="e.g., cat, face, car wheel")
+                         with gr.Accordion("Advanced Settings", open=False):
+                             conf_slider = gr.Slider(0.0, 1.0, value=0.45, step=0.05, label="Confidence Threshold")

+                         btn_process_img = gr.Button("Segment Image", variant="primary")

                      with gr.Column(scale=1.5):
+                         image_result = gr.AnnotatedImage(label="Segmented Result", height=450)

+                 btn_process_img.click(
+                     fn=run_image_segmentation,
+                     inputs=[image_input, txt_prompt_img, conf_slider],
+                     outputs=[image_result]
                  )

+             with gr.Tab("Video Segmentation"):
                  with gr.Row():
                      with gr.Column():
+                         video_input = gr.Video(label="Source Video", format="mp4")
+                         txt_prompt_vid = gr.Textbox(label="Text Description", placeholder="e.g., person running, red car")
+
+                         with gr.Row():
+                             frame_limiter = gr.Slider(10, 500, value=60, step=10, label="Max Frames")
+                             time_limiter = gr.Radio([60, 120, 180], value=60, label="Timeout (seconds)")
+
+                         btn_process_vid = gr.Button("Segment Video", variant="primary")
+
                      with gr.Column():
+                         video_result = gr.Video(label="Processed Video")
+                         process_status = gr.Textbox(label="System Status", interactive=False)

+                 btn_process_vid.click(
+                     run_video_segmentation,
+                     inputs=[video_input, txt_prompt_vid, frame_limiter, time_limiter],
+                     outputs=[video_result, process_status]
                  )

  if __name__ == "__main__":
+     main_interface.launch(ssr_mode=False, show_error=True)
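
For reference outside the Space, here is a minimal standalone sketch of the text-prompted image path that the updated app.py wires into Gradio. The facebook/sam3 checkpoint, the processor call, and the post-processing arguments are taken from the diff above; the photo.jpg input and the "cat" prompt are placeholders, and the sketch assumes a transformers build that ships the SAM3 classes.

    import torch
    from PIL import Image
    from transformers import Sam3Model, Sam3Processor

    # Same checkpoint and device selection as in app.py above.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Sam3Model.from_pretrained("facebook/sam3").to(device)
    processor = Sam3Processor.from_pretrained("facebook/sam3")

    # Placeholder inputs: any RGB image plus a short noun-phrase prompt.
    image = Image.open("photo.jpg").convert("RGB")
    inputs = processor(images=image, text="cat", return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Same post-processing call as run_image_segmentation: one dict per image,
    # holding "masks" and "scores" resized to the original image resolution.
    results = processor.post_process_instance_segmentation(
        outputs,
        threshold=0.5,
        mask_threshold=0.5,
        target_sizes=inputs.get("original_sizes").tolist(),
    )[0]
    print(f"found {len(results['masks'])} instance(s)")

As in run_image_segmentation, pairing each mask with a label built from the prompt and score yields the (image, [(mask, label), ...]) tuple that gr.AnnotatedImage expects.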