Update app.py
app.py CHANGED
@@ -1,154 +1,165 @@
(removed from the previous version; lines truncated by the diff viewer are omitted:)

-from transformers import pipeline
-from groundingdino.util.inference import load_model, load_image, predict
-import cv2
-# Download
-# Load Grounding DINO model from Hugging Face
-# Using a different approach that doesn't require local config files
-from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
-model_type = "vit_h"
-def detect_and_segment(image, text_prompt="fish", quality="Medium (512px)"):
-        # Resize
-        scores = results[0]["scores"].cpu().numpy()
-        # Convert boxes to SAM format
-        annotated_image = cv2.addWeighted(annotated_image, 1, color_mask, 0.5, 0)
-        mask_pixels = sum(np.sum(mask) for mask in masks)
-        mask_percentage = (mask_pixels / total_pixels) * 100
-        metadata = {
-            "detections": len(boxes),
-            "avg_confidence": float(np.mean(scores)),
-            "image_size": f"{new_w}x{new_h}",
-            "mask_percentage": f"{mask_percentage:.2f}%"
-        }
-                label="Processing Quality"
-        inputs=[input_image, text_prompt, quality]
-demo.launch()

(new version:)
 import gradio as gr
+import torch
 import numpy as np
 from PIL import Image
+import cv2
 from segment_anything import sam_model_registry, SamPredictor
+from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 import supervision as sv
 import os
+import urllib.request

+# Download the SAM checkpoint if it doesn't exist
+SAM_CHECKPOINT = "sam_vit_h_4b8939.pth"
+SAM_CHECKPOINT_URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
+
+if not os.path.exists(SAM_CHECKPOINT):
+    print("Downloading SAM checkpoint...")
+    urllib.request.urlretrieve(SAM_CHECKPOINT_URL, SAM_CHECKPOINT)
+    print("SAM checkpoint downloaded!")

+# Initialize models
+device = "cuda" if torch.cuda.is_available() else "cpu"

+# Load Grounding DINO from Hugging Face
+grounding_dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
+grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
+    "IDEA-Research/grounding-dino-tiny"
+).to(device)

+# Load SAM
+sam = sam_model_registry["vit_h"](checkpoint=SAM_CHECKPOINT)
+sam.to(device=device)
 sam_predictor = SamPredictor(sam)

+def process_image(image, text_prompt, box_threshold, text_threshold, quality):
     """
+    Process an image with Grounded SAM: detect objects matching the text
+    prompt, then segment each detection.
     """
     try:
+        # Resize based on quality setting
+        if quality == "Low":
+            max_size = 800
+        elif quality == "Medium":
+            max_size = 1024
+        else:  # High
+            max_size = 1920
+
+        # Resize image if needed
+        h, w = image.shape[:2]
+        if max(h, w) > max_size:
+            scale = max_size / max(h, w)
+            new_h, new_w = int(h * scale), int(w * scale)
+            image = cv2.resize(image, (new_w, new_h))
+
+        # Convert to a PIL image for Grounding DINO
+        pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+
+        # Grounding DINO inference
+        inputs = grounding_dino_processor(images=pil_image, text=text_prompt, return_tensors="pt").to(device)

         with torch.no_grad():
+            outputs = grounding_dino_model(**inputs)

         # Post-process results
+        results = grounding_dino_processor.post_process_grounded_object_detection(
             outputs,
             inputs.input_ids,
+            box_threshold=box_threshold,
+            text_threshold=text_threshold,
+            target_sizes=[pil_image.size[::-1]]
+        )[0]
+
+        # Extract boxes and labels
+        boxes = results["boxes"].cpu().numpy()
+        labels = results["labels"]

+        if len(boxes) == 0:
+            return image, "No objects detected. Try adjusting the thresholds or text prompt."

+        # Boxes from post-processing are already in xyxy format, as SAM expects
+        boxes_xyxy = boxes

+        # SAM inference
+        sam_predictor.set_image(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

         masks = []
+        for box in boxes_xyxy:
+            mask, _, _ = sam_predictor.predict(
+                box=box,
+                multimask_output=False
+            )
             masks.append(mask[0])

+        # Visualize results
+        result_image = image.copy()

         # Draw masks
+        for i, mask in enumerate(masks):
+            color = np.random.randint(0, 255, 3).tolist()
+            result_image[mask] = result_image[mask] * 0.5 + np.array(color) * 0.5

+        # Draw boxes and labels
+        for i, (box, label) in enumerate(zip(boxes_xyxy, labels)):
             x1, y1, x2, y2 = map(int, box)
+            color = np.random.randint(0, 255, 3).tolist()
+            cv2.rectangle(result_image, (x1, y1), (x2, y2), color, 2)
+            cv2.putText(result_image, label, (x1, y1 - 10),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

+        metadata = f"✅ Detected {len(boxes)} objects: {', '.join(labels)}"
+        return result_image, metadata

     except Exception as e:
+        return image, f"❌ Error: {str(e)}"

+# Gradio interface
+with gr.Blocks(title="Grounded SAM") as demo:
+    gr.Markdown("# 🎯 Grounded SAM - Object Detection & Segmentation")
+    gr.Markdown("Upload an image and describe what you want to detect (e.g., 'fish', 'all fish', 'person').")

     with gr.Row():
         with gr.Column():
+            input_image = gr.Image(label="Input Image", type="numpy")
+            text_prompt = gr.Textbox(
+                label="Text Prompt",
+                placeholder="e.g., 'fish', 'person', 'car'",
+                value="fish"
             )
+
+            with gr.Accordion("Advanced Settings", open=False):
+                box_threshold = gr.Slider(
+                    minimum=0.0, maximum=1.0, value=0.35, step=0.05,
+                    label="Box Threshold (detection confidence)"
+                )
+                text_threshold = gr.Slider(
+                    minimum=0.0, maximum=1.0, value=0.25, step=0.05,
+                    label="Text Threshold (text matching confidence)"
+                )
+                quality = gr.Radio(
+                    choices=["Low", "Medium", "High"],
+                    value="Medium",
+                    label="Processing Quality"
+                )
+
+            submit_btn = gr.Button("🚀 Process Image", variant="primary")

         with gr.Column():
+            output_image = gr.Image(label="Output with Masks & Boxes", type="numpy")
+            output_metadata = gr.Textbox(label="Detection Metadata", lines=3)

     submit_btn.click(
+        fn=process_image,
+        inputs=[input_image, text_prompt, box_threshold, text_threshold, quality],
         outputs=[output_image, output_metadata]
     )

     gr.Examples(
         examples=[
+            ["examples/fish1.jpg", "fish", 0.35, 0.25, "Medium"],
+            ["examples/fish2.jpg", "all fish", 0.35, 0.25, "Medium"],
         ],
+        inputs=[input_image, text_prompt, box_threshold, text_threshold, quality],
     )

+demo.launch()
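
As a usage note, the detection stage above can be exercised on its own when tuning the thresholds. Below is a minimal sketch, assuming the same IDEA-Research/grounding-dino-tiny checkpoint and the app's default thresholds; the image path is a placeholder, and Grounding DINO text queries are conventionally lowercase and terminated with a period:

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")

image = Image.open("examples/fish1.jpg")  # placeholder path
inputs = processor(images=image, text="fish.", return_tensors="pt")  # lowercase, period-terminated query
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.35,
    text_threshold=0.25,
    target_sizes=[image.size[::-1]],  # (height, width)
)[0]
print(results["boxes"], results["scores"], results["labels"])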
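For a quick smoke test of the handler without the UI, process_image can also be called directly. A minimal sketch, assuming the file above is saved as app.py and that the module-level demo.launch() is guarded by if __name__ == "__main__": or temporarily commented out (otherwise importing the module starts the server); the image path is again a placeholder:

import cv2
import app  # the file above; assumes demo.launch() is guarded

# cv2.imread returns a BGR array, matching the BGR2RGB conversions inside process_image
image = cv2.imread("examples/fish1.jpg")  # placeholder path
result, info = app.process_image(image, "fish", box_threshold=0.35, text_threshold=0.25, quality="Medium")
print(info)
cv2.imwrite("result.jpg", result)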