Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,270 +1,154 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import torch
|
| 3 |
import numpy as np
|
| 4 |
from PIL import Image
|
| 5 |
-
import
|
| 6 |
-
from
|
|
|
|
| 7 |
from segment_anything import sam_model_registry, SamPredictor
|
| 8 |
import supervision as sv
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
if grounding_dino_model is None:
|
| 23 |
-
print("π¦ Loading Grounding DINO model...")
|
| 24 |
-
grounding_dino_model = GroundingDINOModel(
|
| 25 |
-
model_config_path="GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
|
| 26 |
-
model_checkpoint_path="weights/groundingdino_swint_ogc.pth",
|
| 27 |
-
device=device
|
| 28 |
-
)
|
| 29 |
-
print("β
Grounding DINO loaded!")
|
| 30 |
-
|
| 31 |
-
if sam_predictor is None:
|
| 32 |
-
print("π¦ Loading SAM model...")
|
| 33 |
-
sam_checkpoint = "weights/sam_vit_h_4b8939.pth"
|
| 34 |
-
model_type = "vit_h"
|
| 35 |
-
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
|
| 36 |
-
sam.to(device=device)
|
| 37 |
-
sam_predictor = SamPredictor(sam)
|
| 38 |
-
print("β
SAM loaded!")
|
| 39 |
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"""
|
| 42 |
-
Detect
|
| 43 |
-
|
| 44 |
-
Args:
|
| 45 |
-
image_pil: PIL Image
|
| 46 |
-
text_prompt: Text prompt for detection (default: "fish")
|
| 47 |
-
box_threshold: Confidence threshold for boxes
|
| 48 |
-
text_threshold: Confidence threshold for text matching
|
| 49 |
-
|
| 50 |
-
Returns:
|
| 51 |
-
mask: Binary mask of detected fish
|
| 52 |
-
metadata: Detection metadata
|
| 53 |
"""
|
| 54 |
-
load_models()
|
| 55 |
-
|
| 56 |
-
# Convert PIL to numpy
|
| 57 |
-
image_np = np.array(image_pil)
|
| 58 |
-
|
| 59 |
-
# 1. Grounding DINO: Detect fish boxes
|
| 60 |
-
print(f"π Detecting '{text_prompt}' with Grounding DINO...")
|
| 61 |
-
detections = grounding_dino_model.predict_with_classes(
|
| 62 |
-
image=image_np,
|
| 63 |
-
classes=[text_prompt],
|
| 64 |
-
box_threshold=box_threshold,
|
| 65 |
-
text_threshold=text_threshold
|
| 66 |
-
)
|
| 67 |
-
|
| 68 |
-
print(f"π¦ Found {len(detections.xyxy)} boxes")
|
| 69 |
-
|
| 70 |
-
if len(detections.xyxy) == 0:
|
| 71 |
-
print("β No fish detected!")
|
| 72 |
-
return None, {
|
| 73 |
-
"success": False,
|
| 74 |
-
"mode": "grounded_sam",
|
| 75 |
-
"detection_method": "grounding_dino",
|
| 76 |
-
"fish_detected": False,
|
| 77 |
-
"reason": "No fish found in image"
|
| 78 |
-
}
|
| 79 |
-
|
| 80 |
-
# Select best detection (highest confidence)
|
| 81 |
-
best_idx = np.argmax(detections.confidence)
|
| 82 |
-
best_box = detections.xyxy[best_idx]
|
| 83 |
-
best_conf = float(detections.confidence[best_idx])
|
| 84 |
-
|
| 85 |
-
print(f"π― Best detection: Confidence={best_conf:.2f}, Box={best_box}")
|
| 86 |
-
|
| 87 |
-
# 2. SAM: Segment the detected fish
|
| 88 |
-
print("βοΈ Segmenting with SAM...")
|
| 89 |
-
sam_predictor.set_image(image_np)
|
| 90 |
-
|
| 91 |
-
# Convert box to SAM format
|
| 92 |
-
box_np = best_box.reshape(1, 4)
|
| 93 |
-
|
| 94 |
-
masks, scores, _ = sam_predictor.predict(
|
| 95 |
-
box=box_np,
|
| 96 |
-
multimask_output=False
|
| 97 |
-
)
|
| 98 |
-
|
| 99 |
-
mask = masks[0] # Get best mask
|
| 100 |
-
|
| 101 |
-
# Calculate statistics
|
| 102 |
-
mask_area = int(np.sum(mask))
|
| 103 |
-
total_pixels = mask.shape[0] * mask.shape[1]
|
| 104 |
-
mask_percentage = (mask_area / total_pixels) * 100
|
| 105 |
-
|
| 106 |
-
# Get contours
|
| 107 |
-
contours, _ = cv2.findContours(
|
| 108 |
-
mask.astype(np.uint8),
|
| 109 |
-
cv2.RETR_EXTERNAL,
|
| 110 |
-
cv2.CHAIN_APPROX_SIMPLE
|
| 111 |
-
)
|
| 112 |
-
|
| 113 |
-
# Get fish center
|
| 114 |
-
if len(contours) > 0:
|
| 115 |
-
largest_contour = max(contours, key=cv2.contourArea)
|
| 116 |
-
M = cv2.moments(largest_contour)
|
| 117 |
-
if M["m00"] != 0:
|
| 118 |
-
cx = int(M["m10"] / M["m00"])
|
| 119 |
-
cy = int(M["m01"] / M["m00"])
|
| 120 |
-
else:
|
| 121 |
-
cx, cy = int(best_box[0] + best_box[2]) // 2, int(best_box[1] + best_box[3]) // 2
|
| 122 |
-
else:
|
| 123 |
-
cx, cy = int(best_box[0] + best_box[2]) // 2, int(best_box[1] + best_box[3]) // 2
|
| 124 |
-
|
| 125 |
-
# Convert contours to list format
|
| 126 |
-
contour_points = []
|
| 127 |
-
if len(contours) > 0:
|
| 128 |
-
for point in contours[0][:100]: # Limit to 100 points
|
| 129 |
-
contour_points.append({
|
| 130 |
-
"x": int(point[0][0]),
|
| 131 |
-
"y": int(point[0][1])
|
| 132 |
-
})
|
| 133 |
-
|
| 134 |
-
metadata = {
|
| 135 |
-
"success": True,
|
| 136 |
-
"mode": "grounded_sam",
|
| 137 |
-
"detection_method": "grounding_dino_sam",
|
| 138 |
-
"fish_detected": True,
|
| 139 |
-
"grounding_dino": {
|
| 140 |
-
"confidence": best_conf,
|
| 141 |
-
"bounding_box": [int(x) for x in best_box],
|
| 142 |
-
"text_prompt": text_prompt,
|
| 143 |
-
"total_detections": len(detections.xyxy)
|
| 144 |
-
},
|
| 145 |
-
"mask_area": mask_area,
|
| 146 |
-
"mask_percentage": mask_percentage,
|
| 147 |
-
"num_contours": len(contours),
|
| 148 |
-
"fish_center": [cx, cy],
|
| 149 |
-
"image_size": list(mask.shape),
|
| 150 |
-
"device": device,
|
| 151 |
-
"contours": contour_points
|
| 152 |
-
}
|
| 153 |
-
|
| 154 |
-
print(f"β
Segmentation complete! Mask: {mask_percentage:.2f}%")
|
| 155 |
-
|
| 156 |
-
return mask, metadata
|
| 157 |
-
|
| 158 |
-
def process_image(image, quality="high"):
|
| 159 |
-
"""Main processing function for Gradio interface"""
|
| 160 |
-
|
| 161 |
-
if image is None:
|
| 162 |
-
return None, "β No image provided"
|
| 163 |
-
|
| 164 |
try:
|
| 165 |
-
#
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
#
|
| 172 |
-
|
| 173 |
-
image_pil.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
|
| 174 |
|
| 175 |
-
|
| 176 |
-
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
# Create visualization
|
| 182 |
-
|
| 183 |
|
| 184 |
-
#
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
| 188 |
|
| 189 |
-
# Draw bounding
|
| 190 |
-
box
|
| 191 |
-
|
|
|
|
| 192 |
|
| 193 |
-
#
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
Detections: {metadata['grounding_dino']['total_detections']}
|
| 205 |
-
|
| 206 |
-
βοΈ SAM Segmentation
|
| 207 |
-
Mask Area: {metadata['mask_percentage']:.2f}%
|
| 208 |
-
Fish Center: {metadata['fish_center']}
|
| 209 |
-
Contours: {metadata['num_contours']}
|
| 210 |
-
|
| 211 |
-
βοΈ System
|
| 212 |
-
Device: {metadata['device']}
|
| 213 |
-
Image Size: {metadata['image_size']}
|
| 214 |
-
"""
|
| 215 |
|
| 216 |
-
return
|
| 217 |
|
| 218 |
except Exception as e:
|
| 219 |
-
|
| 220 |
-
import traceback
|
| 221 |
-
traceback.print_exc()
|
| 222 |
-
return None, f"β Error: {str(e)}"
|
| 223 |
|
| 224 |
-
# Gradio
|
| 225 |
-
with gr.Blocks(title="
|
| 226 |
-
gr.Markdown(""
|
| 227 |
-
|
| 228 |
-
### Powered by Grounding DINO + SAM
|
| 229 |
-
|
| 230 |
-
Upload an image with a fish and watch the AI detect and segment it!
|
| 231 |
-
|
| 232 |
-
β οΈ **CPU Mode**: First run downloads ~680MB models (2-3 min). Processing: ~30-60 sec per image.
|
| 233 |
-
""")
|
| 234 |
|
| 235 |
with gr.Row():
|
| 236 |
with gr.Column():
|
| 237 |
-
input_image = gr.Image(type="pil", label="
|
|
|
|
| 238 |
quality = gr.Radio(
|
| 239 |
-
choices=["
|
| 240 |
-
value="
|
| 241 |
-
label="
|
| 242 |
-
info="High = 1024px, Medium = 768px (faster)"
|
| 243 |
)
|
| 244 |
-
|
| 245 |
|
| 246 |
with gr.Column():
|
| 247 |
-
output_image = gr.Image(label="
|
| 248 |
-
|
| 249 |
|
| 250 |
-
|
| 251 |
-
fn=
|
| 252 |
-
inputs=[input_image, quality],
|
| 253 |
-
outputs=[output_image,
|
| 254 |
)
|
| 255 |
|
| 256 |
-
gr.
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
### π Model Info
|
| 264 |
-
- Grounding DINO: Text-prompted object detection
|
| 265 |
-
- SAM (ViT-H): High-quality segmentation
|
| 266 |
-
- Total Model Size: ~680MB
|
| 267 |
-
""")
|
| 268 |
|
| 269 |
if __name__ == "__main__":
|
| 270 |
-
demo.launch(
|
|
|
|
| 1 |
import gradio as gr
import numpy as np
from PIL import Image
import torch
from transformers import pipeline
from groundingdino.util.inference import load_model, load_image, predict
from segment_anything import sam_model_registry, SamPredictor
import supervision as sv
import cv2
import os
import urllib.request

# ---- One-time model setup (runs at import time, i.e. on Space startup) ----
print("Loading models...")

# Load Grounding DINO through Hugging Face transformers -- this avoids the
# repo-local config/checkpoint files the original GroundingDINO API requires.
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")

# SAM (ViT-H) checkpoint file name and registry key.
sam_checkpoint = "sam_vit_h_4b8939.pth"
model_type = "vit_h"

# Download SAM weights if not present.
if not os.path.exists(sam_checkpoint):
    # Use urllib instead of `os.system("wget …")`: wget is not guaranteed to
    # exist in the container, and os.system failures are silent, which would
    # leave sam_model_registry to crash on a missing checkpoint below.
    urllib.request.urlretrieve(
        f"https://dl.fbaipublicfiles.com/segment_anything/{sam_checkpoint}",
        sam_checkpoint,
    )

sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam_predictor = SamPredictor(sam)

print("Models loaded successfully!")
def detect_and_segment(image, text_prompt="fish", quality="Medium (512px)"):
    """
    Detect objects using Grounding DINO and segment using SAM.

    Args:
        image: Input PIL image; None is handled gracefully.
        text_prompt: Text query for Grounding DINO (default "fish").
        quality: Resolution preset label -- "Low (256px)", "Medium (512px)"
            or "High (1024px)". Unknown values fall back to 512.

    Returns:
        Tuple of (annotated PIL image, metadata dict). On failure the
        original image is returned together with an {"error": ...} dict
        so the Gradio UI surfaces the problem instead of crashing.
    """
    if image is None:
        return None, {"error": "No image provided"}

    try:
        # Resize image based on quality setting.
        quality_map = {
            "Low (256px)": 256,
            "Medium (512px)": 512,
            "High (1024px)": 1024,
        }
        target_size = quality_map.get(quality, 512)

        # Force 3-channel RGB so the mask overlay / box drawing below is
        # valid even for grayscale or RGBA uploads.
        image_np = np.array(image.convert("RGB"))
        h, w = image_np.shape[:2]

        # Resize maintaining aspect ratio; clamp to >= 1px so extreme
        # aspect ratios can never produce a zero-sized dimension.
        scale = min(target_size / w, target_size / h)
        new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))
        image_resized = cv2.resize(image_np, (new_w, new_h))

        # 1) Grounding DINO: text-prompted box detection.
        inputs = dino_processor(images=image_resized, text=text_prompt, return_tensors="pt")

        with torch.no_grad():
            outputs = dino_model(**inputs)

        # Post-process raw logits into boxes in resized-image coordinates.
        results = dino_processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=0.25,
            text_threshold=0.25,
            target_sizes=[(new_h, new_w)]
        )

        if len(results) == 0 or len(results[0]["boxes"]) == 0:
            return image, {"error": "No fish detected", "detections": 0}

        boxes = results[0]["boxes"].cpu().numpy()
        scores = results[0]["scores"].cpu().numpy()

        # 2) SAM: one mask per detected box.
        sam_predictor.set_image(image_resized)

        masks = []
        for box in boxes:
            box_sam = np.array([box[0], box[1], box[2], box[3]])
            mask, _, _ = sam_predictor.predict(box=box_sam, multimask_output=False)
            # Ensure boolean dtype so the fancy indexing below is valid.
            masks.append(mask[0].astype(bool))

        # 3) Visualization: translucent green masks + red bounding boxes.
        annotated_image = image_resized.copy()

        for mask in masks:
            color_mask = np.zeros_like(annotated_image)
            color_mask[mask] = [0, 255, 0]  # Green mask
            annotated_image = cv2.addWeighted(annotated_image, 1, color_mask, 0.5, 0)

        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(annotated_image, (x1, y1), (x2, y2), (0, 0, 255), 2)

        # Summary statistics for the JSON panel.
        total_pixels = new_w * new_h
        mask_pixels = sum(int(np.sum(mask)) for mask in masks)
        mask_percentage = (mask_pixels / total_pixels) * 100

        metadata = {
            "detections": len(boxes),
            "avg_confidence": float(np.mean(scores)),
            "image_size": f"{new_w}x{new_h}",
            "mask_percentage": f"{mask_percentage:.2f}%"
        }

        return Image.fromarray(annotated_image), metadata

    except Exception as e:
        # Report the failure through the UI rather than crashing the Space.
        return image, {"error": str(e)}
|
|
|
|
|
|
|
|
|
# Create Gradio interface
with gr.Blocks(title="Grounded SAM - Fish Detection") as demo:
    gr.Markdown("# π Grounded SAM: Fish Detection & Segmentation")
    gr.Markdown("Upload an image and detect fish using Grounding DINO + Segment Anything Model")

    with gr.Row():
        with gr.Column():
            # Inputs: image, free-text detection prompt, quality preset.
            input_image = gr.Image(type="pil", label="Upload Image")
            text_prompt = gr.Textbox(value="fish", label="Detection Prompt")
            quality = gr.Radio(
                choices=["Low (256px)", "Medium (512px)", "High (1024px)"],
                value="Medium (512px)",
                label="Processing Quality"
            )
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column():
            output_image = gr.Image(label="Detection Result")
            output_metadata = gr.JSON(label="Detection Metadata")

    submit_btn.click(
        fn=detect_and_segment,
        inputs=[input_image, text_prompt, quality],
        outputs=[output_image, output_metadata]
    )

    # NOTE(review): only register the example when the asset is actually
    # bundled with the Space -- gr.Examples raises at build time on a
    # missing file, which crashes the app on startup.
    if os.path.exists("fish_angler.jpg"):
        gr.Examples(
            examples=[
                ["fish_angler.jpg", "fish", "High (1024px)"],
            ],
            inputs=[input_image, text_prompt, quality]
        )
| 152 |
|
| 153 |
if __name__ == "__main__":
    # Entry point when run directly (Spaces executes app.py as a script).
    demo.launch()