Spaces:

EnginDev
/

Boostly

Running

App Files Files Community

EnginDev committed on Oct 14, 2025

Commit

86db70f

verified ·

1 Parent(s): 9292061

Update app.py

Browse files

Files changed (1) hide show

app.py +215 -392

app.py CHANGED Viewed

@@ -3,445 +3,268 @@ import torch
 import numpy as np
 from PIL import Image
 import cv2
-print("🚀 Starting SAM2 FishBoost Edition v4.0 - ULTRA OPTIMIZED...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"📱 Using device: {device}")
-model = None
-processor = None
-def load_model():
-    global model, processor
-    if model is None:
         print("📦 Loading SAM model...")
-        try:
-            from transformers import SamModel, SamProcessor
-            model_name = "facebook/sam-vit-large"
-            processor = SamProcessor.from_pretrained(model_name)
-            model = SamModel.from_pretrained(model_name)
-            model.to(device)
-            print(f"✅ Model loaded: {model_name}")
-        except Exception as e:
-            print(f"❌ Error: {e}, falling back to base model")
-            model_name = "facebook/sam-vit-base"
-            processor = SamProcessor.from_pretrained(model_name)
-            model = SamModel.from_pretrained(model_name)
-            model.to(device)
-    return model, processor
-def prepare_image(image, max_size=1024):
-    if isinstance(image, np.ndarray):
-        image_pil = Image.fromarray(image)
-    else:
-        image_pil = image
-    if image_pil.mode != 'RGB':
-        image_pil = image_pil.convert('RGB')
-    image_np = np.array(image_pil)
-    h, w = image_np.shape[:2]
-    if max(h, w) > max_size:
-        scale = max_size / max(h, w)
-        new_h, new_w = int(h * scale), int(w * scale)
-        image_pil = image_pil.resize((new_w, new_h), Image.Resampling.LANCZOS)
-        image_np = np.array(image_pil)
-    return image_pil, image_np
-def refine_mask(mask, kernel_size=5):
-    """Glättet Maskenkanten"""
-    mask_uint8 = (mask > 0).astype(np.uint8) * 255
-    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
-    mask_closed = cv2.morphologyEx(mask_uint8, cv2.MORPH_CLOSE, kernel)
-    mask_refined = cv2.morphologyEx(mask_closed, cv2.MORPH_OPEN, kernel)
-    return mask_refined > 0
-def calculate_mask_center(mask):
-    """Berechnet Schwerpunkt der Maske"""
-    y_coords, x_coords = np.where(mask)
-    if len(x_coords) == 0:
-        return None, None
-    return np.mean(x_coords), np.mean(y_coords)
-def extract_contours_from_mask(mask):
-    """Extrahiert Konturen als [{x, y}, ...] Format"""
-    contours, _ = cv2.findContours(
-        mask.astype(np.uint8),
-        cv2.RETR_EXTERNAL,
-        cv2.CHAIN_APPROX_SIMPLE
     )
-    if not contours:
-        return []
-    # Größte Kontur wählen
-    largest_contour = max(contours, key=cv2.contourArea)
-    # Format konvertieren: [[x, y]] -> [{x: int, y: int}]
-    points = []
-    for point in largest_contour:
-        x, y = point[0]
-        points.append({"x": int(x), "y": int(y)})
-    return points
-def generate_grid_points(w, h, grid_size=3):
-    """Generiert Grid-Punkte über das Bild verteilt"""
-    points = []
-    for i in range(1, grid_size + 1):
-        for j in range(1, grid_size + 1):
-            x = int(w * i / (grid_size + 1))
-            y = int(h * j / (grid_size + 1))
-            points.append([x, y])
-    return points
-def select_best_fish_mask(all_masks, all_scores, image_shape):
-    """
-    🎣 ULTRA-INTELLIGENTE FISCH-AUSWAHL
-    Strategie:
-    1. Filtere sehr große Masken (>15% = Hintergrund/Angler)
-    2. Filtere sehr kleine Masken (<2% = Noise)
-    3. Wähle KLEINSTE verbleibende Maske (= Fisch)
-    """
-    h, w = image_shape
-    image_center_x, image_center_y = w / 2, h / 2
-    total_pixels = h * w
-    valid_masks = []
-    print(f"\n🔍 Analyzing {len(all_masks)} candidate masks...")
-    for mask_data in all_masks:
-        mask = mask_data['mask']
-        score = mask_data['score']
-        point = mask_data['point']
-        # Coverage berechnen
-        mask_area = np.sum(mask)
-        coverage = mask_area / total_pixels
-        # 🚫 FILTER 1: Zu groß (Hintergrund/Angler)
-        if coverage > 0.15:  # 15% Threshold (vorher 60%)
-            print(f"  ❌ Rejected: Coverage {coverage*100:.1f}% > 15% (Background)")
-            continue
-        # 🚫 FILTER 2: Zu klein (Noise)
-        if coverage < 0.02:  # 2% Minimum
-            print(f"  ❌ Rejected: Coverage {coverage*100:.1f}% < 2% (Noise)")
-            continue
-        # 🚫 FILTER 3: Schlechter Score
-        if score < 0.7:
-            print(f"  ❌ Rejected: Score {score:.3f} < 0.7")
-            continue
-        # Center Distance berechnen
-        center_x, center_y = calculate_mask_center(mask)
-        if center_x is None:
-            continue
-        distance_to_center = np.sqrt(
-            (center_x - image_center_x)**2 +
-            (center_y - image_center_y)**2
-        )
-        valid_masks.append({
-            'mask': mask,
-            'score': score,
-            'area': mask_area,
-            'coverage': coverage,
-            'center': (center_x, center_y),
-            'distance_to_center': distance_to_center,
-            'point': point
-        })
-        print(f"  ✅ Valid: coverage={coverage*100:.1f}%, score={score:.3f}, dist={distance_to_center:.0f}px")
-    if not valid_masks:
-        print("  ❌ No valid fish masks found!")
-        return None
-    # 🎯 STRATEGIE: Wähle KLEINSTE Maske (= Fisch, nicht Angler)
-    valid_masks.sort(key=lambda m: m['coverage'])
-    best_mask = valid_masks[0]
-    print(f"\n  🏆 SELECTED: Smallest mask (coverage: {best_mask['coverage']*100:.1f}%)")
-    return best_mask
-def segment_automatic(image, quality="high", mode="fish"):
-    """
-    🎣 ULTRA-OPTIMIZED Fish Detection
-    - Multi-Point Grid (9 Punkte statt nur Mitte)
-    - 15% Coverage Filter (statt 60%)
-    - Kleinste Maske = Fisch
-    """
     if image is None:
-        return None, {"error": "Kein Bild hochgeladen"}
     try:
-        print(f"\n{'='*60}")
-        print(f"🔄 Starting ULTRA segmentation (mode: {mode}, quality: {quality})")
-        print(f"{'='*60}")
-        model, processor = load_model()
-        image_pil, image_np = prepare_image(image)
-        h, w = image_np.shape[:2]
-        if mode == "fish":
-            # 🆕 MULTI-POINT GRID (statt nur Bildmitte)
-            grid_points = generate_grid_points(w, h, grid_size=3)
-            print(f"📍 Using {len(grid_points)} grid points for detection")
         else:
-            # Fallback: nur Bildmitte
-            grid_points = [[w // 2, h // 2]]
-        all_masks = []
-        # Für jeden Grid-Punkt: Maske generieren
-        for idx, point in enumerate(grid_points):
-            inputs = processor(
-                image_pil,
-                input_points=[[point]],
-                input_labels=[[1]],
-                return_tensors="pt"
-            ).to(device)
-            with torch.no_grad():
-                outputs = model(**inputs, multimask_output=True)
-            masks = processor.image_processor.post_process_masks(
-                outputs.pred_masks.cpu(),
-                inputs["original_sizes"].cpu(),
-                inputs["reshaped_input_sizes"].cpu()
-            )[0]
-            scores = outputs.iou_scores.cpu().numpy().flatten()
-            # Beste Maske dieses Punktes
-            best_idx = np.argmax(scores)
-            if masks.ndim == 4:
-                mask = masks[0, best_idx].numpy()
-            else:
-                mask = masks[best_idx].numpy()
-            all_masks.append({
-                'mask': mask > 0,
-                'score': scores[best_idx],
-                'point': point
-            })
-        print(f"✅ Generated {len(all_masks)} masks from grid points")
-        # 🎣 BESTE FISCH-MASKE WÄHLEN
-        best_fish = select_best_fish_mask(all_masks, None, (h, w))
-        if best_fish is None:
-            return None, {
-                "error": "No fish detected. Image might contain only background/angler.",
-                "suggestion": "Try 'Multi-Object' mode or use a different image."
-            }
-        final_mask = best_fish['mask']
-        # Refinement
-        if quality == "high":
-            print("🎨 Refining mask edges...")
-            final_mask = refine_mask(final_mask, kernel_size=7)
-        # 🆕 KONTUREN EXTRAHIEREN
-        contours_list = extract_contours_from_mask(final_mask)
-        # Overlay erstellen
         overlay = image_np.copy()
-        color = np.array([0, 255, 100])  # Grün für Fisch
-        mask_float = final_mask.astype(float)
-        if quality == "high":
-            mask_float = cv2.GaussianBlur(mask_float, (5, 5), 0)
-        for c in range(3):
-            overlay[:, :, c] = (
-                overlay[:, :, c] * (1 - mask_float * 0.65) +
-                color[c] * mask_float * 0.65
-            )
-        # Kontur zeichnen
-        contours_cv, _ = cv2.findContours(
-            final_mask.astype(np.uint8),
-            cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
-        )
-        cv2.drawContours(overlay, contours_cv, -1, (255, 255, 0), 3)
-        # Metadata
-        mask_area = int(np.sum(final_mask))
-        mask_percentage = float(mask_area / (h * w) * 100)
-        metadata = {
-            "success": True,
-            "mode": "automatic_fish_ultra_optimized",
-            "quality": quality,
-            "detection_method": "multi_point_grid" if mode == "fish" else "center_point",
-            "grid_points_used": len(grid_points),
-            "image_size": [w, h],
-            "mask_area": mask_area,
-            "mask_percentage": mask_percentage,
-            "num_contours": len(contours_cv),
-            "fish_score": float(best_fish['score']),
-            "fish_center": [float(best_fish['center'][0]), float(best_fish['center'][1])],
-            "device": device,
-            "contours": contours_list
-        }
-        print(f"\n{'='*60}")
-        print(f"✅ SEGMENTATION COMPLETE!")
-        print(f"   Fish coverage: {mask_percentage:.1f}%")
-        print(f"   Confidence: {best_fish['score']*100:.1f}%")
-        print(f"   Contour points: {len(contours_list)}")
-        print(f"{'='*60}\n")
-        return Image.fromarray(overlay.astype(np.uint8)), metadata
     except Exception as e:
         import traceback
-        print(f"❌ ERROR:\n{traceback.format_exc()}")
-        return image, {"error": str(e)}
 # Gradio Interface
-demo = gr.Blocks(title="SAM2 FishBoost ULTRA", theme=gr.themes.Soft())
-with demo:
-    gr.Markdown("# 🎣 SAM2 FishBoost ULTRA v4.0")
-    gr.Markdown("### Multi-Point Grid Detection + 15% Coverage Filter")
-    with gr.Tab("🎣 Fish Detection (ULTRA)"):
-        gr.Markdown("""
-        **🚀 NEUE FEATURES:**
-        - ✅ 9-Punkt Grid Detection (nicht nur Bildmitte!)
-        - ✅ 15% Coverage Filter (filtert Angler/Hintergrund)
-        - ✅ Kleinste Maske = Fisch
-        """)
-        with gr.Row():
-            with gr.Column():
-                input_fish = gr.Image(type="pil", label="📸 Bild hochladen")
-                quality_radio = gr.Radio(
-                    choices=["high", "fast"],
-                    value="high",
-                    label="⚙️ Qualität"
-                )
-                mode_radio = gr.Radio(
-                    choices=["fish", "multi"],
-                    value="fish",
-                    label="🎯 Modus",
-                    info="Fish = Multi-Point Grid, Multi = Center Only"
-                )
-                btn_fish = gr.Button("🎣 Fisch segmentieren", variant="primary", size="lg")
-                gr.Markdown("""
-                **💡 Wie es funktioniert:**
-                **Fish Mode (ULTRA):**
-                1. Scannt Bild mit 9 Punkten (3x3 Grid)
-                2. Ignoriert große Objekte (>15% = Angler)
-                3. Ignoriert kleine Objekte (<2% = Noise)
-                4. Wählt kleinste Maske (= Fisch!)
-                **Multi Mode:**
-                - Alte Methode (nur Bildmitte)
-                - Für allgemeine Objekte
-                """)
-            with gr.Column():
-                output_fish = gr.Image(label="✨ Segmentierter Fisch")
-                json_fish = gr.JSON(label="📊 Metadata")
-        btn_fish.click(
-            fn=segment_automatic,
-            inputs=[input_fish, quality_radio, mode_radio],
-            outputs=[output_fish, json_fish]
-        )
-        gr.Examples(
-            examples=[],
-            inputs=input_fish,
-            label="💡 Upload dein Angelfoto!"
-        )
-    with gr.Tab("📡 API Integration (Lovable)"):
-        gr.Markdown("### 🔗 API Endpoint")
-        gr.Code("https://EnginDev-Boostly.hf.space/api/predict", label="Base URL")
-        gr.Markdown("### 📝 JavaScript Code")
-        gr.Code('''
-// Fish Detection ULTRA
-const response = await fetch('https://EnginDev-Boostly.hf.space/api/predict', {
-  method: 'POST',
-  headers: {'Content-Type': 'application/json'},
-  body: JSON.stringify({
-    data: [
-      base64Image,    // Base64 image
-      "high",         // quality: "high" | "fast"
-      "fish"          // mode: "fish" (ULTRA) | "multi"
-    ],
-    fn_index: 0
-  })
-});
-const result = await response.json();
-// Expected Response:
-{
-  "data": [
-    "data:image/png;base64,iVBORw...",  // Segmented overlay
-    {
-      "success": true,
-      "mode": "automatic_fish_ultra_optimized",
-      "detection_method": "multi_point_grid",
-      "grid_points_used": 9,
-      "mask_percentage": 8.2,  // Nur der Fisch! (nicht 86%)
-      "fish_score": 0.98,
-      "fish_center": [385, 520],
-      "contours": [
-        {"x": 350, "y": 450},
-        {"x": 351, "y": 451},
-        // ... präzise Fisch-Kontur
-      ]
-    }
-  ]
-}
-        ''', language="javascript")
-        gr.Markdown("""
-        ### ⚙️ Parameter Erklärung
-        **mode: "fish"** (ULTRA - EMPFOHLEN für Angelfotos)
-        - Multi-Point Grid (9 Erkennungspunkte)
-        - 15% Coverage Filter
-        - Kleinste Maske = Fisch
-        - ✅ Perfekt für: Angler mit Fisch im Bild
-        **mode: "multi"**
-        - Center-Point Only (alte Methode)
-        - 60% Coverage Filter
-        - ✅ Für allgemeine Objekte
-        **quality:**
-        - `"high"` = Präzise Kanten, Gaussian Blur (~20s)
-        - `"fast"` = Schneller, weniger Nachbearbeitung (~10s)
-        """)
 if __name__ == "__main__":
-    print("🌐 Launching FishBoost SAM2 ULTRA v4.0...")
-    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)

 import numpy as np
 from PIL import Image
 import cv2
+from groundingdino.util.inference import Model as GroundingDINOModel
+from segment_anything import sam_model_registry, SamPredictor
+import supervision as sv
+print("🚀 Starting Grounded SAM FishBoost Edition v5.0...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"📱 Using device: {device}")
# Lazily-initialised model singletons; both stay None until load_models() runs.
grounding_dino_model = None
sam_predictor = None


def load_models():
    """Build the Grounding DINO detector and the SAM predictor on first use.

    Each model is constructed only while its module-level global is still
    ``None``, so calling this function again after a successful load does
    nothing. The results are stored in the ``grounding_dino_model`` and
    ``sam_predictor`` globals; nothing is returned.
    """
    global grounding_dino_model, sam_predictor

    if grounding_dino_model is None:
        print("📦 Loading Grounding DINO model...")
        grounding_dino_model = GroundingDINOModel(
            model_config_path="GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
            model_checkpoint_path="weights/groundingdino_swint_ogc.pth",
            device=device,
        )
        print("✅ Grounding DINO loaded!")

    # Guard clause: the SAM predictor already exists, nothing left to do.
    if sam_predictor is not None:
        return

    print("📦 Loading SAM model...")
    build_sam = sam_model_registry["vit_h"]
    sam = build_sam(checkpoint="weights/sam_vit_h_4b8939.pth")
    sam.to(device=device)
    sam_predictor = SamPredictor(sam)
    print("✅ SAM loaded!")
def detect_fish_with_grounded_sam(image_pil, text_prompt="fish", box_threshold=0.25,
                                  text_threshold=0.25, max_contour_points=100):
    """Detect the most confident `text_prompt` object and segment it with SAM.

    Grounding DINO proposes boxes for the text prompt, the highest-confidence
    box is handed to SAM as a box prompt, and statistics about the resulting
    mask are collected.

    Args:
        image_pil: PIL image in any mode; it is converted to RGB internally.
        text_prompt: Class name passed to Grounding DINO (default: "fish").
        box_threshold: Confidence threshold for boxes.
        text_threshold: Confidence threshold for text matching.
        max_contour_points: Upper bound on the number of outline points put
            into the metadata. Longer outlines are downsampled evenly so the
            whole shape is still covered. ``None`` disables the limit.

    Returns:
        ``(mask, metadata)``. ``mask`` is the first mask returned by SAM for
        the chosen box, or ``None`` when nothing was detected. ``metadata`` is
        a dict; on failure it carries ``"success": False`` and a ``"reason"``.
        Note that ``metadata["image_size"]`` is ``list(mask.shape)``, i.e.
        rows first.
    """
    load_models()

    # Normalise to 3-channel RGB so RGBA / greyscale / palette uploads work.
    image_rgb = np.array(image_pil.convert("RGB"))
    # The GroundingDINO `Model` wrapper follows the OpenCV convention and
    # converts BGR -> RGB itself, so it must be fed BGR; SAM's predictor
    # defaults to RGB and gets the untouched array further below.
    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)

    # 1. Grounding DINO: detect candidate boxes
    print(f"🔍 Detecting '{text_prompt}' with Grounding DINO...")
    detections = grounding_dino_model.predict_with_classes(
        image=image_bgr,
        classes=[text_prompt],
        box_threshold=box_threshold,
        text_threshold=text_threshold
    )
    num_boxes = len(detections.xyxy)
    print(f"📦 Found {num_boxes} boxes")
    if num_boxes == 0:
        print("❌ No fish detected!")
        return None, {
            "success": False,
            "mode": "grounded_sam",
            "detection_method": "grounding_dino",
            "fish_detected": False,
            "reason": "No fish found in image"
        }

    # Select best detection (highest confidence)
    best_idx = int(np.argmax(detections.confidence))
    best_box = detections.xyxy[best_idx]
    best_conf = float(detections.confidence[best_idx])
    print(f"🎯 Best detection: Confidence={best_conf:.2f}, Box={best_box}")

    # 2. SAM: segment inside the chosen box
    print("✂️ Segmenting with SAM...")
    sam_predictor.set_image(image_rgb)
    masks, _scores, _ = sam_predictor.predict(
        box=best_box.reshape(1, 4),
        multimask_output=False
    )
    mask = masks[0]  # single mask because multimask_output=False

    # Mask statistics
    mask_area = int(np.sum(mask))
    total_pixels = mask.shape[0] * mask.shape[1]
    mask_percentage = float(mask_area / total_pixels * 100)

    contours, _ = cv2.findContours(
        mask.astype(np.uint8),
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE
    )

    # Default centre is the middle of the detection box; it is replaced by the
    # contour centroid whenever a non-degenerate contour exists.
    cx = int(best_box[0] + best_box[2]) // 2
    cy = int(best_box[1] + best_box[3]) // 2
    contour_points = []
    if len(contours) > 0:
        # Use the same (largest) contour for both the centre and the outline,
        # so the two pieces of metadata describe the same blob.
        largest_contour = max(contours, key=cv2.contourArea)
        moments = cv2.moments(largest_contour)
        if moments["m00"] != 0:
            cx = int(moments["m10"] / moments["m00"])
            cy = int(moments["m01"] / moments["m00"])

        outline = largest_contour.reshape(-1, 2)
        if max_contour_points is not None and len(outline) > max_contour_points:
            # Evenly spaced subsample: keeps the full closed outline instead
            # of only the first stretch of it.
            keep = np.linspace(0, len(outline), num=max_contour_points,
                               endpoint=False, dtype=int)
            outline = outline[keep]
        contour_points = [{"x": int(px), "y": int(py)} for px, py in outline]

    metadata = {
        "success": True,
        "mode": "grounded_sam",
        "detection_method": "grounding_dino_sam",
        "fish_detected": True,
        "grounding_dino": {
            "confidence": best_conf,
            "bounding_box": [int(x) for x in best_box],
            "text_prompt": text_prompt,
            "total_detections": num_boxes
        },
        "mask_area": mask_area,
        "mask_percentage": mask_percentage,
        "num_contours": len(contours),
        "fish_center": [cx, cy],
        "image_size": list(mask.shape),
        "device": device,
        "contours": contour_points
    }
    print(f"✅ Segmentation complete! Mask: {mask_percentage:.2f}%")
    return mask, metadata
def process_image(image, quality="high"):
    """Run fish detection on an uploaded image and build the Gradio outputs.

    Args:
        image: PIL image or numpy array from the Gradio input, or ``None``.
        quality: ``"high"`` limits the longest side to 1024 px; any other
            value limits it to 768 px.

    Returns:
        ``(result, text)``. ``result`` is an RGB numpy array with the mask
        tinted green and the detection box drawn in blue, or ``None`` when
        there is no input, no detection, or an error. ``text`` is the
        human-readable report or error message.
    """
    if image is None:
        return None, "❌ No image provided"
    try:
        # Convert to PIL if needed
        if isinstance(image, np.ndarray):
            image_pil = Image.fromarray(image)
        else:
            image_pil = image
        # convert() returns a new image, which (a) guarantees the 3-channel
        # layout the overlay below relies on and (b) keeps the in-place
        # thumbnail() from resizing the caller's object.
        image_pil = image_pil.convert("RGB")

        # Resize for faster processing on CPU
        max_size = 1024 if quality == "high" else 768
        image_pil.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        # Detect and segment fish
        mask, metadata = detect_fish_with_grounded_sam(image_pil, text_prompt="fish")
        if mask is None:
            return None, f"❌ No fish detected!\n\n{metadata}"

        # Create visualization: blend a green-filled copy over the original
        image_np = np.array(image_pil)
        overlay = image_np.copy()
        overlay[mask.astype(bool)] = [0, 255, 0]  # Green (RGB)
        result = cv2.addWeighted(image_np, 0.7, overlay, 0.3, 0)

        # The array is RGB, so (0, 0, 255) is blue, matching the
        # "Blue = Box" wording of the output label in the UI.
        box_color = (0, 0, 255)
        detection = metadata["grounding_dino"]
        box = detection["bounding_box"]
        cv2.rectangle(result, (box[0], box[1]), (box[2], box[3]), box_color, 2)

        # Confidence caption above the box; clamp so it stays inside the
        # image when the box touches the top edge.
        conf_text = f"Fish: {detection['confidence']:.2f}"
        text_y = max(box[1] - 10, 20)
        cv2.putText(result, conf_text, (box[0], text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, box_color, 2)

        # Format metadata for display
        meta_str = f"""✅ Fish detected successfully!
🎯 Grounding DINO
   Confidence: {detection['confidence']:.2%}
   Bounding Box: {detection['bounding_box']}
   Detections: {detection['total_detections']}
✂️ SAM Segmentation
   Mask Area: {metadata['mask_percentage']:.2f}%
   Fish Center: {metadata['fish_center']}
   Contours: {metadata['num_contours']}
⚙️ System
   Device: {metadata['device']}
   Image Size: {metadata['image_size']}
"""
        return result, meta_str
    except Exception as e:
        # Top-level UI boundary: report the failure in the textbox instead of
        # crashing the request, and keep the full traceback in the logs.
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
 # Gradio Interface
# Single-page UI: image + quality selector on the left, annotated result and
# textual report on the right. The size figures in the help text reflect the
# checkpoints this app loads (SAM ViT-H plus Grounding DINO Swin-T).
with gr.Blocks(title="🎣 FishBoost - Grounded SAM Edition") as demo:
    gr.Markdown("""
    # 🎣 FishBoost - Grounded SAM Fish Detector
    ### Powered by Grounding DINO + SAM
    Upload an image with a fish and watch the AI detect and segment it!
    ⚠️ **CPU Mode**: First run downloads ~3.2GB of model weights (may take several minutes). Processing: ~30-60 sec per image.
    """)
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="📤 Upload Fish Image")
            quality = gr.Radio(
                choices=["high", "medium"],
                value="high",
                label="🎨 Quality",
                info="High = 1024px, Medium = 768px (faster)"
            )
            process_btn = gr.Button("🚀 Detect Fish", variant="primary")
        with gr.Column():
            output_image = gr.Image(label="🎯 Detected Fish (Green = Mask, Blue = Box)")
            output_meta = gr.Textbox(label="📊 Detection Metadata", lines=15)
    # Wire the button to the processing function defined earlier in the file.
    process_btn.click(
        fn=process_image,
        inputs=[input_image, quality],
        outputs=[output_image, output_meta]
    )
    gr.Markdown("""
    ---
    ### 🔧 How it works
    1. **Grounding DINO** finds fish bounding boxes using text prompt "fish"
    2. **SAM** segments the exact fish shape within the box
    3. **Result**: Precise fish mask ignoring angler/background
    ### 📝 Model Info
    - Grounding DINO: Text-prompted object detection
    - SAM (ViT-H): High-quality segmentation
    - Total Model Size: ~3.2GB (SAM ViT-H ≈ 2.5GB, Grounding DINO Swin-T ≈ 0.7GB)
    """)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)