Spaces:

sumitsingh830
/

SAM2-Image-Auto-Segment

Sleeping

+import numpy as np
+import cv2
+import torch
+import sys
+import os
+# Add sam2 folder to path to import from local sam2 directory
+_current_file_dir = os.path.dirname(os.path.abspath(__file__))
+_project_root = os.path.dirname(_current_file_dir)
+_sam2_repo_dir = os.path.join(_project_root, "sam2")
+# Add sam2 directory to sys.path if not already there
+abs_sam2_dir = os.path.abspath(_sam2_repo_dir)
+if abs_sam2_dir not in sys.path:
+    sys.path.insert(0, abs_sam2_dir)
+from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
+from app.utils import mask_to_polygon
+# Hugging Face model ID for SAM2.1 Hiera Large model
+HUGGINGFACE_MODEL_ID = "facebook/sam2.1-hiera-large"
+device = "cuda" if torch.cuda.is_available() else "cpu"
class SAM2AutoAnnotation:
    """
    SAM2 Auto Annotation wrapper for automatically generating masks for all objects in an image.

    Wraps SAM2AutomaticMaskGenerator loaded from Hugging Face. The generator is
    created lazily on the first call that needs it and is reused afterwards.
    """

    # Settings copied onto the generator when from_pretrained() rejects them
    # as keyword arguments.
    _GENERATOR_SETTINGS = (
        "points_per_side",
        "points_per_batch",
        "pred_iou_thresh",
        "stability_score_thresh",
        "min_mask_region_area",
    )

    def __init__(
        self,
        points_per_side: int = 32,
        points_per_batch: int = 64,
        pred_iou_thresh: float = 0.88,
        stability_score_thresh: float = 0.95,
        min_mask_region_area: int = 100,
    ):
        """
        Initialize SAM2 Auto Annotation.

        No model is loaded here; loading is deferred until masks are requested.

        Args:
            points_per_side: Number of points per side of the image grid
            points_per_batch: Number of points to process in each batch
            pred_iou_thresh: Prediction IoU threshold
            stability_score_thresh: Stability score threshold
            min_mask_region_area: Minimum mask region area in pixels
        """
        self.points_per_side = points_per_side
        self.points_per_batch = points_per_batch
        self.pred_iou_thresh = pred_iou_thresh
        self.stability_score_thresh = stability_score_thresh
        self.min_mask_region_area = min_mask_region_area
        self._mask_generator = None

    def _get_mask_generator(self):
        """
        Return the mask generator, loading it on first use.

        Raises:
            RuntimeError: If the required modules cannot be imported or the
                model cannot be loaded.
        """
        if self._mask_generator is not None:
            return self._mask_generator
        try:
            try:
                # Preferred path: hand every setting to from_pretrained.
                generator = SAM2AutomaticMaskGenerator.from_pretrained(
                    HUGGINGFACE_MODEL_ID,
                    device=device,
                    points_per_side=self.points_per_side,
                    points_per_batch=self.points_per_batch,
                    pred_iou_thresh=self.pred_iou_thresh,
                    stability_score_thresh=self.stability_score_thresh,
                    crop_n_layers=1,
                    crop_n_points_downscale_factor=2,
                    min_mask_region_area=self.min_mask_region_area,
                )
            except TypeError:
                # If parameters are not accepted by from_pretrained, load without them
                generator = SAM2AutomaticMaskGenerator.from_pretrained(
                    HUGGINGFACE_MODEL_ID,
                    device=device,
                )
                # NOTE(review): whether attributes assigned after construction
                # are honoured depends on the installed sam2 version - confirm.
                for attr_name in self._GENERATOR_SETTINGS:
                    if hasattr(generator, attr_name):
                        setattr(generator, attr_name, getattr(self, attr_name))
        except ImportError as e:
            raise RuntimeError(
                f"Failed to import required modules for SAM2. Please ensure 'sam2' and 'huggingface_hub' are installed. "
                f"Error: {str(e)}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Failed to load SAM2 Automatic Mask Generator from Hugging Face ({HUGGINGFACE_MODEL_ID}). "
                f"Please check your internet connection and ensure the model ID is correct. "
                f"Error: {str(e)}"
            ) from e
        # Cache only a fully configured generator, so a failure above is
        # retried on the next call instead of leaving a partial object behind.
        self._mask_generator = generator
        return generator

    @staticmethod
    def _is_blank_region(pixels) -> bool:
        """
        Tell whether grayscale pixels form a dark, nearly uniform region.

        A region counts as blank when its mean intensity is below 30 with
        variance below 100, or its mean is in [30, 50) with variance below 50.
        An empty selection is never blank.
        """
        if len(pixels) == 0:
            return False
        mean_intensity = float(np.mean(pixels))
        if mean_intensity >= 50:
            return False
        variance_limit = 100 if mean_intensity < 30 else 50
        return float(np.var(pixels)) < variance_limit

    def generate_masks(
        self,
        image: np.ndarray,
        min_confidence: float = 0.0,
        min_area: int = None,
        filter_blank_regions: bool = True,
        scale_factors: tuple = (1.0, 1.0),
    ) -> list:
        """
        Generate all masks for objects in the image.

        Args:
            image: Image as numpy array (RGB format, H, W, 3); a 2-D array is
                used directly as the grayscale image for blank-region filtering
            min_confidence: Minimum confidence score to filter masks (default: 0.0)
            min_area: Minimum mask area in pixels (default: uses self.min_mask_region_area)
            filter_blank_regions: Filter out blank/black regions (default: True)
            scale_factors: Tuple (scale_x, scale_y) forwarded to mask_to_polygon,
                which divides polygon coordinates by these factors

        Returns:
            List of mask dictionaries, each containing:
            - polygon: flattened coordinates [x1, y1, x2, y2, ...] (scaled to display size)
            - confidence: stability score, or predicted IoU when no stability score is present
            - area: mask area in pixels as reported by the generator
            Masks that yield no polygon are left out.
        """
        if min_area is None:
            min_area = self.min_mask_region_area

        raw_masks = self._get_mask_generator().generate(image)

        # Grayscale copy used only for the dark/uniform region filter.
        gray_image = None
        if filter_blank_regions:
            gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) if len(image.shape) == 3 else image

        results = []
        for mask_data in raw_masks:
            # Boolean indexing below requires a bool array.
            mask = np.asarray(mask_data["segmentation"], dtype=bool)
            score = float(mask_data.get("stability_score", mask_data.get("predicted_iou", 0.0)))
            area = int(mask_data.get("area", 0))

            if score < min_confidence or area < min_area:
                continue
            if gray_image is not None and self._is_blank_region(gray_image[mask]):
                continue

            polygon = mask_to_polygon(mask.astype(np.uint8) * 255, scale_factors=scale_factors)
            # mask_to_polygon returns [] when no usable contour exists; an
            # empty polygon is of no use to callers, so drop the mask.
            if not polygon:
                continue

            results.append({
                "polygon": polygon,  # Flattened format [x1, y1, x2, y2, ...] (scaled to display size)
                "confidence": score,
                "area": area
            })
        return results
def create_sam2_auto_annotation(
    points_per_side: int = 32,
    points_per_batch: int = 64,
    pred_iou_thresh: float = 0.88,
    stability_score_thresh: float = 0.95,
    min_mask_region_area: int = 100,
) -> SAM2AutoAnnotation:
    """
    Build a SAM2AutoAnnotation configured with the given settings.

    Args:
        points_per_side: Number of points per side of the image grid
        points_per_batch: Number of points to process in each batch
        pred_iou_thresh: Prediction IoU threshold
        stability_score_thresh: Stability score threshold
        min_mask_region_area: Minimum mask region area in pixels

    Returns:
        SAM2AutoAnnotation instance
    """
    # Every argument is forwarded unchanged under the same keyword name.
    settings = {
        "points_per_side": points_per_side,
        "points_per_batch": points_per_batch,
        "pred_iou_thresh": pred_iou_thresh,
        "stability_score_thresh": stability_score_thresh,
        "min_mask_region_area": min_mask_region_area,
    }
    return SAM2AutoAnnotation(**settings)

app/sam_model.py ADDED Viewed

	@@ -0,0 +1,550 @@

+import torch
+import numpy as np
+import cv2
+import psutil
+import os
+import sys
+# Add sam2 folder to path to import from local sam2 directory
+_current_file_dir = os.path.dirname(os.path.abspath(__file__))
+_project_root = os.path.dirname(_current_file_dir)
+_sam2_repo_dir = os.path.join(_project_root, "sam2")
+# Add sam2 directory to sys.path if not already there
+abs_sam2_dir = os.path.abspath(_sam2_repo_dir)
+if abs_sam2_dir not in sys.path:
+    sys.path.insert(0, abs_sam2_dir)
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
+from app.utils import mask_to_polygon
+# Hugging Face model ID for SAM2.1 Hiera Large model
+# Available models: facebook/sam2.1-hiera-tiny, facebook/sam2.1-hiera-small,
+# facebook/sam2.1-hiera-base, facebook/sam2.1-hiera-large
+HUGGINGFACE_MODEL_ID = "facebook/sam2.1-hiera-large"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Initialize SAM2 model (will be loaded on first use)
+predictor = None
+mask_generator = None
def initialize_sam():
    """
    Initialize SAM2 Large model from Hugging Face if not already loaded.

    The predictor is stored in the module-level ``predictor`` variable and
    returned as-is on every later call.

    Returns:
        SAM2ImagePredictor instance

    Raises:
        ImportError: If sam2 or huggingface_hub is not installed
        RuntimeError: If model fails to load from Hugging Face
    """
    global predictor
    if predictor is not None:
        return predictor
    try:
        # Load model directly from Hugging Face Hub
        # This will automatically download the model if not cached locally
        predictor = SAM2ImagePredictor.from_pretrained(
            HUGGINGFACE_MODEL_ID,
            device=device
        )
    except ImportError as e:
        # NOTE(review): the hint below names 'segment-anything'; confirm that
        # is the intended package for the sam2 modules this file imports.
        raise ImportError(
            f"Failed to import required modules. Please ensure 'sam2' and 'huggingface_hub' are installed. "
            f"Install with: pip install segment-anything huggingface_hub. "
            f"Error: {str(e)}"
        ) from e
    except Exception as e:
        error_msg = str(e)
        # Chain the original exception so its traceback stays attached.
        raise RuntimeError(
            f"Failed to load SAM2 model from Hugging Face ({HUGGINGFACE_MODEL_ID}). "
            f"Please check your internet connection and ensure the model ID is correct. "
            f"Error: {error_msg}"
        ) from e
    return predictor
# (points_per_side, points_per_batch) the cached generator was built with.
# Stays None until initialize_mask_generator() has built a generator itself.
_mask_generator_params = None


def initialize_mask_generator(points_per_side=32, points_per_batch=64):
    """
    Initialize SAM2 Automatic Mask Generator from Hugging Face if not already loaded.
    Configured with memory-efficient parameters for CPU usage.

    The generator is cached in the module-level ``mask_generator`` variable.
    It is rebuilt when a later call asks for different grid settings than the
    cached generator was built with, so the arguments are never silently
    ignored. A generator placed in ``mask_generator`` by other code is reused
    unchanged.

    Args:
        points_per_side: Number of points per side of the image grid (default: 32, lower = less memory)
        points_per_batch: Number of points to process in each batch (default: 64, lower = less memory)

    Returns:
        SAM2AutomaticMaskGenerator instance

    Raises:
        ImportError: If sam2 or huggingface_hub is not installed
        RuntimeError: If model fails to load from Hugging Face
    """
    global mask_generator, _mask_generator_params
    requested = (points_per_side, points_per_batch)
    if mask_generator is not None and _mask_generator_params in (None, requested):
        return mask_generator

    # Release any stale generator before loading its replacement so two
    # models are not referenced at the same time.
    mask_generator = None
    try:
        # Try to load with configuration parameters first
        try:
            generator = SAM2AutomaticMaskGenerator.from_pretrained(
                HUGGINGFACE_MODEL_ID,
                device=device,
                points_per_side=points_per_side,
                points_per_batch=points_per_batch,
                pred_iou_thresh=0.88,
                stability_score_thresh=0.95,
                crop_n_layers=1,
                crop_n_points_downscale_factor=2,
                min_mask_region_area=100,
            )
        except TypeError:
            # If parameters are not accepted by from_pretrained, load without them
            # and configure manually if possible
            generator = SAM2AutomaticMaskGenerator.from_pretrained(
                HUGGINGFACE_MODEL_ID,
                device=device
            )
            # NOTE(review): whether attributes assigned after construction are
            # honoured depends on the installed sam2 version - confirm.
            if hasattr(generator, 'points_per_side'):
                generator.points_per_side = points_per_side
            if hasattr(generator, 'points_per_batch'):
                generator.points_per_batch = points_per_batch
    except ImportError as e:
        raise ImportError(
            f"Failed to import required modules. Please ensure 'sam2' and 'huggingface_hub' are installed. "
            f"Install with: pip install segment-anything huggingface_hub. "
            f"Error: {str(e)}"
        ) from e
    except Exception as e:
        error_msg = str(e)
        raise RuntimeError(
            f"Failed to load SAM2 Automatic Mask Generator from Hugging Face ({HUGGINGFACE_MODEL_ID}). "
            f"Please check your internet connection and ensure the model ID is correct. "
            f"Error: {error_msg}"
        ) from e

    mask_generator = generator
    _mask_generator_params = requested
    return mask_generator
def resize_image_if_needed(image_rgb, max_dimension=1024):
    """
    Resize image if it exceeds max_dimension to reduce memory usage.
    Maintains aspect ratio.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        max_dimension: Maximum dimension (width or height) in pixels (default: 1024)

    Returns:
        resized_image: Resized numpy array (the input object itself when no
            resize is needed)
        scale_factor: Tuple (scale_x, scale_y) - original size divided by
            resized size, i.e. how much the image was scaled down

    Raises:
        ValueError: If max_dimension is smaller than 1
    """
    if max_dimension < 1:
        raise ValueError(f"max_dimension must be at least 1 pixel, got {max_dimension}")

    h, w = image_rgb.shape[:2]
    # Nothing to shrink: already small enough, or there are no pixels at all.
    if h == 0 or w == 0 or max(h, w) <= max_dimension:
        return image_rgb, (1.0, 1.0)

    # Pin the longer side to max_dimension and scale the other side with it.
    # The shorter side is clamped to 1 px: for very elongated images the
    # truncated value would be 0, which cv2.resize rejects.
    longest = int(max_dimension)
    if h > w:
        new_h = longest
        new_w = max(1, int(w * (max_dimension / h)))
    else:
        new_w = longest
        new_h = max(1, int(h * (max_dimension / w)))

    resized = cv2.resize(image_rgb, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return resized, (w / new_w, h / new_h)
def calculate_memory_usage():
    """
    Report how much memory the current process is using.

    Returns:
        dict: "rss_mb" and "vms_mb" (resident and virtual size in MB) and
        "percent" (the value of psutil's memory_percent() for this process)
    """
    bytes_per_mb = 1024 * 1024
    current_process = psutil.Process(os.getpid())
    usage = current_process.memory_info()
    resident_mb = usage.rss / bytes_per_mb
    virtual_mb = usage.vms / bytes_per_mb
    return {
        "rss_mb": resident_mb,
        "vms_mb": virtual_mb,
        "percent": current_process.memory_percent(),
    }
def estimate_image_memory(image_rgb):
    """
    Give a rough estimate of the memory needed to process an image.

    The estimate adds three per-pixel costs: the input as 3 float32 channels,
    feature maps assumed to be 256 float32 channels, and 100 one-byte masks.
    Model weights are not included.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format

    Returns:
        dict: Estimated memory usage in MB per component, their total, and
        the image size formatted as "WIDTHxHEIGHT"
    """
    height, width = image_rgb.shape[:2]
    pixel_count = height * width
    bytes_per_mb = 1024 * 1024

    input_mb = (pixel_count * 3 * 4) / bytes_per_mb       # 3 channels, 4 bytes each
    features_mb = (pixel_count * 256 * 4) / bytes_per_mb  # 256 channels, 4 bytes each
    masks_mb = (pixel_count * 100 * 1) / bytes_per_mb     # ~100 masks, 1 byte each

    estimate = {
        "image_mb": input_mb,
        "features_mb": features_mb,
        "masks_mb": masks_mb,
    }
    estimate["total_estimated_mb"] = input_mb + features_mb + masks_mb
    estimate["image_size"] = f"{width}x{height}"
    return estimate
def _resolve_polygon_target_size(image_size, original_w, original_h):
    """Return the (width, height) that polygon coordinates should be expressed in."""
    if image_size is None:
        return float(original_w), float(original_h)
    if isinstance(image_size, dict):
        target_w = float(image_size.get("width", original_w))
        target_h = float(image_size.get("height", original_h))
    else:
        target_w, target_h = float(image_size[0]), float(image_size[1])
    # A non-positive display dimension is unusable; fall back to the original size.
    if target_w <= 0:
        target_w = float(original_w)
    if target_h <= 0:
        target_h = float(original_h)
    return target_w, target_h


def generate_all_masks(
    image_rgb,
    image_size=None,
    min_area=100,
    min_confidence=0.5,
    max_image_dimension=1024,
    points_per_side=32,
    points_per_batch=64,
):
    """
    Generate all possible object masks in an image using SAM2 Automatic Mask Generator.
    Automatically detects and segments all objects without requiring prompts.
    Optimized for CPU usage with image resizing and memory-efficient parameters.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        image_size: Optional display size, as a dict with "width" and "height"
            or a (width, height) sequence. When given, polygon coordinates are
            returned in display coordinates; otherwise in original image
            coordinates.
        min_area: Minimum mask area to filter out small/noisy masks (default: 100);
            compared against the area measured on the processed (possibly resized) image
        min_confidence: Minimum confidence score to filter masks (default: 0.5)
        max_image_dimension: Maximum dimension (width or height) in pixels before resizing (default: 1024)
        points_per_side: Number of points per side of the image grid (default: 32, lower = less memory)
        points_per_batch: Number of points to process in each batch (default: 64, lower = less memory)

    Returns:
        dict: Contains:
            - masks: List of dicts sorted by area (largest first), each containing:
                - polygon: flattened coordinates array [x1, y1, x2, y2, ...]
                - confidence: float confidence score
                - area: int mask area in pixels
            - memory_info: Memory usage information
            - was_resized: Whether the image was resized
            - original_size: Original image dimensions (width, height)
            - processed_size: Processed image dimensions (width, height)
            - resize_scale: (scale_x, scale_y) returned by resize_image_if_needed
    """
    # Get memory before processing
    memory_before = calculate_memory_usage()

    original_h, original_w = image_rgb.shape[:2]
    original_size = (original_w, original_h)

    # Resize image if needed to reduce memory usage
    processed_image, resize_scale = resize_image_if_needed(image_rgb, max_dimension=max_image_dimension)
    was_resized = resize_scale[0] != 1.0 or resize_scale[1] != 1.0
    processed_h, processed_w = processed_image.shape[:2]
    processed_size = (processed_w, processed_h)

    memory_estimate = estimate_image_memory(processed_image)
    generator = initialize_mask_generator(points_per_side=points_per_side, points_per_batch=points_per_batch)

    # Masks live in processed-image pixels. mask_to_polygon divides by the
    # factors it receives, so the factor that maps processed -> target is
    # processed / target. This covers both the resize step and any display
    # scaling in a single ratio.
    target_w, target_h = _resolve_polygon_target_size(image_size, original_w, original_h)
    polygon_scale = (
        processed_w / target_w if target_w > 0 else 1.0,
        processed_h / target_h if target_h > 0 else 1.0,
    )

    # Generate all masks automatically
    masks = generator.generate(processed_image)

    # Get memory after processing
    memory_after = calculate_memory_usage()

    result_masks = []
    for mask_data in masks:
        mask = mask_data["segmentation"]  # Boolean mask
        confidence = float(mask_data.get("stability_score", mask_data.get("predicted_iou", 0.0)))
        area = int(mask_data.get("area", 0))

        # Filter masks by area and confidence
        if area < min_area or confidence < min_confidence:
            continue

        mask_uint8 = mask.astype(np.uint8) * 255
        polygon = mask_to_polygon(mask_uint8, polygon_scale)
        if polygon and len(polygon) >= 6:  # At least 3 points (x, y pairs)
            result_masks.append({
                "polygon": polygon,
                "confidence": confidence,
                "area": area
            })

    # Sort by area (largest first) for better usability
    result_masks.sort(key=lambda x: x["area"], reverse=True)

    return {
        "masks": result_masks,
        "memory_info": {
            "before_mb": memory_before["rss_mb"],
            "after_mb": memory_after["rss_mb"],
            # Only sampled after generation, so this equals after_mb.
            "peak_mb": memory_after["rss_mb"],
            "estimated_mb": memory_estimate["total_estimated_mb"],
            "memory_used_mb": memory_after["rss_mb"] - memory_before["rss_mb"]
        },
        "was_resized": was_resized,
        "original_size": original_size,
        "processed_size": processed_size,
        "resize_scale": resize_scale
    }
def _display_to_image_scale(image_rgb, image_size):
    """Return (scale_x, scale_y) that maps display coordinates onto image pixels."""
    if image_size is None:
        return 1.0, 1.0
    image_h, image_w = image_rgb.shape[:2]
    if isinstance(image_size, dict):
        display_w = float(image_size.get("width", image_w))
        display_h = float(image_size.get("height", image_h))
    else:
        display_w, display_h = float(image_size[0]), float(image_size[1])
    scale_x = image_w / display_w if display_w > 0 else 1.0
    scale_y = image_h / display_h if display_h > 0 else 1.0
    return scale_x, scale_y


def _finalize_predicted_mask(best_mask):
    """Convert a predicted mask to uint8 (0/255), close small gaps and fill enclosed holes."""
    mask = (best_mask * 255).astype("uint8")
    closed = cv2.morphologyEx(
        mask,
        cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5)),
    )
    # Flood-fill the background from outside the image. The one-pixel zero
    # border guarantees the seed is background and connects every background
    # region that touches an image edge. Seeding at (0, 0) of the unpadded
    # mask would mark the entire background as "hole" whenever the object
    # covers that corner.
    padded = np.pad(closed, 1, mode="constant", constant_values=0)
    cv2.floodFill(padded, None, (0, 0), 255)
    # Pixels the fill never reached are background fully enclosed by the mask.
    closed[padded[1:-1, 1:-1] == 0] = 255
    return closed


def predict_polygon(image_rgb, bbox, image_size=None):
    """
    Predict polygon mask using SAM2 with bbox as prompt (CVAT-style).
    Bbox is used to identify the object, not constrain it.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        bbox: dict with keys "x", "y", "width", "height" OR list [x, y, w, h],
            expressed in display coordinates when image_size is given
        image_size: Optional display size (dict with "width" and "height", or
            a (width, height) sequence) for coordinate scaling

    Returns:
        mask: uint8 mask (0/255) - full object shape, NOT clipped to bbox
        confidence: float confidence score of the selected mask
        scale_factors: tuple (scale_x, scale_y), image size divided by display size
    """
    sam = initialize_sam()
    sam.set_image(image_rgb)

    # Handle both dict and list formats for bbox
    if isinstance(bbox, dict):
        x = float(bbox["x"])
        y = float(bbox["y"])
        bbox_w = float(bbox["width"])
        bbox_h = float(bbox["height"])
    else:  # list format [x, y, w, h]
        x, y, bbox_w, bbox_h = [float(v) for v in bbox]

    # The bbox arrives in display coordinates; move it into image pixels.
    scale_x, scale_y = _display_to_image_scale(image_rgb, image_size)
    if image_size is not None:
        x *= scale_x
        y *= scale_y
        bbox_w *= scale_x
        bbox_h *= scale_y

    # Convert to [x1, y1, x2, y2] format for SAM2
    box = np.array([x, y, x + bbox_w, y + bbox_h], dtype=np.float32)

    # Foreground prompts: the bbox centre plus the four quarter points.
    point_coords = np.array([
        [x + bbox_w / 2.0, y + bbox_h / 2.0],    # Center
        [x + bbox_w * 0.25, y + bbox_h * 0.25],  # Top-left quarter
        [x + bbox_w * 0.75, y + bbox_h * 0.25],  # Top-right quarter
        [x + bbox_w * 0.25, y + bbox_h * 0.75],  # Bottom-left quarter
        [x + bbox_w * 0.75, y + bbox_h * 0.75],  # Bottom-right quarter
    ], dtype=np.float32)
    point_labels = np.array([1, 1, 1, 1, 1], dtype=np.int32)  # All foreground points

    masks, scores, _ = sam.predict(
        box=box,
        point_coords=point_coords,
        point_labels=point_labels,
        multimask_output=True  # Get multiple masks to choose the best fit
    )

    # Pick the mask with the best blend of confidence (60%) and bbox coverage (40%).
    bbox_area = bbox_w * bbox_h
    left = max(0, int(x))
    top = max(0, int(y))
    right_raw = int(x + bbox_w)
    bottom_raw = int(y + bbox_h)
    best_mask_idx = 0
    best_score_combined = 0.0
    for idx, (mask, score) in enumerate(zip(masks, scores)):
        # Clamp to the mask; keeping the end >= 0 stops a bbox that lies
        # left of / above the image from wrapping around via negative slicing.
        right = max(0, min(mask.shape[1], right_raw))
        bottom = max(0, min(mask.shape[0], bottom_raw))
        region = mask[top:bottom, left:right].astype(np.uint8)
        mask_area_in_bbox = np.sum(region > 0)
        coverage_ratio = mask_area_in_bbox / bbox_area if bbox_area > 0 else 0
        score_combined = float(score) * 0.6 + coverage_ratio * 0.4
        if score_combined > best_score_combined:
            best_score_combined = score_combined
            best_mask_idx = idx

    mask = _finalize_predicted_mask(masks[best_mask_idx])

    # Safely extract confidence score (handle numpy array/scalar)
    confidence = float(np.asarray(scores[best_mask_idx]).flatten()[0])
    return mask, confidence, (scale_x, scale_y)


def predict_polygon_from_point(image_rgb, point, image_size=None):
    """
    Predict polygon mask using SAM2 with a point click as prompt.
    The point identifies the object to segment.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        point: dict with keys "x", "y" OR list [x, y] - the clicked point
            coordinate, in display coordinates when image_size is given
        image_size: Optional display size (dict with "width" and "height", or
            a (width, height) sequence) for coordinate scaling

    Returns:
        mask: uint8 mask (0/255) - full object shape
        confidence: float confidence score
        scale_factors: tuple (scale_x, scale_y) for coordinate scaling
    """
    sam = initialize_sam()
    sam.set_image(image_rgb)

    # Handle both dict and list formats for point
    if isinstance(point, dict):
        point_x = float(point["x"])
        point_y = float(point["y"])
    else:  # list format [x, y]
        point_x, point_y = [float(v) for v in point]

    # The click arrives in display coordinates; move it into image pixels.
    scale_x, scale_y = _display_to_image_scale(image_rgb, image_size)
    if image_size is not None:
        point_x *= scale_x
        point_y *= scale_y

    # point_coords shape: (1, 2) - single point
    point_coords = np.array([[point_x, point_y]], dtype=np.float32)
    point_labels = np.array([1], dtype=np.int32)  # 1 = foreground point

    masks, scores, _ = sam.predict(
        point_coords=point_coords,
        point_labels=point_labels,
        multimask_output=True  # Get multiple masks to choose the best fit
    )

    # Select the best mask based on confidence score
    best_mask_idx = np.argmax(scores)
    mask = _finalize_predicted_mask(masks[best_mask_idx])

    # Safely extract confidence score (handle numpy array/scalar)
    confidence = float(np.asarray(scores[best_mask_idx]).flatten()[0])
    return mask, confidence, (scale_x, scale_y)

app/utils.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import cv2
+import numpy as np
+import requests
+from skimage import measure
def load_image_from_url(url: str):
    """
    Load image from URL and return as BGR numpy array.

    Args:
        url: Image URL string

    Returns:
        BGR image as numpy array

    Raises:
        ValueError: If image cannot be decoded (including an empty response body)
        requests.RequestException: If URL request fails
    """
    # NOTE(review): the URL is fetched as given; if it can come from untrusted
    # users, the caller should restrict which hosts are reachable.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    encoded = np.frombuffer(response.content, np.uint8)
    # cv2.imdecode raises cv2.error on an empty buffer rather than returning
    # None, so treat an empty body as undecodable up front.
    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR) if encoded.size else None
    if img is None:
        raise ValueError(f"Failed to decode image from URL: {url}")
    return img
def mask_to_polygon(mask, scale_factors=(1.0, 1.0)):
    """
    Convert binary mask to polygon coordinates (CVAT-style).
    Uses cv2.findContours and cv2.approxPolyDP, keeping only the largest
    external contour.

    Args:
        mask: Binary mask (numpy array, uint8, 0 or 255)
        scale_factors: Tuple (scale_x, scale_y); polygon coordinates are
            divided by these factors to go FROM original TO display size

    Returns:
        List of coordinates in CVAT format: [x1, y1, x2, y2, x3, y3, ...],
        or an empty list when the mask yields no polygon with at least 3 points

    Raises:
        ValueError: If a scale factor is not positive
    """
    scale_x, scale_y = scale_factors
    # Coordinates are divided by the factors, so zero or negative values
    # would produce inf/nan or mirrored points.
    if not (scale_x > 0 and scale_y > 0):
        raise ValueError(f"scale_factors must be positive, got {scale_factors!r}")

    mask = np.asarray(mask)
    # Nothing to trace: no pixels at all, or no foreground pixels.
    if mask.size == 0 or not mask.any():
        return []

    # Convert mask to binary format for cv2.findContours
    if mask.dtype != np.uint8:
        mask = mask.astype(np.uint8)
    # Ensure binary mask (0 or 255)
    if mask.max() > 1:
        mask = (mask > 127).astype(np.uint8) * 255

    # Small morphological closing to connect nearby regions
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=1)

    # findContours returns (contours, hierarchy) in OpenCV 4 and
    # (image, contours, hierarchy) in OpenCV 3; [-2] is the contour list in both.
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if not contours:
        return []

    # Get the largest contour by area (most accurate for object shape)
    largest_contour = max(contours, key=cv2.contourArea)

    # Adaptive epsilon: 0.1% of the perimeter, but never below 1 pixel.
    epsilon = max(1.0, cv2.arcLength(largest_contour, True) * 0.001)
    approx_contour = cv2.approxPolyDP(largest_contour, epsilon=epsilon, closed=True)
    if approx_contour.shape[0] < 3:
        return []

    polygon = approx_contour.reshape(-1, 2).astype(float)
    # Scale coordinates FROM original image size TO display size (inverse of bbox scaling)
    if scale_x != 1.0 or scale_y != 1.0:
        polygon[:, 0] = polygon[:, 0] / scale_x  # x coordinates: original -> display
        polygon[:, 1] = polygon[:, 1] / scale_y  # y coordinates: original -> display

    # Flatten to CVAT format: [x1, y1, x2, y2, ...]
    return polygon.flatten().tolist()