sumitsingh830 committed on
Commit
2aeb7a5
·
verified ·
1 Parent(s): 39d0c75

Upload 8 files

Browse files
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # App package
app/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (145 Bytes). View file
 
app/__pycache__/sam2_detection_function.cpython-313.pyc ADDED
Binary file (7.93 kB). View file
 
app/__pycache__/sam_model.cpython-313.pyc ADDED
Binary file (18.9 kB). View file
 
app/__pycache__/utils.cpython-313.pyc ADDED
Binary file (3.51 kB). View file
 
app/sam2_detection_function.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+ import torch
4
+ import sys
5
+ import os
6
+
7
+ # Add sam2 folder to path to import from local sam2 directory
8
+ _current_file_dir = os.path.dirname(os.path.abspath(__file__))
9
+ _project_root = os.path.dirname(_current_file_dir)
10
+ _sam2_repo_dir = os.path.join(_project_root, "sam2")
11
+ # Add sam2 directory to sys.path if not already there
12
+ abs_sam2_dir = os.path.abspath(_sam2_repo_dir)
13
+ if abs_sam2_dir not in sys.path:
14
+ sys.path.insert(0, abs_sam2_dir)
15
+
16
+ from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
17
+
18
+ from app.utils import mask_to_polygon
19
+
20
+ # Hugging Face model ID for SAM2.1 Hiera Large model
21
+ HUGGINGFACE_MODEL_ID = "facebook/sam2.1-hiera-large"
22
+ device = "cuda" if torch.cuda.is_available() else "cpu"
23
+
24
+
25
class SAM2AutoAnnotation:
    """
    SAM2 Auto Annotation wrapper for automatically generating masks for all objects in an image.

    The underlying SAM2AutomaticMaskGenerator is loaded lazily from Hugging Face
    (HUGGINGFACE_MODEL_ID) on the first call that needs it and is then cached on
    the instance.
    """

    # Settings copied onto the generator when from_pretrained rejects keyword configuration.
    _CONFIG_ATTRS = (
        'points_per_side',
        'points_per_batch',
        'pred_iou_thresh',
        'stability_score_thresh',
        'min_mask_region_area',
    )

    def __init__(
        self,
        points_per_side: int = 32,
        points_per_batch: int = 64,
        pred_iou_thresh: float = 0.88,
        stability_score_thresh: float = 0.95,
        min_mask_region_area: int = 100,
    ):
        """
        Initialize SAM2 Auto Annotation.

        No model is loaded here; loading is deferred to _get_mask_generator().

        Args:
            points_per_side: Number of points per side of the image grid
            points_per_batch: Number of points to process in each batch
            pred_iou_thresh: Prediction IoU threshold
            stability_score_thresh: Stability score threshold
            min_mask_region_area: Minimum mask region area in pixels
        """
        self.points_per_side = points_per_side
        self.points_per_batch = points_per_batch
        self.pred_iou_thresh = pred_iou_thresh
        self.stability_score_thresh = stability_score_thresh
        self.min_mask_region_area = min_mask_region_area
        self._mask_generator = None

    def _get_mask_generator(self):
        """
        Return the cached mask generator, loading it on first use.

        Raises:
            RuntimeError: If the SAM2 modules cannot be imported or the model
                cannot be loaded. The original exception is chained as the cause.
        """
        if self._mask_generator is None:
            try:
                # Try to load with configuration parameters first
                try:
                    self._mask_generator = SAM2AutomaticMaskGenerator.from_pretrained(
                        HUGGINGFACE_MODEL_ID,
                        device=device,
                        points_per_side=self.points_per_side,
                        points_per_batch=self.points_per_batch,
                        pred_iou_thresh=self.pred_iou_thresh,
                        stability_score_thresh=self.stability_score_thresh,
                        crop_n_layers=1,
                        crop_n_points_downscale_factor=2,
                        min_mask_region_area=self.min_mask_region_area,
                    )
                except TypeError:
                    # If parameters are not accepted by from_pretrained, load without them
                    self._mask_generator = SAM2AutomaticMaskGenerator.from_pretrained(
                        HUGGINGFACE_MODEL_ID,
                        device=device
                    )
                    # Best effort: only attributes the generator already exposes are overwritten.
                    for attr_name in self._CONFIG_ATTRS:
                        if hasattr(self._mask_generator, attr_name):
                            setattr(self._mask_generator, attr_name, getattr(self, attr_name))
            except ImportError as e:
                raise RuntimeError(
                    f"Failed to import required modules for SAM2. Please ensure 'sam2' and 'huggingface_hub' are installed. "
                    f"Error: {str(e)}"
                ) from e
            except Exception as e:
                raise RuntimeError(
                    f"Failed to load SAM2 Automatic Mask Generator from Hugging Face ({HUGGINGFACE_MODEL_ID}). "
                    f"Please check your internet connection and ensure the model ID is correct. "
                    f"Error: {str(e)}"
                ) from e
        return self._mask_generator

    @staticmethod
    def _is_blank_region(pixels) -> bool:
        """
        Decide whether the grayscale pixels under a mask look like a blank/black area.

        A region is blank when it is dark and nearly uniform:
        mean < 30 with variance < 100, or mean < 50 with variance < 50.
        An empty selection is never considered blank.
        """
        if len(pixels) == 0:
            return False
        mean_intensity = float(np.mean(pixels))
        if mean_intensity >= 50:
            return False
        variance_limit = 100 if mean_intensity < 30 else 50
        return float(np.var(pixels)) < variance_limit

    def generate_masks(
        self,
        image: np.ndarray,
        min_confidence: float = 0.0,
        min_area: int = None,
        filter_blank_regions: bool = True,
        scale_factors: tuple = (1.0, 1.0),
    ) -> list:
        """
        Generate all masks for objects in the image.

        Args:
            image: Image as numpy array (RGB format, H, W, 3)
            min_confidence: Minimum confidence score to filter masks (default: 0.0)
            min_area: Minimum mask area in pixels (default: uses self.min_mask_region_area)
            filter_blank_regions: Filter out blank/black regions (default: True)
            scale_factors: Tuple (scale_x, scale_y) forwarded unchanged to mask_to_polygon,
                           which divides polygon coordinates by these factors
                           (matching predict_polygon_from_point logic)

        Returns:
            List of mask dictionaries, each containing:
                - polygon: flattened coordinates [x1, y1, x2, y2, ...] (scaled to display size)
                - confidence: confidence score
                - area: mask area in pixels
            Masks whose polygon has fewer than 3 points are omitted.
        """
        if min_area is None:
            min_area = self.min_mask_region_area

        # Get mask generator
        mask_generator = self._get_mask_generator()

        # Generate all masks automatically
        masks = mask_generator.generate(image)

        # Grayscale copy is only needed for the blank-region check
        gray_image = None
        if filter_blank_regions:
            if len(image.shape) == 3:
                gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
            else:
                gray_image = image

        # Process masks and convert to polygons
        results = []
        for mask_data in masks:
            mask = mask_data["segmentation"]  # Boolean mask
            # stability_score is preferred; predicted_iou is the fallback confidence
            score = float(mask_data.get("stability_score", mask_data.get("predicted_iou", 0.0)))
            area = int(mask_data.get("area", 0))

            # Filter by confidence threshold and minimum area
            if score < min_confidence or area < min_area:
                continue

            # Filter blank/black regions if enabled
            if filter_blank_regions and self._is_blank_region(gray_image[mask]):
                continue

            # Convert boolean mask to uint8 format
            mask_uint8 = (mask.astype(np.uint8) * 255)

            # mask_to_polygon divides the coordinates by scale_factors
            polygon = mask_to_polygon(mask_uint8, scale_factors=scale_factors)

            # mask_to_polygon returns [] when no usable contour exists; skip such
            # masks (same rule as generate_all_masks: at least 3 points).
            if not polygon or len(polygon) < 6:
                continue

            results.append({
                "polygon": polygon,  # Flattened format [x1, y1, x2, y2, ...] (scaled to display size)
                "confidence": score,
                "area": area
            })

        return results
183
+
184
+
185
def create_sam2_auto_annotation(
    points_per_side: int = 32,
    points_per_batch: int = 64,
    pred_iou_thresh: float = 0.88,
    stability_score_thresh: float = 0.95,
    min_mask_region_area: int = 100,
) -> SAM2AutoAnnotation:
    """
    Build a SAM2AutoAnnotation configured with the given generator settings.

    Args:
        points_per_side: Number of points per side of the image grid
        points_per_batch: Number of points to process in each batch
        pred_iou_thresh: Prediction IoU threshold
        stability_score_thresh: Stability score threshold
        min_mask_region_area: Minimum mask region area in pixels

    Returns:
        SAM2AutoAnnotation instance
    """
    # Collect the settings once, then forward them by keyword.
    settings = {
        "points_per_side": points_per_side,
        "points_per_batch": points_per_batch,
        "pred_iou_thresh": pred_iou_thresh,
        "stability_score_thresh": stability_score_thresh,
        "min_mask_region_area": min_mask_region_area,
    }
    return SAM2AutoAnnotation(**settings)
212
+
app/sam_model.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import cv2
4
+ import psutil
5
+ import os
6
+ import sys
7
+
8
+ # Add sam2 folder to path to import from local sam2 directory
9
+ _current_file_dir = os.path.dirname(os.path.abspath(__file__))
10
+ _project_root = os.path.dirname(_current_file_dir)
11
+ _sam2_repo_dir = os.path.join(_project_root, "sam2")
12
+ # Add sam2 directory to sys.path if not already there
13
+ abs_sam2_dir = os.path.abspath(_sam2_repo_dir)
14
+ if abs_sam2_dir not in sys.path:
15
+ sys.path.insert(0, abs_sam2_dir)
16
+
17
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
18
+ from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
19
+ from app.utils import mask_to_polygon
20
+
21
+ # Hugging Face model ID for SAM2.1 Hiera Large model
22
+ # Available models: facebook/sam2.1-hiera-tiny, facebook/sam2.1-hiera-small,
23
+ # facebook/sam2.1-hiera-base, facebook/sam2.1-hiera-large
24
+ HUGGINGFACE_MODEL_ID = "facebook/sam2.1-hiera-large"
25
+
26
+ device = "cuda" if torch.cuda.is_available() else "cpu"
27
+
28
+ # Initialize SAM2 model (will be loaded on first use)
29
+ predictor = None
30
+ mask_generator = None
31
+
32
+
33
def initialize_sam():
    """
    Initialize SAM2 Large model from Hugging Face if not already loaded.

    The predictor is stored in the module-level ``predictor`` variable, so the
    model is loaded at most once per process.

    Returns:
        SAM2ImagePredictor instance

    Raises:
        ImportError: If sam2 or huggingface_hub is not installed
        RuntimeError: If model fails to load from Hugging Face
        (in both cases the original exception is chained as the cause)
    """
    global predictor
    if predictor is None:
        try:
            # Load model directly from Hugging Face Hub
            # This will automatically download the model if not cached locally
            predictor = SAM2ImagePredictor.from_pretrained(
                HUGGINGFACE_MODEL_ID,
                device=device
            )
        except ImportError as e:
            # The previous hint named 'segment-anything', which is the SAM 1 package.
            raise ImportError(
                f"Failed to import required modules. Please ensure 'sam2' and 'huggingface_hub' are installed. "
                f"Install SAM 2 from https://github.com/facebookresearch/sam2 and run: pip install huggingface_hub. "
                f"Error: {str(e)}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Failed to load SAM2 model from Hugging Face ({HUGGINGFACE_MODEL_ID}). "
                f"Please check your internet connection and ensure the model ID is correct. "
                f"Error: {str(e)}"
            ) from e
    return predictor
67
+
68
+
69
def initialize_mask_generator(points_per_side=32, points_per_batch=64):
    """
    Initialize SAM2 Automatic Mask Generator from Hugging Face if not already loaded.
    Configured with memory-efficient parameters for CPU usage.

    The generator is cached in the module-level ``mask_generator`` variable.
    The cache is reused only while the requested grid settings match the ones
    it was built with; a call with different settings rebuilds the generator
    instead of silently ignoring the new values.

    Args:
        points_per_side: Number of points per side of the image grid (default: 32, lower = less memory)
        points_per_batch: Number of points to process in each batch (default: 64, lower = less memory)

    Returns:
        SAM2AutomaticMaskGenerator instance

    Raises:
        ImportError: If sam2 or huggingface_hub is not installed
        RuntimeError: If model fails to load from Hugging Face
        (in both cases the original exception is chained as the cause)
    """
    global mask_generator
    requested = (points_per_side, points_per_batch)
    # A generator installed without a recorded config (e.g. set externally) is
    # treated as matching, which preserves the previous reuse behaviour.
    loaded = getattr(initialize_mask_generator, "_loaded_config", requested)
    if mask_generator is not None and loaded == requested:
        return mask_generator

    # Drop the stale generator before loading so two models are not held at once.
    mask_generator = None
    try:
        # Try to load with configuration parameters first
        try:
            generator = SAM2AutomaticMaskGenerator.from_pretrained(
                HUGGINGFACE_MODEL_ID,
                device=device,
                points_per_side=points_per_side,
                points_per_batch=points_per_batch,
                pred_iou_thresh=0.88,
                stability_score_thresh=0.95,
                crop_n_layers=1,
                crop_n_points_downscale_factor=2,
                min_mask_region_area=100,
            )
        except TypeError:
            # If parameters are not accepted by from_pretrained, load without them
            # and configure manually if possible
            generator = SAM2AutomaticMaskGenerator.from_pretrained(
                HUGGINGFACE_MODEL_ID,
                device=device
            )
            # Try to set parameters if the generator supports it
            if hasattr(generator, 'points_per_side'):
                generator.points_per_side = points_per_side
            if hasattr(generator, 'points_per_batch'):
                generator.points_per_batch = points_per_batch
    except ImportError as e:
        # The previous hint named 'segment-anything', which is the SAM 1 package.
        raise ImportError(
            f"Failed to import required modules. Please ensure 'sam2' and 'huggingface_hub' are installed. "
            f"Install SAM 2 from https://github.com/facebookresearch/sam2 and run: pip install huggingface_hub. "
            f"Error: {str(e)}"
        ) from e
    except Exception as e:
        raise RuntimeError(
            f"Failed to load SAM2 Automatic Mask Generator from Hugging Face ({HUGGINGFACE_MODEL_ID}). "
            f"Please check your internet connection and ensure the model ID is correct. "
            f"Error: {str(e)}"
        ) from e

    mask_generator = generator
    initialize_mask_generator._loaded_config = requested
    return mask_generator
127
+
128
+
129
def resize_image_if_needed(image_rgb, max_dimension=1024):
    """
    Resize image if it exceeds max_dimension to reduce memory usage.
    Maintains aspect ratio.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        max_dimension: Maximum dimension (width or height) in pixels (default: 1024)

    Returns:
        resized_image: Resized numpy array (the input object itself when no resize is needed)
        scale_factor: Tuple (scale_x, scale_y) = (original_w / new_w, original_h / new_h),
            i.e. how much the image was scaled down; (1.0, 1.0) when unchanged

    Raises:
        ValueError: If a resize is required but max_dimension is smaller than 1
    """
    h, w = image_rgb.shape[:2]

    if max(h, w) <= max_dimension:
        return image_rgb, (1.0, 1.0)

    if max_dimension < 1:
        raise ValueError(f"max_dimension must be at least 1, got {max_dimension}")

    # Calculate new dimensions maintaining aspect ratio.
    # The short side is clamped to 1 px: for extreme aspect ratios int() would
    # truncate it to 0 and cv2.resize would fail.
    if h > w:
        new_h = max_dimension
        new_w = max(1, int(w * (max_dimension / h)))
    else:
        new_w = max_dimension
        new_h = max(1, int(h * (max_dimension / w)))

    # Resize image
    resized = cv2.resize(image_rgb, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    return resized, (w / new_w, h / new_h)
163
+
164
+
165
def calculate_memory_usage():
    """
    Report how much memory the current process is using.

    Returns:
        dict: "rss_mb" and "vms_mb" (memory_info values converted to MB) and
        "percent" (the process's memory_percent() value)
    """
    bytes_per_mb = 1024 * 1024
    current = psutil.Process(os.getpid())
    usage = current.memory_info()

    report = {}
    report["rss_mb"] = usage.rss / bytes_per_mb  # Resident Set Size in MB
    report["vms_mb"] = usage.vms / bytes_per_mb  # Virtual Memory Size in MB
    report["percent"] = current.memory_percent()  # Percentage of system memory
    return report
180
+
181
+
182
def estimate_image_memory(image_rgb):
    """
    Give a rough estimate of the memory needed to process an image.

    The estimate is a per-pixel byte budget and does not include the model
    weights (~2-4 GB, loaded once).

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format

    Returns:
        dict: Estimated memory usage in MB, plus "image_size" as "WxH"
    """
    height, width = image_rgb.shape[:2]
    pixel_count = height * width
    mb = 1024 * 1024

    # Byte budgets per pixel:
    #   input image  -> 3 channels * 4 bytes (float32)
    #   feature maps -> ~256 channels * 4 bytes (typical SAM2 feature size)
    #   masks        -> ~100 masks * 1 byte
    estimate = {
        "image_mb": (pixel_count * 3 * 4) / mb,
        "features_mb": (pixel_count * 256 * 4) / mb,
        "masks_mb": (pixel_count * 100 * 1) / mb,
    }
    estimate["total_estimated_mb"] = (
        estimate["image_mb"] + estimate["features_mb"] + estimate["masks_mb"]
    )
    estimate["image_size"] = f"{width}x{height}"
    return estimate
213
+
214
+
215
def generate_all_masks(image_rgb, image_size=None, min_area=100, min_confidence=0.5, max_image_dimension=1024, points_per_side=32, points_per_batch=64):
    """
    Generate all possible object masks in an image using SAM2 Automatic Mask Generator.
    Automatically detects and segments all objects without requiring prompts.
    Optimized for CPU usage with image resizing and memory-efficient parameters.

    Polygon coordinates are returned in display space: the size given by
    ``image_size`` when provided, otherwise the original image size.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        image_size: Optional dict with "width" and "height" (or a (width, height)
            sequence) giving the display size for coordinate scaling
        min_area: Minimum mask area to filter out small/noisy masks (default: 100)
        min_confidence: Minimum confidence score to filter masks (default: 0.5)
        max_image_dimension: Maximum dimension (width or height) in pixels before resizing (default: 1024)
        points_per_side: Number of points per side of the image grid (default: 32, lower = less memory)
        points_per_batch: Number of points to process in each batch (default: 64, lower = less memory)

    Returns:
        dict: Contains:
            - masks: List of dicts (largest area first), each containing:
                - polygon: flattened coordinates array [x1, y1, x2, y2, ...]
                - confidence: float confidence score
                - area: int mask area in pixels (measured on the processed image)
            - memory_info: Memory usage information
            - was_resized: Whether the image was resized
            - original_size: Original image dimensions (width, height)
            - processed_size: Processed image dimensions (width, height)
            - resize_scale: (scale_x, scale_y) returned by resize_image_if_needed
    """
    # Get memory before processing
    memory_before = calculate_memory_usage()

    # Store original dimensions
    original_h, original_w = image_rgb.shape[:2]
    original_size = (original_w, original_h)

    # Resize image if needed to reduce memory usage
    processed_image, resize_scale = resize_image_if_needed(image_rgb, max_dimension=max_image_dimension)
    was_resized = resize_scale[0] != 1.0 or resize_scale[1] != 1.0
    processed_h, processed_w = processed_image.shape[:2]
    processed_size = (processed_w, processed_h)

    # Estimate memory requirements
    memory_estimate = estimate_image_memory(processed_image)

    # Initialize generator with memory-efficient parameters
    generator = initialize_mask_generator(points_per_side=points_per_side, points_per_batch=points_per_batch)

    # Resolve the display size; without image_size the display is the original image.
    if image_size is None:
        display_w, display_h = float(original_w), float(original_h)
    elif isinstance(image_size, dict):
        display_w = float(image_size.get("width", original_w))
        display_h = float(image_size.get("height", original_h))
    else:
        display_w, display_h = float(image_size[0]), float(image_size[1])

    # A non-positive display dimension is unusable; fall back to the original size.
    if display_w <= 0:
        display_w = float(original_w)
    if display_h <= 0:
        display_h = float(original_h)

    # Scale factors FROM display TO processed image. Masks live in processed
    # coordinates and mask_to_polygon DIVIDES by these factors, which maps the
    # polygon FROM processed TO display. (Previously the factors were combined
    # with resize_scale and inverted, which produced original/display scaling
    # and wrong coordinates whenever image_size was supplied.)
    scale_x = processed_w / display_w if display_w > 0 else 1.0
    scale_y = processed_h / display_h if display_h > 0 else 1.0
    polygon_scale = (scale_x or 1.0, scale_y or 1.0)  # never divide by zero

    # Generate all masks automatically
    masks = generator.generate(processed_image)

    # Get memory after processing
    memory_after = calculate_memory_usage()

    # Process each mask and convert to polygon format
    result_masks = []

    for mask_data in masks:
        # Extract mask information
        mask = mask_data["segmentation"]  # Boolean mask
        # stability_score is preferred; predicted_iou is the fallback confidence
        confidence = float(mask_data.get("stability_score", mask_data.get("predicted_iou", 0.0)))
        area = int(mask_data.get("area", 0))

        # Filter masks by area and confidence
        if area < min_area or confidence < min_confidence:
            continue

        # Convert boolean mask to uint8 format for polygon conversion
        mask_uint8 = (mask.astype(np.uint8) * 255)

        polygon = mask_to_polygon(mask_uint8, polygon_scale)

        if polygon and len(polygon) >= 6:  # At least 3 points (x, y pairs)
            result_masks.append({
                "polygon": polygon,
                "confidence": confidence,
                "area": area
            })

    # Sort by area (largest first) for better usability
    result_masks.sort(key=lambda x: x["area"], reverse=True)

    return {
        "masks": result_masks,
        "memory_info": {
            "before_mb": memory_before["rss_mb"],
            "after_mb": memory_after["rss_mb"],
            # NOTE: this is the RSS sampled after generation, not a true peak measurement.
            "peak_mb": memory_after["rss_mb"],
            "estimated_mb": memory_estimate["total_estimated_mb"],
            "memory_used_mb": memory_after["rss_mb"] - memory_before["rss_mb"]
        },
        "was_resized": was_resized,
        "original_size": original_size,
        "processed_size": processed_size,
        "resize_scale": resize_scale
    }
330
+
331
+
332
def predict_polygon(image_rgb, bbox, image_size=None):
    """
    Predict polygon mask using SAM2 with bbox as prompt (CVAT-style).
    Bbox is used to identify the object, not constrain it.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        bbox: dict with keys "x", "y", "width", "height" OR list [x, y, w, h],
            expressed in display coordinates when image_size is given
        image_size: Optional dict with "width" and "height" (or a (width, height)
            sequence) for coordinate scaling

    Returns:
        mask: uint8 mask (0 or 255) - full object shape, NOT clipped to bbox
        confidence: float confidence score of the selected mask
        scale_factors: tuple (scale_x, scale_y) = original size / display size,
            (1.0, 1.0) when image_size is not given
    """
    predictor = initialize_sam()
    predictor.set_image(image_rgb)

    # Handle both dict and list formats for bbox
    if isinstance(bbox, dict):
        x = float(bbox["x"])
        y = float(bbox["y"])
        bbox_w = float(bbox["width"])
        bbox_h = float(bbox["height"])
    else:  # list format [x, y, w, h]
        x, y, bbox_w, bbox_h = [float(v) for v in bbox]

    # Scale bbox coordinates if image_size is provided (CVAT-style)
    # image_size represents the display size (like CVAT UI), bbox is relative to display size
    # We need to scale bbox FROM display size TO original image size for prediction
    scale_x, scale_y = 1.0, 1.0
    original_h, original_w = image_rgb.shape[:2]

    if image_size is not None:
        if isinstance(image_size, dict):
            display_w = float(image_size.get("width", original_w))
            display_h = float(image_size.get("height", original_h))
        else:
            display_w, display_h = float(image_size[0]), float(image_size[1])

        # Calculate scale factors: how much to scale FROM display TO original
        scale_x = original_w / display_w if display_w > 0 else 1.0
        scale_y = original_h / display_h if display_h > 0 else 1.0

        # Scale bbox coordinates FROM display size TO original image size
        x = x * scale_x
        y = y * scale_y
        bbox_w = bbox_w * scale_x
        bbox_h = bbox_h * scale_y

    # Convert to [x1, y1, x2, y2] format for SAM2
    box = np.array([x, y, x + bbox_w, y + bbox_h], dtype=np.float32)

    # Use multiple point prompts (CVAT-style) for better object identification:
    # the bbox center plus the four quarter points, all marked as foreground.
    point_coords = np.array([
        [x + bbox_w / 2.0, y + bbox_h / 2.0],    # Center
        [x + bbox_w * 0.25, y + bbox_h * 0.25],  # Top-left quarter
        [x + bbox_w * 0.75, y + bbox_h * 0.25],  # Top-right quarter
        [x + bbox_w * 0.25, y + bbox_h * 0.75],  # Bottom-left quarter
        [x + bbox_w * 0.75, y + bbox_h * 0.75],  # Bottom-right quarter
    ], dtype=np.float32)
    point_labels = np.array([1, 1, 1, 1, 1], dtype=np.int32)  # All foreground points

    # Get multiple masks and select the best one (like CVAT)
    masks, scores, _ = predictor.predict(
        box=box,
        point_coords=point_coords,
        point_labels=point_labels,
        multimask_output=True  # Get multiple masks to choose the best fit
    )

    # Select the best mask using multiple criteria (CVAT-style)
    # Consider both confidence score AND coverage of bbox area
    best_mask_idx = 0
    best_score_combined = 0.0
    bbox_area = bbox_w * bbox_h

    for idx, (mask, score) in enumerate(zip(masks, scores)):
        # Integer bbox bounds clamped to this mask's extent
        x1_int = max(0, int(x))
        y1_int = max(0, int(y))
        x2_int = min(mask.shape[1], int(x + bbox_w))
        y2_int = min(mask.shape[0], int(y + bbox_h))

        # Number of mask pixels that fall inside the bbox
        mask_bbox_region = mask[y1_int:y2_int, x1_int:x2_int].astype(np.uint8)
        mask_area_in_bbox = np.count_nonzero(mask_bbox_region)

        # Calculate coverage ratio (how much of bbox is covered by mask)
        coverage_ratio = mask_area_in_bbox / bbox_area if bbox_area > 0 else 0

        # Combined score: confidence (60%) + coverage (40%)
        # Higher coverage ensures we capture the full object
        score_combined = float(score) * 0.6 + coverage_ratio * 0.4

        if score_combined > best_score_combined:
            best_score_combined = score_combined
            best_mask_idx = idx

    best_mask = masks[best_mask_idx]
    best_score = scores[best_mask_idx]

    # Convert the selected mask to uint8 0/255 (assumes 0/1 or boolean values — TODO confirm)
    mask = (best_mask * 255).astype("uint8")

    # Fill small gaps in the mask (CVAT-style post-processing)
    mask_closed = cv2.morphologyEx(mask, cv2.MORPH_CLOSE,
                                   cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5)))

    # Fill holes using flood fill. The mask is padded with a 1-px background
    # border first so the (0, 0) seed is always background; seeding the raw mask
    # fails when the object covers the top-left pixel (the whole background
    # would then be turned into mask).
    padded = cv2.copyMakeBorder(mask_closed, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
    cv2.floodFill(padded, None, (0, 0), 255)
    # Pixels the flood fill could not reach are enclosed holes.
    holes = cv2.bitwise_not(np.ascontiguousarray(padded[1:-1, 1:-1]))
    mask = cv2.bitwise_or(mask_closed, holes)

    # Safely extract confidence score (handle numpy array/scalar)
    confidence = float(np.asarray(best_score).flatten()[0])

    return mask, confidence, (scale_x, scale_y)
463
+
464
+
465
def predict_polygon_from_point(image_rgb, point, image_size=None):
    """
    Predict polygon mask using SAM2 with a point click as prompt.
    The point identifies the object to segment.

    Args:
        image_rgb: numpy array (H, W, 3) in RGB format
        point: dict with keys "x", "y" OR list [x, y] - the clicked point coordinate,
            expressed in display coordinates when image_size is given
        image_size: Optional dict with "width" and "height" (or a (width, height)
            sequence) for coordinate scaling

    Returns:
        mask: uint8 mask (0 or 255) - full object shape
        confidence: float confidence score of the selected mask
        scale_factors: tuple (scale_x, scale_y) = original size / display size,
            (1.0, 1.0) when image_size is not given
    """
    predictor = initialize_sam()
    predictor.set_image(image_rgb)

    # Handle both dict and list formats for point
    if isinstance(point, dict):
        point_x = float(point["x"])
        point_y = float(point["y"])
    else:  # list format [x, y]
        point_x, point_y = [float(v) for v in point]

    # Scale point coordinates if image_size is provided (CVAT-style)
    # image_size represents the display size (like CVAT UI), point is relative to display size
    # We need to scale point FROM display size TO original image size for prediction
    scale_x, scale_y = 1.0, 1.0
    original_h, original_w = image_rgb.shape[:2]

    if image_size is not None:
        if isinstance(image_size, dict):
            display_w = float(image_size.get("width", original_w))
            display_h = float(image_size.get("height", original_h))
        else:
            display_w, display_h = float(image_size[0]), float(image_size[1])

        # Calculate scale factors: how much to scale FROM display TO original
        scale_x = original_w / display_w if display_w > 0 else 1.0
        scale_y = original_h / display_h if display_h > 0 else 1.0

        # Scale point coordinates FROM display size TO original image size
        point_x = point_x * scale_x
        point_y = point_y * scale_y

    # Prepare point coordinates for SAM2
    # point_coords shape: (1, 2) - single point
    point_coords = np.array([[point_x, point_y]], dtype=np.float32)
    point_labels = np.array([1], dtype=np.int32)  # 1 = foreground point

    # Get multiple masks and select the best one
    masks, scores, _ = predictor.predict(
        point_coords=point_coords,
        point_labels=point_labels,
        multimask_output=True  # Get multiple masks to choose the best fit
    )

    # Select the best mask based on confidence score
    best_mask_idx = np.argmax(scores)
    best_mask = masks[best_mask_idx]
    best_score = scores[best_mask_idx]

    # Convert the selected mask to uint8 0/255 (assumes 0/1 or boolean values — TODO confirm)
    mask = (best_mask * 255).astype("uint8")

    # Fill small gaps in the mask (CVAT-style post-processing)
    mask_closed = cv2.morphologyEx(mask, cv2.MORPH_CLOSE,
                                   cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5)))

    # Fill holes using flood fill. The mask is padded with a 1-px background
    # border first so the (0, 0) seed is always background; seeding the raw mask
    # fails when the object covers the top-left pixel (the whole background
    # would then be turned into mask).
    padded = cv2.copyMakeBorder(mask_closed, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
    cv2.floodFill(padded, None, (0, 0), 255)
    # Pixels the flood fill could not reach are enclosed holes.
    holes = cv2.bitwise_not(np.ascontiguousarray(padded[1:-1, 1:-1]))
    mask = cv2.bitwise_or(mask_closed, holes)

    # Safely extract confidence score (handle numpy array/scalar)
    confidence = float(np.asarray(best_score).flatten()[0])

    return mask, confidence, (scale_x, scale_y)
app/utils.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import requests
4
+ from skimage import measure
5
+
6
+
7
def load_image_from_url(url: str):
    """
    Load image from URL and return as BGR numpy array.

    Args:
        url: Image URL string

    Returns:
        BGR image as numpy array

    Raises:
        ValueError: If the response body is empty or the image cannot be decoded
        requests.RequestException: If URL request fails (including HTTP error
            statuses and the 10 second timeout)
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # An empty body cannot be an image; reject it here so the caller gets the
    # documented ValueError instead of whatever cv2.imdecode does with an
    # empty buffer.
    if not response.content:
        raise ValueError(f"Failed to decode image from URL: {url}")

    img = cv2.imdecode(
        np.frombuffer(response.content, np.uint8),
        cv2.IMREAD_COLOR
    )
    if img is None:
        raise ValueError(f"Failed to decode image from URL: {url}")
    return img
30
+
31
+
32
def mask_to_polygon(mask, scale_factors=(1.0, 1.0)):
    """
    Convert binary mask to polygon coordinates (CVAT-style).
    Uses cv2.findContours and cv2.approxPolyDP like CVAT does.
    Only the largest external contour is converted.

    Args:
        mask: Binary mask (numpy array, uint8, 0 or 255)
        scale_factors: Tuple (scale_x, scale_y). Polygon coordinates are DIVIDED
            by these factors, so pass mask-size / display-size ratios (the
            factors returned by predict_polygon / predict_polygon_from_point)
            to get display coordinates.

    Returns:
        List of coordinates in CVAT format: [x1, y1, x2, y2, x3, y3, ...];
        an empty list when the mask is empty or no polygon with at least
        3 points can be extracted.

    Raises:
        ValueError: If a scale factor is zero
    """
    scale_x, scale_y = scale_factors
    if scale_x == 0 or scale_y == 0:
        # Dividing by zero would silently produce inf coordinates.
        raise ValueError(f"scale_factors must be non-zero, got {scale_factors!r}")

    # A zero-size mask has no contours (and .max() below would raise on it)
    if mask.size == 0:
        return []

    # Convert mask to binary format for cv2.findContours
    if mask.dtype != np.uint8:
        mask = mask.astype(np.uint8)

    # Ensure binary mask (0 or 255); masks already in 0/1 form are left as they are
    if mask.max() > 1:
        mask = (mask > 127).astype(np.uint8) * 255

    # Nothing set: closing an all-zero mask yields no contours, so skip the cv2 work
    if not mask.any():
        return []

    # Additional smoothing to ensure complete coverage (CVAT-style)
    # Small morphological closing to connect nearby regions
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=1)

    # Find contours (CVAT-style)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return []

    # Get the largest contour by area (most accurate for object shape)
    largest_contour = max(contours, key=cv2.contourArea)

    # Approximate polygon with an adaptive epsilon: 0.1% of the contour
    # perimeter, but never below 1.0 px
    epsilon = max(1.0, cv2.arcLength(largest_contour, True) * 0.001)
    approx_contour = cv2.approxPolyDP(largest_contour, epsilon=epsilon, closed=True)

    if approx_contour.shape[0] < 3:
        return []

    # One (x, y) row per vertex, as floats so scaling keeps fractions
    polygon = approx_contour.reshape(-1, 2).astype(float)

    # Scale coordinates FROM mask (image) space TO display space by dividing
    # (inverse of the display -> image scaling applied to the prompts)
    if scale_x != 1.0 or scale_y != 1.0:
        polygon[:, 0] = polygon[:, 0] / scale_x  # x coordinates: image -> display
        polygon[:, 1] = polygon[:, 1] / scale_y  # y coordinates: image -> display

    # Flatten to CVAT format: [x1, y1, x2, y2, ...]
    return polygon.flatten().tolist()