Spaces: chris-propeller/sam3-test

Paused

App Files Community

chris-propeller committed on Nov 27, 2025

Commit

acd640e

1 Parent(s): b597179

combine points/boxes/text

Browse files

Files changed (3) hide show

.gitignore +2 -0
app-bak.py +0 -342
app.py +115 -101

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ venv
2	+ __pycache__

app-bak.py DELETED Viewed

@@ -1,342 +0,0 @@
-import spaces
-import gradio as gr
-import numpy as np
-from PIL import Image
-import base64
-import io
-from typing import Dict, Any
-import warnings
-warnings.filterwarnings("ignore")
-@spaces.GPU
-def sam3_inference(image, text_prompt, confidence_threshold=0.5):
-    """
-    Standalone GPU function with model initialization for Spaces Stateless GPU
-    All CUDA operations and related imports must happen inside this decorated function
-    """
-    try:
-        # Import torch and transformers inside GPU function to avoid main process CUDA init
-        import torch
-        from transformers import Sam3Model, Sam3Processor
-        # Initialize model and processor inside GPU function (required for Stateless GPU)
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        model = Sam3Model.from_pretrained(
-            "facebook/sam3",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-        ).to(device)
-        processor = Sam3Processor.from_pretrained("facebook/sam3")
-        print(f"Model loaded on device: {device}")
-        # Handle base64 input (for API)
-        if isinstance(image, str):
-            if image.startswith('data:image'):
-                image = image.split(',')[1]
-            image_bytes = base64.b64decode(image)
-            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        # Process with SAM3
-        inputs = processor(
-            images=image,
-            text=text_prompt.strip(),
-            return_tensors="pt"
-        ).to(device)
-        # Convert dtype to match model
-        for key in inputs:
-            if inputs[key].dtype == torch.float32:
-                inputs[key] = inputs[key].to(model.dtype)
-        with torch.no_grad():
-            outputs = model(**inputs)
-        # Use proper SAM3 post-processing
-        results = processor.post_process_instance_segmentation(
-            outputs,
-            threshold=confidence_threshold,
-            mask_threshold=0.5,
-            target_sizes=inputs.get("original_sizes").tolist()
-        )[0]
-        return results
-    except Exception as e:
-        raise Exception(f"SAM3 inference error: {str(e)}")
-class SAM3Handler:
-    """SAM3 handler for both UI and API access"""
-    def __init__(self):
-        print("SAM3 handler initialized (models will be loaded lazily)")
-    def predict(self, image, text_prompt, confidence_threshold=0.5):
-        """
-        Main prediction function for both UI and API
-        Args:
-            image: PIL Image or base64 string
-            text_prompt: String describing what to segment
-            confidence_threshold: Minimum confidence for masks
-        Returns:
-            Dict with masks, scores, and metadata
-        """
-        try:
-            # Call the standalone GPU function
-            results = sam3_inference(image, text_prompt, confidence_threshold)
-            # Prepare response
-            response = {
-                "masks": [],
-                "scores": [],
-                "prompt_type": "text",
-                "prompt_value": text_prompt,
-                "num_masks": len(results["masks"])
-            }
-            # Process each mask
-            for i in range(len(results["masks"])):
-                mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
-                score = results["scores"][i].item()
-                if score >= confidence_threshold:
-                    # Convert mask to base64 for API response
-                    mask_image = Image.fromarray(mask_np, mode='L')
-                    buffer = io.BytesIO()
-                    mask_image.save(buffer, format='PNG')
-                    mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
-                    response["masks"].append(mask_b64)
-                    response["scores"].append(score)
-            return response
-        except Exception as e:
-            return {"error": str(e)}
-# Initialize the handler
-handler = SAM3Handler()
-def gradio_interface(image, text_prompt, confidence_threshold):
-    """Gradio interface wrapper"""
-    result = handler.predict(image, text_prompt, confidence_threshold)
-    if "error" in result:
-        return f"Error: {result['error']}", None
-    # For UI, show the first mask as an example
-    if result["masks"]:
-        first_mask_b64 = result["masks"][0]
-        first_score = result["scores"][0]
-        # Decode first mask for display
-        mask_bytes = base64.b64decode(first_mask_b64)
-        mask_image = Image.open(io.BytesIO(mask_bytes))
-        info = f"Found {result['num_masks']} masks. First mask score: {first_score:.3f}"
-        return info, mask_image
-    else:
-        return "No masks found above confidence threshold", None
-def api_predict(data: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    API function matching SAM2 inference endpoint format
-    Expected input format (matching SAM2 + SAM3 extensions):
-    {
-        "inputs": {
-            "image": "base64_encoded_image_string",
-            # SAM3 NEW: Text-based prompts
-            "text_prompts": ["person", "car"],  # List of text descriptions
-            # SAM2 compatible: Point-based prompts
-            "points": [[[x1, y1]], [[x2, y2]]],  # Points for each object
-            "labels": [[1], [1]],  # Labels for each point (1=foreground, 0=background)
-            # SAM2 compatible: Bounding box prompts
-            "boxes": [[x1, y1, x2, y2], [x1, y1, x2, y2]],  # Bounding boxes
-            "multimask_output": false,  # Optional, defaults to False
-            "confidence_threshold": 0.5  # Optional, minimum confidence for returned masks
-        }
-    }
-    Returns (matching SAM2 format):
-    {
-        "masks": [base64_encoded_mask_1, base64_encoded_mask_2, ...],
-        "scores": [score1, score2, ...],
-        "num_objects": int,
-        "sam_version": "3.0",
-        "success": true
-    }
-    """
-    try:
-        inputs_data = data.get("inputs", {})
-        # Extract inputs
-        image_b64 = inputs_data.get("image")
-        text_prompts = inputs_data.get("text_prompts", [])
-        input_points = inputs_data.get("points", [])
-        input_labels = inputs_data.get("labels", [])
-        input_boxes = inputs_data.get("boxes", [])
-        multimask_output = inputs_data.get("multimask_output", False)
-        confidence_threshold = inputs_data.get("confidence_threshold", 0.5)
-        # Validate inputs
-        if not image_b64:
-            return {"error": "No image provided", "success": False}
-        has_text = bool(text_prompts)
-        has_points = bool(input_points and input_labels)
-        has_boxes = bool(input_boxes)
-        if not (has_text or has_points or has_boxes):
-            return {"error": "Must provide at least one prompt type: text_prompts, points+labels, or boxes", "success": False}
-        if has_points and len(input_points) != len(input_labels):
-            return {"error": "Number of points and labels must match", "success": False}
-        # Decode image
-        if image_b64.startswith('data:image'):
-            image_b64 = image_b64.split(',')[1]
-        image_bytes = base64.b64decode(image_b64)
-        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        all_masks = []
-        all_scores = []
-        # Process text prompts (SAM3 feature)
-        if has_text:
-            for text_prompt in text_prompts:
-                result = handler.predict(image, text_prompt, confidence_threshold)
-                if "error" not in result:
-                    all_masks.extend(result["masks"])
-                    all_scores.extend(result["scores"])
-        # Process visual prompts (SAM2 compatibility) - Basic implementation
-        # Note: This is a simplified version. Full SAM2 compatibility would require
-        # implementing the visual prompt processing in the handler
-        if has_boxes or has_points:
-            # For now, fall back to a generic prompt if no text provided
-            if not has_text:
-                result = handler.predict(image, "object", confidence_threshold)
-                if "error" not in result and result["masks"]:
-                    # Take only the number of masks requested
-                    num_requested = len(input_boxes) if has_boxes else len(input_points)
-                    all_masks.extend(result["masks"][:num_requested])
-                    all_scores.extend(result["scores"][:num_requested])
-        # Build SAM2-compatible response
-        return {
-            "masks": all_masks,
-            "scores": all_scores,
-            "num_objects": len(all_masks),
-            "sam_version": "3.0",
-            "success": True
-        }
-    except Exception as e:
-        return {"error": str(e), "success": False, "sam_version": "3.0"}
-# Create Gradio interface
-with gr.Blocks(title="SAM3 Inference API") as demo:
-    gr.HTML("<h1>SAM3 Promptable Concept Segmentation</h1>")
-    gr.HTML("<p>This Space provides both a UI and API for SAM3 inference. Use the interface below or call the API programmatically.</p>")
-    with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(type="pil", label="Input Image")
-            text_input = gr.Textbox(label="Text Prompt", placeholder="Enter what you want to segment (e.g., 'cat', 'person', 'car')")
-            confidence_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.5, step=0.1, label="Confidence Threshold")
-            predict_btn = gr.Button("Segment", variant="primary")
-        with gr.Column():
-            info_output = gr.Textbox(label="Results Info")
-            mask_output = gr.Image(label="Sample Mask")
-    # API endpoint - this creates /api/predict/
-    predict_btn.click(
-        gradio_interface,
-        inputs=[image_input, text_input, confidence_slider],
-        outputs=[info_output, mask_output],
-        api_name="predict"  # This creates the API endpoint
-    )
-    # SAM2-compatible API endpoint - this creates /api/sam2_compatible/
-    gr.Interface(
-        fn=api_predict,
-        inputs=gr.JSON(label="SAM2/SAM3 Compatible Input"),
-        outputs=gr.JSON(label="SAM2/SAM3 Compatible Output"),
-        title="SAM2/SAM3 Compatible API",
-        description="API endpoint that matches SAM2 inference endpoint format with SAM3 extensions",
-        api_name="sam2_compatible"
-    )
-    # Add API documentation
-    gr.HTML("""
-    <h2>API Usage</h2>
-    <h3>1. Simple Text API (Gradio format)</h3>
-    <pre>
-import requests
-import base64
-# Encode your image to base64
-with open("image.jpg", "rb") as f:
-    image_b64 = base64.b64encode(f.read()).decode()
-# Make API request
-response = requests.post(
-    "https://your-username-sam3-api.hf.space/api/predict",
-    json={
-        "data": [image_b64, "kitten", 0.5]
-    }
-)
-result = response.json()
-    </pre>
-    <h3>2. SAM2/SAM3 Compatible API (Inference Endpoint format)</h3>
-    <pre>
-import requests
-import base64
-# Encode your image to base64
-with open("image.jpg", "rb") as f:
-    image_b64 = base64.b64encode(f.read()).decode()
-# SAM3 Text Prompts (NEW)
-response = requests.post(
-    "https://your-username-sam3-api.hf.space/api/sam2_compatible",
-    json={
-        "data": [{
-            "inputs": {
-                "image": image_b64,
-                "text_prompts": ["kitten", "toy"],
-                "confidence_threshold": 0.5
-            }
-        }]
-    }
-)
-# SAM2 Compatible (Points/Boxes)
-response = requests.post(
-    "https://your-username-sam3-api.hf.space/api/sam2_compatible",
-    json={
-        "data": [{
-            "inputs": {
-                "image": image_b64,
-                "boxes": [[100, 100, 200, 200]],
-                "confidence_threshold": 0.5
-            }
-        }]
-    }
-)
-result = response.json()
-    </pre>
-    """)
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

app.py CHANGED Viewed

@@ -2,10 +2,10 @@ import spaces
 import gradio as gr
 @spaces.GPU
-def sam3_inference(image, text_prompt=None, boxes=None, box_labels=None, confidence_threshold=0.5):
     """
     Core SAM3 inference function for Stateless GPU environment
-    Supports both text prompts and box prompts
     Returns raw results for both UI and API use
     """
     # Import everything inside the GPU function
@@ -18,12 +18,15 @@ def sam3_inference(image, text_prompt=None, boxes=None, box_labels=None, confide
     try:
         # Validate inputs
-        if not text_prompt and not boxes:
-            raise ValueError("Either text_prompt or boxes must be provided")
         if boxes and not box_labels:
             raise ValueError("box_labels must be provided when boxes are specified")
         # Handle base64 input if needed
         if isinstance(image, str):
             if image.startswith('data:image'):
@@ -59,14 +62,36 @@ def sam3_inference(image, text_prompt=None, boxes=None, box_labels=None, confide
             for i, box in enumerate(boxes):
                 if len(box) == 4:  # [x1, y1, x2, y2]
                     formatted_boxes.append(box)
-                    # Use corresponding label or default to positive (1)
-                    label = box_labels[i] if i < len(box_labels) else 1
-                    formatted_labels.append(label)
             if formatted_boxes:
                 processor_kwargs["input_boxes"] = [formatted_boxes]
                 processor_kwargs["input_boxes_labels"] = [formatted_labels]
         # Process input
         inputs = processor(**processor_kwargs).to(device)
@@ -248,96 +273,67 @@ def sam2_compatible_api(data):
         all_polygons = []
         prompt_types = []
-        # Process text prompts (SAM3 feature)
         if has_text:
             prompt_types.append("text")
-            for text_prompt in text_prompts:
-                results = sam3_inference(image, text_prompt=text_prompt, confidence_threshold=confidence_threshold)
-                if results and len(results["masks"]) > 0:
-                    for i in range(len(results["masks"])):
-                        mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
-                        score = results["scores"][i].item()
-                        if score >= confidence_threshold:
-                            # Convert mask to base64
-                            mask_image = Image.fromarray(mask_np, mode='L')
-                            buffer = io.BytesIO()
-                            mask_image.save(buffer, format='PNG')
-                            mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
-                            all_masks.append(mask_b64)
-                            all_scores.append(score)
-                            # Extract polygons if vectorize is enabled
-                            if vectorize:
-                                binary_mask = (mask_np > 0).astype(np.uint8)
-                                polygons = _mask_to_polygons_original_size(binary_mask, simplify_epsilon)
-                                all_polygons.append(polygons)
-        # Process visual prompts (SAM2 compatibility) - Now properly implemented
-        if has_boxes:
             prompt_types.append("visual")
             # Create box labels (default to positive boxes if not provided)
-            box_labels = inputs_data.get("box_labels", [1] * len(input_boxes))
-            # Process boxes using SAM3's native box support
-            results = sam3_inference(
-                image=image,
-                text_prompt=None,
-                boxes=input_boxes,
-                box_labels=box_labels,
-                confidence_threshold=confidence_threshold
-            )
-            if results and len(results["masks"]) > 0:
-                for i in range(len(results["masks"])):
-                    mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
-                    score = results["scores"][i].item()
-                    if score >= confidence_threshold:
-                        # Convert mask to base64
-                        mask_image = Image.fromarray(mask_np, mode='L')
-                        buffer = io.BytesIO()
-                        mask_image.save(buffer, format='PNG')
-                        mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
-                        all_masks.append(mask_b64)
-                        all_scores.append(score)
-                        # Extract polygons if vectorize is enabled
-                        if vectorize:
-                            binary_mask = (mask_np > 0).astype(np.uint8)
-                            polygons = _mask_to_polygons_original_size(binary_mask, simplify_epsilon)
-                            all_polygons.append(polygons)
-        # Process point prompts (SAM2 compatibility) - Fallback implementation
-        elif has_points and not has_text:
-            prompt_types.append("visual")
-            # For point prompts, use a generic prompt to get masks (SAM3 doesn't natively support points)
-            # This is a fallback - true SAM2 compatibility would require point prompt support
-            results = sam3_inference(image, text_prompt="object", confidence_threshold=confidence_threshold)
-            if results and len(results["masks"]) > 0:
-                # Take only the number of masks requested
-                num_requested = len(input_points)
-                for i in range(min(num_requested, len(results["masks"]))):
-                    mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
-                    score = results["scores"][i].item()
-                    if score >= confidence_threshold:
-                        # Convert mask to base64
-                        mask_image = Image.fromarray(mask_np, mode='L')
-                        buffer = io.BytesIO()
-                        mask_image.save(buffer, format='PNG')
-                        mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
-                        all_masks.append(mask_b64)
-                        all_scores.append(score)
-                        # Extract polygons if vectorize is enabled
-                        if vectorize:
-                            binary_mask = (mask_np > 0).astype(np.uint8)
-                            polygons = _mask_to_polygons_original_size(binary_mask, simplify_epsilon)
-                            all_polygons.append(polygons)
         # Build SAM2-compatible response
         response = {
@@ -451,7 +447,7 @@ import base64
 with open("image.jpg", "rb") as f:
     image_b64 = base64.b64encode(f.read()).decode()
-# SAM3 Text Prompts (NEW)
 response = requests.post(
     "https://your-username-sam3-api.hf.space/api/sam2_compatible",
     json={
@@ -463,13 +459,30 @@ response = requests.post(
     }
 )
-# SAM2 Compatible (Points/Boxes)
 response = requests.post(
     "https://your-username-sam3-api.hf.space/api/sam2_compatible",
     json={
         "inputs": {
             "image": image_b64,
             "boxes": [[100, 100, 200, 200]],
             "confidence_threshold": 0.5
         }
     }
@@ -499,15 +512,16 @@ result = response.json()
   "inputs": {
     "image": "base64_encoded_image_string",
-    // SAM3 NEW: Text-based prompts
     "text_prompts": ["person", "car"],  // List of text descriptions
-    // SAM2 COMPATIBLE: Point-based prompts
-    "points": [[[x1, y1]], [[x2, y2]]],  // Points for each object
-    "labels": [[1], [1]],  // Labels for each point (1=foreground, 0=background)
-    // SAM2 COMPATIBLE: Bounding box prompts
-    "boxes": [[x1, y1, x2, y2], [x1, y1, x2, y2]],  // Bounding boxes
     "multimask_output": false,  // Optional, defaults to False
     "confidence_threshold": 0.5,  // Optional, minimum confidence for returned masks

 import gradio as gr
 @spaces.GPU
+def sam3_inference(image, text_prompt=None, boxes=None, box_labels=None, points=None, point_labels=None, confidence_threshold=0.5):
     """
     Core SAM3 inference function for Stateless GPU environment
+    Supports text prompts, box prompts, and point prompts (individually or combined)
     Returns raw results for both UI and API use
     """
     # Import everything inside the GPU function
     try:
         # Validate inputs
+        if not text_prompt and not boxes and not points:
+            raise ValueError("At least one of text_prompt, boxes, or points must be provided")
         if boxes and not box_labels:
             raise ValueError("box_labels must be provided when boxes are specified")
+        if points and not point_labels:
+            raise ValueError("point_labels must be provided when points are specified")
         # Handle base64 input if needed
         if isinstance(image, str):
             if image.startswith('data:image'):
             for i, box in enumerate(boxes):
                 if len(box) == 4:  # [x1, y1, x2, y2]
                     formatted_boxes.append(box)
+                    # Use the provided label (supports both positive=1 and negative=0)
+                    if i < len(box_labels):
+                        formatted_labels.append(box_labels[i])
+                    else:
+                        raise ValueError(f"Missing label for box {i}")
             if formatted_boxes:
                 processor_kwargs["input_boxes"] = [formatted_boxes]
                 processor_kwargs["input_boxes_labels"] = [formatted_labels]
+        # Add point prompts if provided
+        if points and point_labels:
+            # Convert points to expected format: [[[x1, y1], [x2, y2]], ...]
+            # SAM3 expects points as nested lists for batch processing
+            formatted_points = []
+            formatted_point_labels = []
+            for i, point in enumerate(points):
+                if len(point) == 2:  # [x, y]
+                    formatted_points.append(point)
+                    # Use the provided label (supports both positive=1 and negative=0)
+                    if i < len(point_labels):
+                        formatted_point_labels.append(point_labels[i])
+                    else:
+                        raise ValueError(f"Missing label for point {i}")
+            if formatted_points:
+                processor_kwargs["input_points"] = [formatted_points]
+                processor_kwargs["input_points_labels"] = [formatted_point_labels]
         # Process input
         inputs = processor(**processor_kwargs).to(device)
         all_polygons = []
         prompt_types = []
+        # Determine what prompt types are being used
         if has_text:
             prompt_types.append("text")
+        if has_points or has_boxes:
             prompt_types.append("visual")
+        # Prepare inputs for combined SAM3 inference call
+        combined_text_prompt = None
+        combined_boxes = None
+        combined_box_labels = None
+        combined_points = None
+        combined_point_labels = None
+        # Handle text prompts - combine multiple text prompts into one
+        if has_text:
+            # For multiple text prompts, join them (SAM3 can handle combined descriptions)
+            combined_text_prompt = ", ".join(text_prompts)
+        # Handle box prompts
+        if has_boxes:
+            combined_boxes = input_boxes
             # Create box labels (default to positive boxes if not provided)
+            combined_box_labels = inputs_data.get("box_labels", [1] * len(input_boxes))
+        # Handle point prompts
+        if has_points:
+            combined_points = input_points
+            combined_point_labels = input_labels
+        # Make single combined inference call with all prompt types
+        results = sam3_inference(
+            image=image,
+            text_prompt=combined_text_prompt,
+            boxes=combined_boxes,
+            box_labels=combined_box_labels,
+            points=combined_points,
+            point_labels=combined_point_labels,
+            confidence_threshold=confidence_threshold
+        )
+        # Process results
+        if results and len(results["masks"]) > 0:
+            for i in range(len(results["masks"])):
+                mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
+                score = results["scores"][i].item()
+                if score >= confidence_threshold:
+                    # Convert mask to base64
+                    mask_image = Image.fromarray(mask_np, mode='L')
+                    buffer = io.BytesIO()
+                    mask_image.save(buffer, format='PNG')
+                    mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+                    all_masks.append(mask_b64)
+                    all_scores.append(score)
+                    # Extract polygons if vectorize is enabled
+                    if vectorize:
+                        binary_mask = (mask_np > 0).astype(np.uint8)
+                        polygons = _mask_to_polygons_original_size(binary_mask, simplify_epsilon)
+                        all_polygons.append(polygons)
         # Build SAM2-compatible response
         response = {
 with open("image.jpg", "rb") as f:
     image_b64 = base64.b64encode(f.read()).decode()
+# SAM3 Text Prompts Only
 response = requests.post(
     "https://your-username-sam3-api.hf.space/api/sam2_compatible",
     json={
     }
 )
+# SAM2 Compatible (Points/Boxes Only)
 response = requests.post(
     "https://your-username-sam3-api.hf.space/api/sam2_compatible",
     json={
         "inputs": {
             "image": image_b64,
             "boxes": [[100, 100, 200, 200]],
+            "box_labels": [1],  # 1=positive, 0=negative
+            "confidence_threshold": 0.5
+        }
+    }
+)
+# SAM3 Combined Prompts (Text + Visual) - NEW CAPABILITY!
+response = requests.post(
+    "https://your-username-sam3-api.hf.space/api/sam2_compatible",
+    json={
+        "inputs": {
+            "image": image_b64,
+            "text_prompts": ["cat"],  # Text description
+            "boxes": [[50, 50, 150, 150]],  # Bounding box
+            "box_labels": [0],  # 0=negative (exclude this area)
+            "points": [[200, 200]],  # Point prompt
+            "labels": [1],  # 1=positive point
             "confidence_threshold": 0.5
         }
     }
   "inputs": {
     "image": "base64_encoded_image_string",
+    // SAM3 NEW: Text-based prompts (can be combined with visual prompts)
     "text_prompts": ["person", "car"],  // List of text descriptions
+    // SAM2 COMPATIBLE: Point-based prompts (can be combined with text/boxes)
+    "points": [[x1, y1], [x2, y2]],  // Individual points (not nested arrays)
+    "labels": [1, 0],  // Labels for each point (1=positive/foreground, 0=negative/background)
+    // SAM2 COMPATIBLE: Bounding box prompts (can be combined with text/points)
+    "boxes": [[x1, y1, x2, y2], [x3, y3, x4, y4]],  // Bounding boxes
+    "box_labels": [1, 0],  // Labels for each box (1=positive, 0=negative/exclude)
     "multimask_output": false,  // Optional, defaults to False
     "confidence_threshold": 0.5,  // Optional, minimum confidence for returned masks