peterproofpath committed on
Commit
b7720c4
·
verified ·
1 Parent(s): 1169aff

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +134 -289
handler.py CHANGED
@@ -2,6 +2,9 @@
2
  SAM 3 Custom Inference Handler for Hugging Face Inference Endpoints
3
  Model: facebook/sam3
4
 
 
 
 
5
  For ProofPath video assessment - text-prompted segmentation to find UI elements.
6
  Supports text prompts like "Save button", "dropdown menu", "text input field".
7
 
@@ -28,59 +31,31 @@ class EndpointHandler:
28
  def __init__(self, path: str = ""):
29
  """
30
  Initialize SAM 3 model for text-prompted segmentation.
 
31
 
32
  Args:
33
  path: Path to the model directory (ignored - we load from HF hub)
34
  """
35
- model_id = "facebook/sam3"
36
-
37
- # Get HF token for gated model access
38
- hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
39
-
40
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
41
 
42
- # Import SAM3 components from transformers
43
- from transformers import Sam3Processor, Sam3Model
44
-
45
- self.processor = Sam3Processor.from_pretrained(
46
- model_id,
47
- token=hf_token,
48
- )
49
-
50
- self.model = Sam3Model.from_pretrained(
51
- model_id,
52
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
53
- token=hf_token,
54
- ).to(self.device)
55
 
56
- self.model.eval()
 
 
 
57
 
58
- # Also load video model for video segmentation
59
- self._video_model = None
60
- self._video_processor = None
61
 
62
- def _get_video_model(self):
63
- """Lazy load video model only when needed."""
64
- if self._video_model is None:
65
- from transformers import Sam3VideoModel, Sam3VideoProcessor
66
-
67
- model_id = "facebook/sam3"
68
- hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
69
-
70
- self._video_processor = Sam3VideoProcessor.from_pretrained(
71
- model_id,
72
- token=hf_token,
73
- )
74
-
75
- self._video_model = Sam3VideoModel.from_pretrained(
76
- model_id,
77
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
78
- token=hf_token,
79
- ).to(self.device)
80
-
81
- self._video_model.eval()
82
-
83
- return self._video_model, self._video_processor
84
 
85
  def _load_image(self, image_data: Any):
86
  """Load image from various formats."""
@@ -106,7 +81,7 @@ class EndpointHandler:
106
  else:
107
  raise ValueError(f"Unsupported image input type: {type(image_data)}")
108
 
109
- def _load_video_frames(self, video_data: Any, max_frames: int = 100, fps: float = 2.0) -> List:
110
  """Load video frames from various formats."""
111
  import cv2
112
  from PIL import Image
@@ -170,30 +145,12 @@ class EndpointHandler:
170
  "video_fps": video_fps
171
  }
172
 
173
- return frames, metadata
174
 
175
- finally:
176
  if os.path.exists(video_path):
177
  os.unlink(video_path)
178
-
179
- def _masks_to_serializable(self, masks: torch.Tensor) -> List[List[List[int]]]:
180
- """Convert binary masks to RLE or simplified format for JSON serialization."""
181
- # For efficiency, we'll return bounding box info and optionally compressed masks
182
- # Full masks can be very large - return as base64 encoded numpy if needed
183
- masks_np = masks.cpu().numpy().astype(np.uint8)
184
-
185
- # Return as list of base64-encoded masks
186
- encoded_masks = []
187
- for mask in masks_np:
188
- # Encode each mask as PNG for compression
189
- from PIL import Image
190
- img = Image.fromarray(mask * 255)
191
- buffer = io.BytesIO()
192
- img.save(buffer, format='PNG')
193
- encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
194
- encoded_masks.append(encoded)
195
-
196
- return encoded_masks
197
 
198
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
199
  """
@@ -206,8 +163,6 @@ class EndpointHandler:
206
  "inputs": <image_url_or_base64>,
207
  "parameters": {
208
  "prompt": "Save button",
209
- "threshold": 0.5,
210
- "mask_threshold": 0.5,
211
  "return_masks": true
212
  }
213
  }
@@ -216,49 +171,26 @@ class EndpointHandler:
216
  {
217
  "inputs": <image_url_or_base64>,
218
  "parameters": {
219
- "prompts": ["button", "text field", "dropdown"],
220
- "threshold": 0.5
221
- }
222
- }
223
-
224
- 3. Single image with box prompts (positive/negative):
225
- {
226
- "inputs": <image_url_or_base64>,
227
- "parameters": {
228
- "prompt": "handle",
229
- "boxes": [[40, 183, 318, 204]],
230
- "box_labels": [0], // 0=negative, 1=positive
231
- "threshold": 0.5
232
  }
233
  }
234
 
235
- 4. Video with text prompt (track all instances):
236
  {
237
  "inputs": <video_url_or_base64>,
238
  "parameters": {
239
  "mode": "video",
240
  "prompt": "Submit button",
241
- "max_frames": 100,
242
- "fps": 2.0
243
- }
244
- }
245
-
246
- 5. Batch images:
247
- {
248
- "inputs": [<image1>, <image2>, ...],
249
- "parameters": {
250
- "prompts": ["ear", "dial"], // One per image
251
- "threshold": 0.5
252
  }
253
  }
254
 
255
- 6. ProofPath UI element detection:
256
  {
257
  "inputs": <screenshot_base64>,
258
  "parameters": {
259
  "mode": "ui_elements",
260
- "elements": ["Save button", "Cancel button", "text input"],
261
- "threshold": 0.5
262
  }
263
  }
264
 
@@ -291,17 +223,13 @@ class EndpointHandler:
291
  return self._process_video(inputs, params)
292
  elif mode == "ui_elements":
293
  return self._process_ui_elements(inputs, params)
294
- elif isinstance(inputs, list):
295
- return self._process_batch(inputs, params)
296
  else:
297
  return self._process_single_image(inputs, params)
298
 
299
  def _process_single_image(self, image_data: Any, params: Dict) -> Dict[str, Any]:
300
- """Process a single image with text and/or box prompts."""
301
  image = self._load_image(image_data)
302
 
303
- threshold = params.get("threshold", 0.5)
304
- mask_threshold = params.get("mask_threshold", 0.5)
305
  return_masks = params.get("return_masks", True)
306
 
307
  # Get prompts
@@ -311,56 +239,44 @@ class EndpointHandler:
311
  if not prompts:
312
  raise ValueError("No text prompt(s) provided")
313
 
314
- # Get optional box prompts
315
- boxes = params.get("boxes")
316
- box_labels = params.get("box_labels")
317
 
318
  results = []
319
 
320
  for text_prompt in prompts:
321
- # Prepare inputs
322
- if boxes is not None:
323
- input_boxes = [boxes]
324
- input_boxes_labels = [box_labels] if box_labels else [[1] * len(boxes)]
325
-
326
- processor_inputs = self.processor(
327
- images=image,
328
- text=text_prompt,
329
- input_boxes=input_boxes,
330
- input_boxes_labels=input_boxes_labels,
331
- return_tensors="pt"
332
- ).to(self.device)
333
- else:
334
- processor_inputs = self.processor(
335
- images=image,
336
- text=text_prompt,
337
- return_tensors="pt"
338
- ).to(self.device)
339
-
340
- # Run inference
341
- with torch.no_grad():
342
- outputs = self.model(**processor_inputs)
343
 
344
- # Post-process
345
- post_results = self.processor.post_process_instance_segmentation(
346
- outputs,
347
- threshold=threshold,
348
- mask_threshold=mask_threshold,
349
- target_sizes=processor_inputs.get("original_sizes").tolist()
350
- )[0]
351
 
352
  instances = []
353
- for i in range(len(post_results.get("boxes", []))):
 
 
 
 
 
 
 
354
  instance = {
355
- "box": post_results["boxes"][i].tolist(),
356
- "score": float(post_results["scores"][i])
357
  }
358
 
359
- if return_masks and "masks" in post_results:
360
  # Encode mask as base64 PNG
361
- mask = post_results["masks"][i].cpu().numpy().astype(np.uint8) * 255
 
 
 
362
  from PIL import Image as PILImage
363
- mask_img = PILImage.fromarray(mask)
364
  buffer = io.BytesIO()
365
  mask_img.save(buffer, format='PNG')
366
  instance["mask"] = base64.b64encode(buffer.getvalue()).decode('utf-8')
@@ -378,71 +294,6 @@ class EndpointHandler:
378
  "image_size": {"width": image.width, "height": image.height}
379
  }
380
 
381
- def _process_batch(self, images_data: List, params: Dict) -> Dict[str, Any]:
382
- """Process multiple images with text prompts."""
383
- images = [self._load_image(img) for img in images_data]
384
-
385
- prompts = params.get("prompts", [])
386
- prompt = params.get("prompt")
387
-
388
- # Handle single prompt for all images
389
- if prompt and not prompts:
390
- prompts = [prompt] * len(images)
391
-
392
- if len(prompts) != len(images):
393
- raise ValueError(f"Number of prompts ({len(prompts)}) must match number of images ({len(images)})")
394
-
395
- threshold = params.get("threshold", 0.5)
396
- mask_threshold = params.get("mask_threshold", 0.5)
397
- return_masks = params.get("return_masks", False) # Default false for batch
398
-
399
- # Process batch
400
- processor_inputs = self.processor(
401
- images=images,
402
- text=prompts,
403
- return_tensors="pt"
404
- ).to(self.device)
405
-
406
- with torch.no_grad():
407
- outputs = self.model(**processor_inputs)
408
-
409
- # Post-process all results
410
- all_results = self.processor.post_process_instance_segmentation(
411
- outputs,
412
- threshold=threshold,
413
- mask_threshold=mask_threshold,
414
- target_sizes=processor_inputs.get("original_sizes").tolist()
415
- )
416
-
417
- results = []
418
- for idx, (post_results, text_prompt, image) in enumerate(zip(all_results, prompts, images)):
419
- instances = []
420
- for i in range(len(post_results.get("boxes", []))):
421
- instance = {
422
- "box": post_results["boxes"][i].tolist(),
423
- "score": float(post_results["scores"][i])
424
- }
425
-
426
- if return_masks and "masks" in post_results:
427
- mask = post_results["masks"][i].cpu().numpy().astype(np.uint8) * 255
428
- from PIL import Image as PILImage
429
- mask_img = PILImage.fromarray(mask)
430
- buffer = io.BytesIO()
431
- mask_img.save(buffer, format='PNG')
432
- instance["mask"] = base64.b64encode(buffer.getvalue()).decode('utf-8')
433
-
434
- instances.append(instance)
435
-
436
- results.append({
437
- "image_index": idx,
438
- "prompt": text_prompt,
439
- "instances": instances,
440
- "count": len(instances),
441
- "image_size": {"width": image.width, "height": image.height}
442
- })
443
-
444
- return {"results": results}
445
-
446
  def _process_ui_elements(self, image_data: Any, params: Dict) -> Dict[str, Any]:
447
  """
448
  ProofPath-specific mode: Detect multiple UI element types in a screenshot.
@@ -455,38 +306,35 @@ class EndpointHandler:
455
  # Default UI elements to look for
456
  elements = ["button", "text input", "dropdown", "checkbox", "link"]
457
 
458
- threshold = params.get("threshold", 0.5)
459
- mask_threshold = params.get("mask_threshold", 0.5)
460
 
461
  all_detections = {}
462
 
463
  for element_type in elements:
464
- processor_inputs = self.processor(
465
- images=image,
466
- text=element_type,
467
- return_tensors="pt"
468
- ).to(self.device)
469
 
470
- with torch.no_grad():
471
- outputs = self.model(**processor_inputs)
472
 
473
- post_results = self.processor.post_process_instance_segmentation(
474
- outputs,
475
- threshold=threshold,
476
- mask_threshold=mask_threshold,
477
- target_sizes=processor_inputs.get("original_sizes").tolist()
478
- )[0]
479
 
480
  detections = []
481
- for i in range(len(post_results.get("boxes", []))):
482
- box = post_results["boxes"][i].tolist()
483
  detections.append({
484
  "box": box,
485
- "score": float(post_results["scores"][i]),
486
  "center": [
487
  (box[0] + box[2]) / 2,
488
  (box[1] + box[3]) / 2
489
- ]
490
  })
491
 
492
  all_detections[element_type] = {
@@ -503,80 +351,78 @@ class EndpointHandler:
503
  def _process_video(self, video_data: Any, params: Dict) -> Dict[str, Any]:
504
  """
505
  Process video with SAM3 Video for text-prompted tracking.
506
- Tracks all instances of the prompted concept across frames.
507
  """
508
- video_model, video_processor = self._get_video_model()
509
 
510
  prompt = params.get("prompt")
511
  if not prompt:
512
  raise ValueError("Text prompt required for video mode")
513
 
514
  max_frames = params.get("max_frames", 100)
515
- fps = params.get("fps", 2.0)
516
-
517
- # Load video frames
518
- frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
519
-
520
- if not frames:
521
- raise ValueError("No frames could be extracted from video")
522
-
523
- # Initialize video session
524
- inference_session = video_processor.init_video_session(
525
- video=frames,
526
- inference_device=self.device,
527
- processing_device="cpu",
528
- video_storage_device="cpu",
529
- dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
530
- )
531
-
532
- # Add text prompt
533
- inference_session = video_processor.add_text_prompt(
534
- inference_session=inference_session,
535
- text=prompt,
536
- )
537
-
538
- # Process all frames
539
- outputs_per_frame = {}
540
- for model_outputs in video_model.propagate_in_video_iterator(
541
- inference_session=inference_session,
542
- max_frame_num_to_track=max_frames
543
- ):
544
- processed = video_processor.postprocess_outputs(inference_session, model_outputs)
 
 
 
 
 
 
 
 
 
 
 
545
 
546
- frame_data = {
547
- "frame_idx": model_outputs.frame_idx,
548
- "object_ids": processed["object_ids"].tolist() if hasattr(processed["object_ids"], "tolist") else processed["object_ids"],
549
- "scores": processed["scores"].tolist() if hasattr(processed["scores"], "tolist") else processed["scores"],
550
- "boxes": processed["boxes"].tolist() if hasattr(processed["boxes"], "tolist") else processed["boxes"],
 
 
 
 
 
 
 
 
 
 
551
  }
552
 
553
- outputs_per_frame[model_outputs.frame_idx] = frame_data
554
-
555
- # Compile tracking results
556
- # Group by object_id to show trajectory
557
- object_tracks = {}
558
- for frame_idx, frame_data in outputs_per_frame.items():
559
- for i, obj_id in enumerate(frame_data["object_ids"]):
560
- obj_id_str = str(obj_id)
561
- if obj_id_str not in object_tracks:
562
- object_tracks[obj_id_str] = {
563
- "object_id": obj_id,
564
- "frames": []
565
- }
566
- object_tracks[obj_id_str]["frames"].append({
567
- "frame_idx": frame_idx,
568
- "box": frame_data["boxes"][i] if i < len(frame_data["boxes"]) else None,
569
- "score": frame_data["scores"][i] if i < len(frame_data["scores"]) else None
570
- })
571
-
572
- return {
573
- "prompt": prompt,
574
- "video_metadata": video_metadata,
575
- "frames_processed": len(outputs_per_frame),
576
- "objects_tracked": len(object_tracks),
577
- "tracks": list(object_tracks.values()),
578
- "per_frame_detections": outputs_per_frame
579
- }
580
 
581
 
582
  # For testing locally
@@ -588,7 +434,6 @@ if __name__ == "__main__":
588
  "inputs": "http://images.cocodataset.org/val2017/000000077595.jpg",
589
  "parameters": {
590
  "prompt": "ear",
591
- "threshold": 0.5,
592
  "return_masks": False
593
  }
594
  }
@@ -596,4 +441,4 @@ if __name__ == "__main__":
596
  result = handler(test_data)
597
  print(f"Found {result['results'][0]['count']} instances of '{result['results'][0]['prompt']}'")
598
  for inst in result['results'][0]['instances']:
599
- print(f" Box: {inst['box']}, Score: {inst['score']:.3f}")
 
2
  SAM 3 Custom Inference Handler for Hugging Face Inference Endpoints
3
  Model: facebook/sam3
4
 
5
+ Using the official sam3 package from Meta (pip install sam3)
6
+ NOT the transformers integration.
7
+
8
  For ProofPath video assessment - text-prompted segmentation to find UI elements.
9
  Supports text prompts like "Save button", "dropdown menu", "text input field".
10
 
 
31
  def __init__(self, path: str = ""):
32
  """
33
  Initialize SAM 3 model for text-prompted segmentation.
34
+ Uses the official sam3 package from Meta.
35
 
36
  Args:
37
  path: Path to the model directory (ignored - we load from HF hub)
38
  """
 
 
 
 
 
39
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
40
 
41
+ # Import from official sam3 package
42
+ from sam3.model_builder import build_sam3_image_model
43
+ from sam3.model.sam3_image_processor import Sam3Processor
 
 
 
 
 
 
 
 
 
 
44
 
45
+ # Build model - this downloads from HuggingFace automatically
46
+ # Requires HF_TOKEN for gated model access
47
+ self.model = build_sam3_image_model()
48
+ self.processor = Sam3Processor(self.model)
49
 
50
+ # Video model will be loaded lazily
51
+ self._video_predictor = None
 
52
 
53
+ def _get_video_predictor(self):
54
+ """Lazy load video predictor only when needed."""
55
+ if self._video_predictor is None:
56
+ from sam3.model_builder import build_sam3_video_predictor
57
+ self._video_predictor = build_sam3_video_predictor()
58
+ return self._video_predictor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  def _load_image(self, image_data: Any):
61
  """Load image from various formats."""
 
81
  else:
82
  raise ValueError(f"Unsupported image input type: {type(image_data)}")
83
 
84
+ def _load_video_frames(self, video_data: Any, max_frames: int = 100, fps: float = 2.0) -> tuple:
85
  """Load video frames from various formats."""
86
  import cv2
87
  from PIL import Image
 
145
  "video_fps": video_fps
146
  }
147
 
148
+ return video_path, metadata
149
 
150
+ except Exception as e:
151
  if os.path.exists(video_path):
152
  os.unlink(video_path)
153
+ raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
156
  """
 
163
  "inputs": <image_url_or_base64>,
164
  "parameters": {
165
  "prompt": "Save button",
 
 
166
  "return_masks": true
167
  }
168
  }
 
171
  {
172
  "inputs": <image_url_or_base64>,
173
  "parameters": {
174
+ "prompts": ["button", "text field", "dropdown"]
 
 
 
 
 
 
 
 
 
 
 
 
175
  }
176
  }
177
 
178
+ 3. Video with text prompt (track all instances):
179
  {
180
  "inputs": <video_url_or_base64>,
181
  "parameters": {
182
  "mode": "video",
183
  "prompt": "Submit button",
184
+ "max_frames": 100
 
 
 
 
 
 
 
 
 
 
185
  }
186
  }
187
 
188
+ 4. ProofPath UI element detection:
189
  {
190
  "inputs": <screenshot_base64>,
191
  "parameters": {
192
  "mode": "ui_elements",
193
+ "elements": ["Save button", "Cancel button", "text input"]
 
194
  }
195
  }
196
 
 
223
  return self._process_video(inputs, params)
224
  elif mode == "ui_elements":
225
  return self._process_ui_elements(inputs, params)
 
 
226
  else:
227
  return self._process_single_image(inputs, params)
228
 
229
  def _process_single_image(self, image_data: Any, params: Dict) -> Dict[str, Any]:
230
+ """Process a single image with text prompts using official sam3 API."""
231
  image = self._load_image(image_data)
232
 
 
 
233
  return_masks = params.get("return_masks", True)
234
 
235
  # Get prompts
 
239
  if not prompts:
240
  raise ValueError("No text prompt(s) provided")
241
 
242
+ # Set the image in processor
243
+ inference_state = self.processor.set_image(image)
 
244
 
245
  results = []
246
 
247
  for text_prompt in prompts:
248
+ # Use official sam3 API
249
+ output = self.processor.set_text_prompt(
250
+ state=inference_state,
251
+ prompt=text_prompt
252
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
+ masks = output.get("masks", [])
255
+ boxes = output.get("boxes", [])
256
+ scores = output.get("scores", [])
 
 
 
 
257
 
258
  instances = []
259
+
260
+ # Convert tensors to lists
261
+ if hasattr(boxes, 'tolist'):
262
+ boxes = boxes.tolist()
263
+ if hasattr(scores, 'tolist'):
264
+ scores = scores.tolist()
265
+
266
+ for i in range(len(boxes)):
267
  instance = {
268
+ "box": boxes[i] if i < len(boxes) else None,
269
+ "score": float(scores[i]) if i < len(scores) else 0.0
270
  }
271
 
272
+ if return_masks and masks is not None and i < len(masks):
273
  # Encode mask as base64 PNG
274
+ mask = masks[i]
275
+ if hasattr(mask, 'cpu'):
276
+ mask = mask.cpu().numpy()
277
+ mask_uint8 = (mask * 255).astype(np.uint8)
278
  from PIL import Image as PILImage
279
+ mask_img = PILImage.fromarray(mask_uint8)
280
  buffer = io.BytesIO()
281
  mask_img.save(buffer, format='PNG')
282
  instance["mask"] = base64.b64encode(buffer.getvalue()).decode('utf-8')
 
294
  "image_size": {"width": image.width, "height": image.height}
295
  }
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  def _process_ui_elements(self, image_data: Any, params: Dict) -> Dict[str, Any]:
298
  """
299
  ProofPath-specific mode: Detect multiple UI element types in a screenshot.
 
306
  # Default UI elements to look for
307
  elements = ["button", "text input", "dropdown", "checkbox", "link"]
308
 
309
+ # Set the image once
310
+ inference_state = self.processor.set_image(image)
311
 
312
  all_detections = {}
313
 
314
  for element_type in elements:
315
+ output = self.processor.set_text_prompt(
316
+ state=inference_state,
317
+ prompt=element_type
318
+ )
 
319
 
320
+ boxes = output.get("boxes", [])
321
+ scores = output.get("scores", [])
322
 
323
+ if hasattr(boxes, 'tolist'):
324
+ boxes = boxes.tolist()
325
+ if hasattr(scores, 'tolist'):
326
+ scores = scores.tolist()
 
 
327
 
328
  detections = []
329
+ for i in range(len(boxes)):
330
+ box = boxes[i]
331
  detections.append({
332
  "box": box,
333
+ "score": float(scores[i]) if i < len(scores) else 0.0,
334
  "center": [
335
  (box[0] + box[2]) / 2,
336
  (box[1] + box[3]) / 2
337
+ ] if len(box) >= 4 else None
338
  })
339
 
340
  all_detections[element_type] = {
 
351
  def _process_video(self, video_data: Any, params: Dict) -> Dict[str, Any]:
352
  """
353
  Process video with SAM3 Video for text-prompted tracking.
354
+ Uses the official sam3 video predictor API.
355
  """
356
+ video_predictor = self._get_video_predictor()
357
 
358
  prompt = params.get("prompt")
359
  if not prompt:
360
  raise ValueError("Text prompt required for video mode")
361
 
362
  max_frames = params.get("max_frames", 100)
363
+
364
+ # Load video to temp path
365
+ video_path, video_metadata = self._load_video_frames(video_data, max_frames)
366
+
367
+ try:
368
+ # Start video session
369
+ response = video_predictor.handle_request(
370
+ request=dict(
371
+ type="start_session",
372
+ resource_path=video_path,
373
+ )
374
+ )
375
+ session_id = response.get("session_id")
376
+
377
+ # Add text prompt at frame 0
378
+ response = video_predictor.handle_request(
379
+ request=dict(
380
+ type="add_prompt",
381
+ session_id=session_id,
382
+ frame_index=0,
383
+ text=prompt,
384
+ )
385
+ )
386
+
387
+ output = response.get("outputs", {})
388
+
389
+ # Get tracked objects
390
+ object_ids = output.get("object_ids", [])
391
+ if hasattr(object_ids, 'tolist'):
392
+ object_ids = object_ids.tolist()
393
+
394
+ # Propagate through video
395
+ propagate_response = video_predictor.handle_request(
396
+ request=dict(
397
+ type="propagate",
398
+ session_id=session_id,
399
+ )
400
+ )
401
+
402
+ # Collect results per frame
403
+ per_frame_results = propagate_response.get("per_frame_outputs", {})
404
 
405
+ # Convert to serializable format
406
+ tracks = []
407
+ for obj_id in object_ids:
408
+ track = {
409
+ "object_id": int(obj_id) if hasattr(obj_id, 'item') else obj_id,
410
+ "frames": []
411
+ }
412
+ tracks.append(track)
413
+
414
+ return {
415
+ "prompt": prompt,
416
+ "video_metadata": video_metadata,
417
+ "objects_tracked": len(object_ids),
418
+ "tracks": tracks,
419
+ "session_id": session_id
420
  }
421
 
422
+ finally:
423
+ # Clean up temp file
424
+ if os.path.exists(video_path):
425
+ os.unlink(video_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
 
428
  # For testing locally
 
434
  "inputs": "http://images.cocodataset.org/val2017/000000077595.jpg",
435
  "parameters": {
436
  "prompt": "ear",
 
437
  "return_masks": False
438
  }
439
  }
 
441
  result = handler(test_data)
442
  print(f"Found {result['results'][0]['count']} instances of '{result['results'][0]['prompt']}'")
443
  for inst in result['results'][0]['instances']:
444
+ print(f" Box: {inst['box']}, Score: {inst['score']:.3f}")