Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

handler.py +144 -194
requirements.txt +1 -1
setup.py +8 -0

handler.py CHANGED Viewed

@@ -12,8 +12,8 @@ import numpy as np
 from PIL import Image
 import cv2
-# Transformers imports for SAM3
-from transformers import Sam3VideoModel, Sam3VideoProcessor
 # HuggingFace Hub for uploads
 try:
@@ -28,19 +28,17 @@ class EndpointHandler:
     SAM3 Video Segmentation Handler for HuggingFace Inference Endpoints
     Processes video with text prompts and returns segmentation masks.
-    Uses transformers library for clean integration with HuggingFace models.
     """
     def __init__(self, path: str = ""):
         """
-        Initialize SAM3 video model using transformers.
         Args:
-            path: Path to model repository (contains model files)
-                  For HF Inference Endpoints, this is /repository
-                  Contains: sam3.pt, config.json, processor_config.json, etc.
         """
-        print(f"[INIT] Initializing SAM3 video model from {path}")
         # Set device
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -49,36 +47,14 @@ class EndpointHandler:
         print(f"[INIT] Using device: {self.device}")
-        # Load model and processor from the repository
-        # If path is empty or ".", try to load from default model ID
-        model_path = path if path and path != "." else "facebook/sam3"
         try:
-            print(f"[INIT] Loading model from: {model_path}")
-            self.model = Sam3VideoModel.from_pretrained(
-                model_path,
-                torch_dtype=torch.bfloat16,
-                device_map=self.device
-            )
-            self.processor = Sam3VideoProcessor.from_pretrained(model_path)
-            print("[INIT] SAM3 video model loaded successfully")
         except Exception as e:
-            print(f"[INIT] Error loading from {model_path}: {e}")
-            print("[INIT] Falling back to facebook/sam3")
-            # Fallback to public model
-            self.model = Sam3VideoModel.from_pretrained(
-                "facebook/sam3",
-                torch_dtype=torch.bfloat16,
-                device_map=self.device
-            )
-            self.processor = Sam3VideoProcessor.from_pretrained("facebook/sam3")
-            print("[INIT] SAM3 video model loaded from facebook/sam3")
         # Initialize HuggingFace API for uploads (if available)
         self.hf_api = None
@@ -91,7 +67,7 @@ class EndpointHandler:
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Process video segmentation request using transformers API.
         Expected input format:
         {
@@ -134,46 +110,53 @@ class EndpointHandler:
                 video_path = self._prepare_video(video_data, tmpdir_path)
                 print(f"[STEP 1] Video prepared at: {video_path}")
-                # Step 2: Load video frames
-                video_frames = self._load_video_frames(video_path)
-                print(f"[STEP 2] Loaded {len(video_frames)} frames")
-                # Step 3: Initialize inference session
-                inference_session = self.processor.init_video_session(
-                    video=video_frames,
-                    inference_device=self.device,
-                    processing_device="cpu",
-                    video_storage_device="cpu",
-                    dtype=torch.bfloat16,
                 )
-                print(f"[STEP 3] Inference session initialized")
-                # Step 4: Add text prompt
-                inference_session = self.processor.add_text_prompt(
-                    inference_session=inference_session,
-                    text=text_prompt,
                 )
-                print(f"[STEP 4] Text prompt added")
-                # Step 5: Propagate through video and save masks
                 masks_dir = tmpdir_path / "masks"
                 masks_dir.mkdir()
-                frame_outputs = self._propagate_and_save_masks(
-                    inference_session,
-                    masks_dir
-                )
-                print(f"[STEP 5] Propagated through {len(frame_outputs)} frames")
-                # Get unique object IDs across all frames
                 all_object_ids = set()
-                for frame_output in frame_outputs.values():
-                    if 'object_ids' in frame_output and frame_output['object_ids'] is not None:
-                        ids = frame_output['object_ids']
-                        if torch.is_tensor(ids):
-                            all_object_ids.update(ids.tolist())
-                        else:
-                            all_object_ids.update(ids)
                 # Step 6: Create ZIP archive
                 zip_path = tmpdir_path / "masks.zip"
@@ -181,34 +164,45 @@ class EndpointHandler:
                 zip_size_mb = zip_path.stat().st_size / 1e6
                 print(f"[STEP 6] Created ZIP archive: {zip_size_mb:.2f} MB")
-                # Step 7: Prepare response based on return_format
                 response = {
-                    "frame_count": len(frame_outputs),
                     "objects_detected": sorted(list(all_object_ids)) if all_object_ids else [],
                     "compressed_size_mb": round(zip_size_mb, 2),
-                    "video_metadata": self._get_video_metadata_from_frames(video_frames)
                 }
                 if return_format == "download_url" and output_repo:
                     # Upload to HuggingFace
                     download_url = self._upload_to_hf(zip_path, output_repo)
                     response["download_url"] = download_url
-                    print(f"[STEP 7] Uploaded to HuggingFace: {download_url}")
                 elif return_format == "base64":
                     # Return base64 encoded ZIP
                     with open(zip_path, "rb") as f:
-                        zip_base64 = base64.b64encode(f.read()).decode('utf-8')
-                    response["masks_zip_base64"] = zip_base64
-                    print(f"[STEP 7] Returning base64 encoded ZIP")
                 else:
-                    # metadata_only - just return stats
-                    response["note"] = "Masks generated but not returned. Use return_format='base64' or 'download_url' to get masks."
-                    print(f"[STEP 7] Returning metadata only")
                 return response
         except Exception as e:
             print(f"[ERROR] {type(e).__name__}: {str(e)}")
             import traceback
@@ -218,134 +212,90 @@ class EndpointHandler:
                 "error_type": type(e).__name__
             }
-    def _prepare_video(self, video_data: Any, tmpdir: Path) -> Path:
-        """Decode base64 video data and save to temporary location."""
-        video_path = tmpdir / "input_video.mp4"
-        if isinstance(video_data, str):
-            # Base64 encoded
             video_bytes = base64.b64decode(video_data)
-        elif isinstance(video_data, bytes):
-            video_bytes = video_data
-        else:
-            raise ValueError(f"Unsupported video data type: {type(video_data)}")
         video_path.write_bytes(video_bytes)
-        return video_path
-    def _load_video_frames(self, video_path: Path) -> list:
-        """Load video frames from MP4 file."""
-        from transformers.video_utils import load_video
-        # load_video returns (frames, audio) - we only need frames
-        frames, _ = load_video(str(video_path))
-        return frames
-    def _propagate_and_save_masks(self, inference_session, masks_dir: Path) -> Dict[int, Dict]:
         """
-        Propagate masks through video using transformers API and save to disk.
-        Returns dict mapping frame_idx -> outputs
         """
-        outputs_per_frame = {}
-        # Use the model's propagate_in_video_iterator
-        for model_outputs in self.model.propagate_in_video_iterator(
-            inference_session=inference_session,
-            max_frame_num_to_track=None  # Process all frames
-        ):
-            frame_idx = model_outputs.frame_idx
-            # Post-process outputs
-            processed_outputs = self.processor.postprocess_outputs(
-                inference_session,
-                model_outputs
-            )
-            outputs_per_frame[frame_idx] = processed_outputs
-            # Save masks for this frame
-            self._save_frame_masks(processed_outputs, masks_dir, frame_idx)
-        return outputs_per_frame
-    def _save_frame_masks(self, outputs: Dict, masks_dir: Path, frame_idx: int):
-        """
-        Save masks for a single frame.
-        Saves combined binary mask with all objects.
-        Format: mask_NNNN.png (white = object, black = background)
-        """
-        # Extract masks from outputs
-        if 'masks' not in outputs or outputs['masks'] is None or len(outputs['masks']) == 0:
-            # No objects detected - save empty mask
-            # Get dimensions from inference session or use default
-            height = 1080
-            width = 1920
-            combined_mask = np.zeros((height, width), dtype=np.uint8)
-        else:
-            masks = outputs['masks']  # Tensor of shape (num_objects, H, W)
-            # Convert to numpy if needed
-            if torch.is_tensor(masks):
-                masks = masks.cpu().numpy()
-            # Combine all object masks into single binary mask
-            if len(masks.shape) == 3:
-                # Multiple objects - combine with logical OR
-                combined_mask = np.any(masks > 0.5, axis=0).astype(np.uint8) * 255
-            elif len(masks.shape) == 2:
-                # Single object
-                combined_mask = (masks > 0.5).astype(np.uint8) * 255
-            else:
-                # Unexpected shape - save empty
-                combined_mask = np.zeros((1080, 1920), dtype=np.uint8)
-        # Save as PNG
-        mask_filename = masks_dir / f"mask_{frame_idx:04d}.png"
-        mask_image = Image.fromarray(combined_mask)
-        mask_image.save(mask_filename, compress_level=9)
     def _create_zip(self, masks_dir: Path, zip_path: Path):
         """Create ZIP archive of all mask PNGs."""
-        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
-            for mask_file in sorted(masks_dir.glob("mask_*.png")):
                 zipf.write(mask_file, mask_file.name)
-    def _upload_to_hf(self, zip_path: Path, output_repo: str) -> str:
-        """
-        Upload ZIP to HuggingFace dataset repository.
-        Returns: Download URL
-        """
-        if not self.hf_api:
-            raise RuntimeError("HuggingFace Hub API not available. Set HF_TOKEN environment variable.")
-        # Upload file to dataset repo
-        path_in_repo = f"masks/{zip_path.name}"
-        self.hf_api.upload_file(
-            path_or_fileobj=str(zip_path),
-            path_in_repo=path_in_repo,
-            repo_id=output_repo,
-            repo_type="dataset",
-        )
-        # Construct download URL
-        download_url = f"https://huggingface.co/datasets/{output_repo}/resolve/main/{path_in_repo}"
-        return download_url
-    def _get_video_metadata_from_frames(self, frames: list) -> Dict:
-        """Extract metadata from loaded video frames."""
-        if not frames or len(frames) == 0:
             return {}
-        # Frames are numpy arrays of shape (H, W, C)
-        first_frame = frames[0]
-        return {
-            "frame_count": len(frames),
-            "height": first_frame.shape[0],
-            "width": first_frame.shape[1],
-            "channels": first_frame.shape[2] if len(first_frame.shape) > 2 else 1,
-        }

 from PIL import Image
 import cv2
+# SAM3 imports - using local sam3 package in repository
+from sam3.model_builder import build_sam3_video_predictor
 # HuggingFace Hub for uploads
 try:
     SAM3 Video Segmentation Handler for HuggingFace Inference Endpoints
     Processes video with text prompts and returns segmentation masks.
+    Uses SAM3 repository code directly from local sam3/ package.
     """
     def __init__(self, path: str = ""):
         """
+        Initialize SAM3 video predictor.
         Args:
+            path: Path to model repository (not used - model loads from HF automatically)
         """
+        print(f"[INIT] Initializing SAM3 video predictor")
         # Set device
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"[INIT] Using device: {self.device}")
+        # Build SAM3 video predictor
+        # This automatically downloads model from facebook/sam3 on HuggingFace
         try:
+            self.predictor = build_sam3_video_predictor(gpus_to_use=[0])
+            print("[INIT] SAM3 video predictor loaded successfully")
         except Exception as e:
+            print(f"[INIT] Error loading SAM3 predictor: {e}")
+            raise
         # Initialize HuggingFace API for uploads (if available)
         self.hf_api = None
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
+        Process video segmentation request using SAM3 video predictor API.
         Expected input format:
         {
                 video_path = self._prepare_video(video_data, tmpdir_path)
                 print(f"[STEP 1] Video prepared at: {video_path}")
+                # Step 2: Start SAM3 session
+                response = self.predictor.handle_request(
+                    request=dict(
+                        type="start_session",
+                        resource_path=str(video_path),
+                    )
                 )
+                session_id = response["session_id"]
+                print(f"[STEP 2] Session started: {session_id}")
+                # Step 3: Add text prompt
+                response = self.predictor.handle_request(
+                    request=dict(
+                        type="add_prompt",
+                        session_id=session_id,
+                        frame_index=0,  # Add prompt on first frame
+                        text=text_prompt,
+                    )
                 )
+                print(f"[STEP 3] Text prompt added")
+                # Step 4: Propagate through video and collect outputs
+                outputs_per_frame = {}
+                for stream_response in self.predictor.handle_stream_request(
+                    request=dict(
+                        type="propagate_in_video",
+                        session_id=session_id,
+                    )
+                ):
+                    frame_idx = stream_response["frame_index"]
+                    outputs_per_frame[frame_idx] = stream_response["outputs"]
+                print(f"[STEP 4] Propagated through {len(outputs_per_frame)} frames")
+                # Step 5: Save masks to PNG files
                 masks_dir = tmpdir_path / "masks"
                 masks_dir.mkdir()
                 all_object_ids = set()
+                for frame_idx, frame_output in outputs_per_frame.items():
+                    self._save_frame_masks(frame_output, masks_dir, frame_idx)
+                    # Collect object IDs
+                    if "object_ids" in frame_output and frame_output["object_ids"] is not None:
+                        all_object_ids.update(frame_output["object_ids"])
+                print(f"[STEP 5] Saved masks for {len(outputs_per_frame)} frames")
                 # Step 6: Create ZIP archive
                 zip_path = tmpdir_path / "masks.zip"
                 zip_size_mb = zip_path.stat().st_size / 1e6
                 print(f"[STEP 6] Created ZIP archive: {zip_size_mb:.2f} MB")
+                # Step 7: Get video metadata
+                video_metadata = self._get_video_metadata(video_path)
+                # Step 8: Prepare response based on return_format
                 response = {
+                    "frame_count": len(outputs_per_frame),
                     "objects_detected": sorted(list(all_object_ids)) if all_object_ids else [],
                     "compressed_size_mb": round(zip_size_mb, 2),
+                    "video_metadata": video_metadata
                 }
                 if return_format == "download_url" and output_repo:
                     # Upload to HuggingFace
                     download_url = self._upload_to_hf(zip_path, output_repo)
                     response["download_url"] = download_url
+                    print(f"[STEP 8] Uploaded to HuggingFace: {download_url}")
                 elif return_format == "base64":
                     # Return base64 encoded ZIP
                     with open(zip_path, "rb") as f:
+                        zip_bytes = f.read()
+                    response["masks_zip_base64"] = base64.b64encode(zip_bytes).decode("utf-8")
+                    print(f"[STEP 8] Encoded ZIP to base64")
                 else:
+                    # metadata_only - just return the stats
+                    print(f"[STEP 8] Returning metadata only")
+                # Step 9: Close session
+                self.predictor.handle_request(
+                    request=dict(
+                        type="close_session",
+                        session_id=session_id,
+                    )
+                )
+                print(f"[STEP 9] Session closed")
                 return response
         except Exception as e:
             print(f"[ERROR] {type(e).__name__}: {str(e)}")
             import traceback
                 "error_type": type(e).__name__
             }
+    def _prepare_video(self, video_data: str, tmpdir: Path) -> Path:
+        """
+        Decode a base64-encoded video and write it to tmpdir.
+
+        Args:
+            video_data: Base64 text (or base64 bytes) holding the video file.
+            tmpdir: Existing directory the decoded file is written into.
+
+        Returns:
+            Path of the written file, tmpdir / "input_video.mp4".
+
+        Raises:
+            ValueError: If the payload is not valid base64 or decodes to
+                zero bytes.
+        """
+        try:
             video_bytes = base64.b64decode(video_data)
+        except (ValueError, TypeError) as e:
+            # binascii.Error subclasses ValueError; TypeError covers inputs
+            # that are neither str nor bytes-like. Chain the cause so the
+            # original decoding error stays visible in tracebacks.
+            raise ValueError(f"Failed to decode base64 video: {e}") from e
+        if not video_bytes:
+            # Fail here rather than handing a zero-byte file to the predictor.
+            raise ValueError("Failed to decode base64 video: decoded payload is empty")
+        video_path = tmpdir / "input_video.mp4"
         video_path.write_bytes(video_bytes)
+        return video_path
+    def _save_frame_masks(self, frame_output: Dict, masks_dir: Path, frame_idx: int):
         """
+        Save one binary PNG mask per object for a single frame.
+
+        Files are named frame_NNNNN_obj_ID.png, with the frame index
+        zero-padded to five digits. Values above 0.5 become 255 and all
+        other values become 0.
+
+        Args:
+            frame_output: Per-frame output dict; reads "masks" and "object_ids".
+            masks_dir: Directory the PNG files are written into.
+            frame_idx: Frame index used in the file name.
         """
+        masks = frame_output.get("masks")
+        if masks is None:
+            return
+        object_ids = frame_output.get("object_ids")
+        if object_ids is None:
+            # The key can be present with a None value (__call__ checks for
+            # this too); treat it like a missing key and save nothing
+            # instead of failing while iterating None.
+            return
+        # Convert to numpy if tensor
+        if torch.is_tensor(masks):
+            masks = masks.cpu().numpy()
+        # zip() stops at the shorter sequence, so ids without a matching
+        # mask (and masks without a matching id) are skipped.
+        for obj_id, mask in zip(object_ids, masks):
+            # Convert to binary (0 or 255)
+            mask_binary = (mask > 0.5).astype(np.uint8) * 255
+            mask_filename = f"frame_{frame_idx:05d}_obj_{obj_id}.png"
+            Image.fromarray(mask_binary).save(masks_dir / mask_filename, compress_level=9)
     def _create_zip(self, masks_dir: Path, zip_path: Path):
+        """
+        Bundle every PNG found directly in masks_dir into one ZIP at zip_path.
+
+        Entries are added in sorted order and stored flat (file name only),
+        using DEFLATE at compression level 9.
+        """
+        archive = zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9)
+        with archive:
+            for png_path in sorted(masks_dir.glob("*.png")):
+                archive.write(png_path, arcname=png_path.name)
+    def _get_video_metadata(self, video_path: Path) -> Dict[str, Any]:
+        """
+        Extract basic video metadata using OpenCV.
+
+        Args:
+            video_path: Path of the video file to inspect.
+
+        Returns:
+            Dict with "width", "height", "fps" and "frame_count" as reported
+            by OpenCV, or an empty dict if the file cannot be opened or a
+            property cannot be read. Never raises.
+        """
+        cap = None
+        try:
+            cap = cv2.VideoCapture(str(video_path))
+            if not cap.isOpened():
+                # Reading properties from a capture that failed to open
+                # would report meaningless values; return nothing instead.
+                print(f"[WARNING] Could not open video for metadata: {video_path}")
+                return {}
+            return {
+                "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+                "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+                "fps": float(cap.get(cv2.CAP_PROP_FPS)),
+                "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
+            }
+        except Exception as e:
+            print(f"[WARNING] Could not extract video metadata: {e}")
             return {}
+        finally:
+            # Release the capture handle on every path, including failures.
+            if cap is not None:
+                cap.release()
+    def _upload_to_hf(self, zip_path: Path, repo_id: str) -> str:
+        """
+        Upload the ZIP file to a HuggingFace dataset repository.
+
+        Args:
+            zip_path: Local path of the ZIP file to upload.
+            repo_id: Target dataset repository id.
+
+        Returns:
+            Download URL of the uploaded file (resolve/main/<filename>).
+
+        Raises:
+            ValueError: If the Hub API client is not initialized or the
+                upload fails.
+        """
+        if not self.hf_api:
+            raise ValueError("HuggingFace Hub API not initialized. Set HF_TOKEN environment variable.")
+        import time
+        import uuid
+        # A second-resolution timestamp alone collides when two requests
+        # finish in the same second, so the later upload would overwrite the
+        # earlier one. A random suffix keeps every file name unique.
+        filename = f"masks_{int(time.time())}_{uuid.uuid4().hex[:8]}.zip"
+        try:
+            self.hf_api.upload_file(
+                path_or_fileobj=str(zip_path),
+                path_in_repo=filename,
+                repo_id=repo_id,
+                repo_type="dataset",
+            )
+        except Exception as e:
+            # Keep the original exception attached as the cause.
+            raise ValueError(f"Failed to upload to HuggingFace: {e}") from e
+        return f"https://huggingface.co/datasets/{repo_id}/resolve/main/{filename}"

requirements.txt CHANGED Viewed

	@@ -1 +1 @@
1	- git+https://github.com/huggingface/transformers.git


1	+ .

setup.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""Packaging script that makes the repository's local SAM3 code installable as "sam3"."""
+from setuptools import find_packages, setup
+
+# Collect the package directories found under the repository root.
+discovered_packages = find_packages()
+
+setup(
+    name="sam3",
+    version="0.1.0",
+    description="A local package for the SAM3 model and utilities.",
+    packages=discovered_packages,
+)