Justin331
/

sam3

@@ -50,10 +50,19 @@ class EndpointHandler:
         # Build SAM3 video predictor
         # This automatically downloads model from facebook/sam3 on HuggingFace
         try:
-            self.predictor = build_sam3_video_predictor(gpus_to_use=[0])
             print("[INIT] SAM3 video predictor loaded successfully")
         except Exception as e:
             print(f"[INIT] Error loading SAM3 predictor: {e}")
             raise
         # Initialize HuggingFace API for uploads (if available)
@@ -152,9 +161,20 @@ class EndpointHandler:
                 for frame_idx, frame_output in outputs_per_frame.items():
                     self._save_frame_masks(frame_output, masks_dir, frame_idx)
-                    # Collect object IDs
                     if "object_ids" in frame_output and frame_output["object_ids"] is not None:
-                        all_object_ids.update(frame_output["object_ids"])
                 print(f"[STEP 5] Saved masks for {len(outputs_per_frame)} frames")
@@ -212,6 +232,57 @@ class EndpointHandler:
                 "error_type": type(e).__name__
             }
     def _prepare_video(self, video_data: str, tmpdir: Path) -> Path:
         """Decode base64 video and save to file."""
         try:
@@ -235,10 +306,23 @@ class EndpointHandler:
         masks = frame_output["masks"]
         object_ids = frame_output.get("object_ids", [])
-        # Convert to numpy if tensor
         if torch.is_tensor(masks):
             masks = masks.cpu().numpy()
         # Save each object's mask
         for i, obj_id in enumerate(object_ids):
             if i < len(masks):

         # Build SAM3 video predictor
         # This automatically downloads model from facebook/sam3 on HuggingFace
         try:
+            # Ensure BPE tokenizer file exists
+            bpe_path = self._ensure_bpe_file()
+            # Build predictor with explicit bpe_path
+            self.predictor = build_sam3_video_predictor(
+                gpus_to_use=[0],
+                bpe_path=bpe_path
+            )
             print("[INIT] SAM3 video predictor loaded successfully")
         except Exception as e:
             print(f"[INIT] Error loading SAM3 predictor: {e}")
+            import traceback
+            traceback.print_exc()
             raise
         # Initialize HuggingFace API for uploads (if available)
                 for frame_idx, frame_output in outputs_per_frame.items():
                     self._save_frame_masks(frame_output, masks_dir, frame_idx)
+                    # Collect object IDs - handle tensors properly
                     if "object_ids" in frame_output and frame_output["object_ids"] is not None:
+                        obj_ids = frame_output["object_ids"]
+                        # Convert to list if tensor or numpy array
+                        if torch.is_tensor(obj_ids):
+                            obj_ids = obj_ids.cpu().tolist()
+                        elif isinstance(obj_ids, np.ndarray):
+                            obj_ids = obj_ids.tolist()
+                        # Add to set (handles single int or list)
+                        if isinstance(obj_ids, list):
+                            all_object_ids.update(obj_ids)
+                        else:
+                            all_object_ids.add(obj_ids)
                 print(f"[STEP 5] Saved masks for {len(outputs_per_frame)} frames")
                 "error_type": type(e).__name__
             }
+    def _ensure_bpe_file(self) -> str:
+        """
+        Ensure the BPE tokenizer file exists, downloading it if missing.
+
+        Looks for ``/repository/assets/bpe_simple_vocab_16e6.txt.gz``. When the
+        file is absent it is fetched from the ``facebook/sam3`` repo on
+        HuggingFace: first through ``hf_hub_download``, then, if that fails
+        for any reason, through a direct HTTP download of the same file.
+
+        Returns:
+            Path to the BPE file, as a string.
+
+        Raises:
+            ValueError: If both download methods fail. The underlying
+                download error is attached as ``__cause__``.
+        """
+        # Default expected path
+        assets_dir = Path("/repository/assets")
+        bpe_file = assets_dir / "bpe_simple_vocab_16e6.txt.gz"
+        if bpe_file.exists():
+            print(f"[INIT] BPE file found at: {bpe_file}")
+            return str(bpe_file)
+        print("[INIT] BPE file not found, downloading from HuggingFace...")
+        # Create assets directory
+        assets_dir.mkdir(parents=True, exist_ok=True)
+        # Download BPE file from facebook/sam3 repo
+        try:
+            from huggingface_hub import hf_hub_download
+            downloaded_path = hf_hub_download(
+                repo_id="facebook/sam3",
+                filename="assets/bpe_simple_vocab_16e6.txt.gz",
+                local_dir="/repository",
+                local_dir_use_symlinks=False
+            )
+            print(f"[INIT] BPE file downloaded to: {downloaded_path}")
+            return downloaded_path
+        except Exception as e:
+            # Deliberately broad: any failure here (missing package, auth,
+            # network) falls through to the direct-URL download below.
+            print(f"[INIT] Error downloading BPE file: {e}")
+            print("[INIT] Trying alternative download method...")
+        # Fallback: download directly from raw URL
+        import urllib.request
+        url = "https://huggingface.co/facebook/sam3/resolve/main/assets/bpe_simple_vocab_16e6.txt.gz"
+        # Download to a sibling temp file and rename on success, so an
+        # interrupted transfer never leaves a truncated file at `bpe_file`
+        # that the exists() check above would accept on the next start-up.
+        partial_file = bpe_file.with_name(bpe_file.name + ".part")
+        try:
+            urllib.request.urlretrieve(url, str(partial_file))
+            partial_file.replace(bpe_file)
+        except Exception as e2:
+            if partial_file.exists():
+                partial_file.unlink()
+            print(f"[INIT] Fallback download also failed: {e2}")
+            raise ValueError(
+                f"Could not download BPE tokenizer file. Please add assets/bpe_simple_vocab_16e6.txt.gz "
+                f"to your repository. Download from: {url}"
+            ) from e2
+        print(f"[INIT] BPE file downloaded to: {bpe_file}")
+        return str(bpe_file)
     def _prepare_video(self, video_data: str, tmpdir: Path) -> Path:
         """Decode base64 video and save to file."""
         try:
         masks = frame_output["masks"]
         object_ids = frame_output.get("object_ids", [])
+        # Handle different types of object_ids
+        if torch.is_tensor(object_ids):
+            object_ids = object_ids.cpu().tolist()
+        elif isinstance(object_ids, np.ndarray):
+            object_ids = object_ids.tolist()
+        elif not isinstance(object_ids, list):
+            object_ids = list(object_ids) if object_ids is not None else []
+        # Convert masks to numpy if tensor
         if torch.is_tensor(masks):
             masks = masks.cpu().numpy()
+        # Ensure masks is 3D array [num_objects, height, width]
+        if len(masks.shape) == 4:
+            # Remove batch dimension if present
+            masks = masks[0]
         # Save each object's mask
         for i, obj_id in enumerate(object_ids):
             if i < len(masks):