Justin331 committed on
Commit
42376ab
·
verified ·
1 Parent(s): d918120

Upload handler.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. handler.py +344 -115
handler.py CHANGED
@@ -3,6 +3,9 @@ import io
3
  import base64
4
  import tempfile
5
  import zipfile
 
 
 
6
  from typing import Dict, Any, Optional
7
  from pathlib import Path
8
  import json
@@ -12,6 +15,15 @@ import numpy as np
12
  from PIL import Image
13
  import cv2
14
 
 
 
 
 
 
 
 
 
 
15
  # SAM3 imports - using local sam3 package in repository
16
  from sam3.model_builder import build_sam3_video_predictor
17
 
@@ -38,52 +50,81 @@ class EndpointHandler:
38
  Args:
39
  path: Path to model repository (not used - model loads from HF automatically)
40
  """
41
- print(f"[INIT] Initializing SAM3 video predictor")
 
 
42
 
43
  # Set device
44
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
45
  if self.device != "cuda":
 
46
  raise ValueError("SAM3 requires GPU acceleration. No CUDA device found.")
47
 
48
- print(f"[INIT] Using device: {self.device}")
 
 
 
 
49
 
50
  # Build SAM3 video predictor
51
- # This automatically downloads model from facebook/sam3 on HuggingFace
52
  try:
 
 
 
53
  # Ensure BPE tokenizer file exists
54
  bpe_path = self._ensure_bpe_file()
 
55
 
56
  # Build predictor with explicit bpe_path
57
  self.predictor = build_sam3_video_predictor(
58
  gpus_to_use=[0],
59
  bpe_path=bpe_path
60
  )
61
- print("[INIT] SAM3 video predictor loaded successfully")
 
 
 
62
  except Exception as e:
63
- print(f"[INIT] Error loading SAM3 predictor: {e}")
64
- import traceback
65
- traceback.print_exc()
66
  raise
67
 
68
  # Initialize HuggingFace API for uploads (if available)
69
  self.hf_api = None
70
  hf_token = os.getenv("HF_TOKEN")
 
71
  if HF_HUB_AVAILABLE and hf_token:
72
- self.hf_api = HfApi(token=hf_token)
73
- print("[INIT] HuggingFace Hub API initialized")
 
 
 
74
  else:
75
- print("[INIT] HuggingFace Hub uploads disabled (no token or huggingface_hub not installed)")
 
 
 
 
 
 
 
 
 
76
 
77
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
78
  """
79
  Process video segmentation request using SAM3 video predictor API.
80
 
81
- Expected input format:
82
  {
83
- "video": <base64_encoded_video>,
84
- "text_prompt": "object to segment",
85
- "return_format": "download_url" or "base64" or "metadata_only" # optional
86
- "output_repo": "username/dataset-name", # optional, for HF upload
 
 
87
  }
88
 
89
  Returns:
@@ -95,99 +136,223 @@ class EndpointHandler:
95
  "objects_detected": [1, 2, 3] # object IDs
96
  }
97
  """
 
 
 
 
 
 
 
98
  try:
99
- # Extract parameters
100
- video_data = data.get("video")
101
- text_prompt = data.get("text_prompt", data.get("inputs", ""))
 
 
102
  output_repo = data.get("output_repo")
103
  return_format = data.get("return_format", "metadata_only")
104
 
 
 
 
 
 
 
 
105
  if not video_data:
106
- return {"error": "No video data provided. Include 'video' in request."}
 
107
 
108
  if not text_prompt:
109
- return {"error": "No text prompt provided. Include 'text_prompt' or 'inputs' in request."}
 
 
 
 
 
 
 
 
 
110
 
111
- print(f"[REQUEST] Processing video with prompt: '{text_prompt}'")
112
- print(f"[REQUEST] Return format: {return_format}")
113
 
114
  # Process video in temporary directory
115
  with tempfile.TemporaryDirectory() as tmpdir:
116
  tmpdir_path = Path(tmpdir)
 
117
 
118
- # Step 1: Decode and save video
119
- video_path = self._prepare_video(video_data, tmpdir_path)
120
- print(f"[STEP 1] Video prepared at: {video_path}")
 
121
 
122
- # Step 2: Start SAM3 session
123
- response = self.predictor.handle_request(
124
- request=dict(
125
- type="start_session",
126
- resource_path=str(video_path),
127
- )
128
- )
129
- session_id = response["session_id"]
130
- print(f"[STEP 2] Session started: {session_id}")
 
 
131
 
132
- # Step 3: Add text prompt
133
- response = self.predictor.handle_request(
134
- request=dict(
135
- type="add_prompt",
136
- session_id=session_id,
137
- frame_index=0, # Add prompt on first frame
138
- text=text_prompt,
 
 
 
 
139
  )
140
- )
141
- print(f"[STEP 3] Text prompt added")
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- # Step 4: Propagate through video and collect outputs
144
- outputs_per_frame = {}
145
- for stream_response in self.predictor.handle_stream_request(
146
- request=dict(
147
- type="propagate_in_video",
148
- session_id=session_id,
 
 
149
  )
150
- ):
151
- frame_idx = stream_response["frame_index"]
152
- outputs_per_frame[frame_idx] = stream_response["outputs"]
 
 
 
 
 
153
 
154
- print(f"[STEP 4] Propagated through {len(outputs_per_frame)} frames")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- # Step 5: Save masks to PNG files
157
- masks_dir = tmpdir_path / "masks"
158
- masks_dir.mkdir()
 
159
 
160
- all_object_ids = set()
161
- for frame_idx, frame_output in outputs_per_frame.items():
162
- self._save_frame_masks(frame_output, masks_dir, frame_idx)
163
 
164
- # Collect object IDs - handle tensors properly
165
- if "object_ids" in frame_output and frame_output["object_ids"] is not None:
166
- obj_ids = frame_output["object_ids"]
167
- # Convert to list if tensor or numpy array
168
- if torch.is_tensor(obj_ids):
169
- obj_ids = obj_ids.cpu().tolist()
170
- elif isinstance(obj_ids, np.ndarray):
171
- obj_ids = obj_ids.tolist()
172
 
173
- # Add to set (handles single int or list)
174
- if isinstance(obj_ids, list):
175
- all_object_ids.update(obj_ids)
176
- else:
177
- all_object_ids.add(obj_ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- print(f"[STEP 5] Saved masks for {len(outputs_per_frame)} frames")
 
 
 
180
 
181
- # Step 6: Create ZIP archive
182
- zip_path = tmpdir_path / "masks.zip"
183
- self._create_zip(masks_dir, zip_path)
184
- zip_size_mb = zip_path.stat().st_size / 1e6
185
- print(f"[STEP 6] Created ZIP archive: {zip_size_mb:.2f} MB")
 
 
 
 
 
 
 
 
 
186
 
187
- # Step 7: Get video metadata
188
- video_metadata = self._get_video_metadata(video_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- # Step 8: Prepare response based on return_format
191
  response = {
192
  "frame_count": len(outputs_per_frame),
193
  "objects_detected": sorted(list(all_object_ids)) if all_object_ids else [],
@@ -196,37 +361,74 @@ class EndpointHandler:
196
  }
197
 
198
  if return_format == "download_url" and output_repo:
199
- # Upload to HuggingFace
200
- download_url = self._upload_to_hf(zip_path, output_repo)
201
- response["download_url"] = download_url
202
- print(f"[STEP 8] Uploaded to HuggingFace: {download_url}")
 
 
 
 
203
 
204
  elif return_format == "base64":
205
- # Return base64 encoded ZIP
206
- with open(zip_path, "rb") as f:
207
- zip_bytes = f.read()
208
- response["masks_zip_base64"] = base64.b64encode(zip_bytes).decode("utf-8")
209
- print(f"[STEP 8] Encoded ZIP to base64")
 
 
 
 
210
 
211
  else:
212
- # metadata_only - just return the stats
213
- print(f"[STEP 8] Returning metadata only")
214
 
215
- # Step 9: Close session
216
- self.predictor.handle_request(
217
- request=dict(
218
- type="close_session",
219
- session_id=session_id,
 
 
 
 
 
 
 
 
220
  )
221
- )
222
- print(f"[STEP 9] Session closed")
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  return response
225
 
226
  except Exception as e:
227
- print(f"[ERROR] {type(e).__name__}: {str(e)}")
228
- import traceback
229
- traceback.print_exc()
 
 
 
 
 
 
 
 
 
230
  return {
231
  "error": str(e),
232
  "error_type": type(e).__name__
@@ -237,23 +439,27 @@ class EndpointHandler:
237
  Ensure BPE tokenizer file exists. Download from HuggingFace if missing.
238
  Returns path to the BPE file.
239
  """
 
 
240
  # Default expected path
241
  assets_dir = Path("/repository/assets")
242
  bpe_file = assets_dir / "bpe_simple_vocab_16e6.txt.gz"
243
 
244
  if bpe_file.exists():
245
- print(f"[INIT] BPE file found at: {bpe_file}")
246
  return str(bpe_file)
247
 
248
- print(f"[INIT] BPE file not found, downloading from HuggingFace...")
 
249
 
250
  # Create assets directory
251
  assets_dir.mkdir(parents=True, exist_ok=True)
252
 
253
- # Download BPE file from facebook/sam3 repo
254
  try:
255
  from huggingface_hub import hf_hub_download
256
 
 
257
  downloaded_path = hf_hub_download(
258
  repo_id="facebook/sam3",
259
  filename="assets/bpe_simple_vocab_16e6.txt.gz",
@@ -261,23 +467,25 @@ class EndpointHandler:
261
  local_dir_use_symlinks=False
262
  )
263
 
264
- print(f"[INIT] BPE file downloaded to: {downloaded_path}")
265
  return downloaded_path
266
 
267
  except Exception as e:
268
- print(f"[INIT] Error downloading BPE file: {e}")
269
- print(f"[INIT] Trying alternative download method...")
270
 
271
  # Fallback: download directly from raw URL
272
  import urllib.request
273
  url = "https://huggingface.co/facebook/sam3/resolve/main/assets/bpe_simple_vocab_16e6.txt.gz"
274
 
275
  try:
 
276
  urllib.request.urlretrieve(url, str(bpe_file))
277
- print(f"[INIT] BPE file downloaded to: {bpe_file}")
278
  return str(bpe_file)
 
279
  except Exception as e2:
280
- print(f"[INIT] Fallback download also failed: {e2}")
281
  raise ValueError(
282
  f"Could not download BPE tokenizer file. Please add assets/bpe_simple_vocab_16e6.txt.gz "
283
  f"to your repository. Download from: {url}"
@@ -286,8 +494,12 @@ class EndpointHandler:
286
  def _prepare_video(self, video_data: str, tmpdir: Path) -> Path:
287
  """Decode base64 video and save to file."""
288
  try:
 
289
  video_bytes = base64.b64decode(video_data)
 
 
290
  except Exception as e:
 
291
  raise ValueError(f"Failed to decode base64 video: {e}")
292
 
293
  video_path = tmpdir / "input_video.mp4"
@@ -295,13 +507,14 @@ class EndpointHandler:
295
 
296
  return video_path
297
 
298
- def _save_frame_masks(self, frame_output: Dict, masks_dir: Path, frame_idx: int):
299
  """
300
  Save masks for a frame as PNG files.
301
  Each object gets its own mask file: frame_XXXX_obj_Y.png
 
302
  """
303
  if "masks" not in frame_output or frame_output["masks"] is None:
304
- return
305
 
306
  masks = frame_output["masks"]
307
  object_ids = frame_output.get("object_ids", [])
@@ -320,10 +533,10 @@ class EndpointHandler:
320
 
321
  # Ensure masks is 3D array [num_objects, height, width]
322
  if len(masks.shape) == 4:
323
- # Remove batch dimension if present
324
  masks = masks[0]
325
 
326
  # Save each object's mask
 
327
  for i, obj_id in enumerate(object_ids):
328
  if i < len(masks):
329
  mask = masks[i]
@@ -335,17 +548,28 @@ class EndpointHandler:
335
  mask_img = Image.fromarray(mask_binary)
336
  mask_filename = f"frame_{frame_idx:05d}_obj_{obj_id}.png"
337
  mask_img.save(masks_dir / mask_filename, compress_level=9)
 
 
 
338
 
339
  def _create_zip(self, masks_dir: Path, zip_path: Path):
340
  """Create ZIP archive of all mask PNGs."""
 
 
 
341
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as zipf:
342
- for mask_file in sorted(masks_dir.glob("*.png")):
343
  zipf.write(mask_file, mask_file.name)
344
 
345
  def _get_video_metadata(self, video_path: Path) -> Dict[str, Any]:
346
  """Extract video metadata using OpenCV."""
347
  try:
348
  cap = cv2.VideoCapture(str(video_path))
 
 
 
 
 
349
  metadata = {
350
  "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
351
  "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
@@ -353,9 +577,11 @@ class EndpointHandler:
353
  "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
354
  }
355
  cap.release()
 
356
  return metadata
 
357
  except Exception as e:
358
- print(f"[WARNING] Could not extract video metadata: {e}")
359
  return {}
360
 
361
  def _upload_to_hf(self, zip_path: Path, repo_id: str) -> str:
@@ -369,6 +595,8 @@ class EndpointHandler:
369
  timestamp = int(time.time())
370
  filename = f"masks_{timestamp}.zip"
371
 
 
 
372
  # Upload file
373
  url = self.hf_api.upload_file(
374
  path_or_fileobj=str(zip_path),
@@ -382,4 +610,5 @@ class EndpointHandler:
382
  return download_url
383
 
384
  except Exception as e:
 
385
  raise ValueError(f"Failed to upload to HuggingFace: {e}")
 
3
  import base64
4
  import tempfile
5
  import zipfile
6
+ import logging
7
+ import sys
8
+ import time
9
  from typing import Dict, Any, Optional
10
  from pathlib import Path
11
  import json
 
15
  from PIL import Image
16
  import cv2
17
 
18
+ # Configure logging
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format='%(asctime)s [%(levelname)s] %(message)s',
22
+ datefmt='%Y-%m-%d %H:%M:%S',
23
+ stream=sys.stdout
24
+ )
25
+ logger = logging.getLogger(__name__)
26
+
27
  # SAM3 imports - using local sam3 package in repository
28
  from sam3.model_builder import build_sam3_video_predictor
29
 
 
50
  Args:
51
  path: Path to model repository (not used - model loads from HF automatically)
52
  """
53
+ logger.info("="*80)
54
+ logger.info("INITIALIZING SAM3 VIDEO SEGMENTATION HANDLER")
55
+ logger.info("="*80)
56
 
57
  # Set device
58
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
59
+ logger.info(f"Device detection: {self.device}")
60
+
61
  if self.device != "cuda":
62
+ logger.error("FATAL: SAM3 requires GPU acceleration. No CUDA device found.")
63
  raise ValueError("SAM3 requires GPU acceleration. No CUDA device found.")
64
 
65
+ # Log GPU information
66
+ if torch.cuda.is_available():
67
+ logger.info(f"GPU Device: {torch.cuda.get_device_name(0)}")
68
+ logger.info(f"CUDA Version: {torch.version.cuda}")
69
+ logger.info(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
70
 
71
  # Build SAM3 video predictor
 
72
  try:
73
+ logger.info("Building SAM3 video predictor...")
74
+ start_time = time.time()
75
+
76
  # Ensure BPE tokenizer file exists
77
  bpe_path = self._ensure_bpe_file()
78
+ logger.info(f"BPE tokenizer path: {bpe_path}")
79
 
80
  # Build predictor with explicit bpe_path
81
  self.predictor = build_sam3_video_predictor(
82
  gpus_to_use=[0],
83
  bpe_path=bpe_path
84
  )
85
+
86
+ elapsed = time.time() - start_time
87
+ logger.info(f"✓ SAM3 video predictor loaded successfully in {elapsed:.2f}s")
88
+
89
  except Exception as e:
90
+ logger.error(f" Failed to load SAM3 predictor: {type(e).__name__}: {e}")
91
+ logger.exception("Full traceback:")
 
92
  raise
93
 
94
  # Initialize HuggingFace API for uploads (if available)
95
  self.hf_api = None
96
  hf_token = os.getenv("HF_TOKEN")
97
+
98
  if HF_HUB_AVAILABLE and hf_token:
99
+ try:
100
+ self.hf_api = HfApi(token=hf_token)
101
+ logger.info("✓ HuggingFace Hub API initialized")
102
+ except Exception as e:
103
+ logger.warning(f"Failed to initialize HF API: {e}")
104
  else:
105
+ reasons = []
106
+ if not HF_HUB_AVAILABLE:
107
+ reasons.append("huggingface_hub not installed")
108
+ if not hf_token:
109
+ reasons.append("HF_TOKEN not set")
110
+ logger.info(f"HuggingFace Hub uploads disabled ({', '.join(reasons)})")
111
+
112
+ logger.info("="*80)
113
+ logger.info("INITIALIZATION COMPLETE - READY FOR REQUESTS")
114
+ logger.info("="*80)
115
 
116
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
117
  """
118
  Process video segmentation request using SAM3 video predictor API.
119
 
120
+ Expected input format (HuggingFace Inference Toolkit standard):
121
  {
122
+ "inputs": <base64_encoded_video>,
123
+ "parameters": {
124
+ "text_prompt": "object to segment",
125
+ "return_format": "download_url" or "base64" or "metadata_only", # optional
126
+ "output_repo": "username/dataset-name", # optional, for HF upload
127
+ }
128
  }
129
 
130
  Returns:
 
136
  "objects_detected": [1, 2, 3] # object IDs
137
  }
138
  """
139
+ request_start = time.time()
140
+
141
+ logger.info("")
142
+ logger.info("="*80)
143
+ logger.info("NEW REQUEST RECEIVED")
144
+ logger.info("="*80)
145
+
146
  try:
147
+ # Extract and validate parameters
148
+ logger.info("Parsing request parameters...")
149
+
150
+ video_data = data.get("inputs") # Video comes from "inputs" (HF toolkit standard)
151
+ text_prompt = data.get("text_prompt", "")
152
  output_repo = data.get("output_repo")
153
  return_format = data.get("return_format", "metadata_only")
154
 
155
+ # Log request details
156
+ logger.info(f" text_prompt: '{text_prompt}'")
157
+ logger.info(f" return_format: {return_format}")
158
+ logger.info(f" output_repo: {output_repo if output_repo else 'None'}")
159
+ logger.info(f" video_data: {'Present' if video_data else 'Missing'} ({len(video_data) if video_data else 0} chars)")
160
+
161
+ # Validate inputs
162
  if not video_data:
163
+ logger.error("✗ Validation failed: No video data provided")
164
+ return {"error": "No video data provided. Include video as 'inputs' in request."}
165
 
166
  if not text_prompt:
167
+ logger.error("✗ Validation failed: No text prompt provided")
168
+ return {"error": "No text prompt provided. Include 'text_prompt' in 'parameters'."}
169
+
170
+ if return_format not in ["metadata_only", "base64", "download_url"]:
171
+ logger.warning(f"Invalid return_format '{return_format}', defaulting to 'metadata_only'")
172
+ return_format = "metadata_only"
173
+
174
+ if return_format == "download_url" and not output_repo:
175
+ logger.error("✗ Validation failed: download_url requires output_repo")
176
+ return {"error": "return_format='download_url' requires 'output_repo' parameter"}
177
 
178
+ logger.info(" Request validation passed")
 
179
 
180
  # Process video in temporary directory
181
  with tempfile.TemporaryDirectory() as tmpdir:
182
  tmpdir_path = Path(tmpdir)
183
+ logger.info(f"Created temporary directory: {tmpdir}")
184
 
185
+ # STEP 1: Decode and save video
186
+ logger.info("")
187
+ logger.info("STEP 1/9: Decoding video data...")
188
+ step_start = time.time()
189
 
190
+ try:
191
+ video_path = self._prepare_video(video_data, tmpdir_path)
192
+ video_size_mb = video_path.stat().st_size / 1e6
193
+
194
+ logger.info(f" Video saved to: {video_path}")
195
+ logger.info(f" Video size: {video_size_mb:.2f} MB")
196
+ logger.info(f"✓ Step 1 completed in {time.time() - step_start:.2f}s")
197
+
198
+ except Exception as e:
199
+ logger.error(f"✗ Step 1 failed: {type(e).__name__}: {e}")
200
+ raise
201
 
202
+ # STEP 2: Start SAM3 session
203
+ logger.info("")
204
+ logger.info("STEP 2/9: Starting SAM3 session...")
205
+ step_start = time.time()
206
+
207
+ try:
208
+ response = self.predictor.handle_request(
209
+ request=dict(
210
+ type="start_session",
211
+ resource_path=str(video_path),
212
+ )
213
  )
214
+ session_id = response["session_id"]
215
+
216
+ logger.info(f" Session ID: {session_id}")
217
+ logger.info(f"✓ Step 2 completed in {time.time() - step_start:.2f}s")
218
+
219
+ except Exception as e:
220
+ logger.error(f"✗ Step 2 failed: {type(e).__name__}: {e}")
221
+ raise
222
+
223
+ # STEP 3: Add text prompt
224
+ logger.info("")
225
+ logger.info("STEP 3/9: Adding text prompt to first frame...")
226
+ step_start = time.time()
227
 
228
+ try:
229
+ response = self.predictor.handle_request(
230
+ request=dict(
231
+ type="add_prompt",
232
+ session_id=session_id,
233
+ frame_index=0,
234
+ text=text_prompt,
235
+ )
236
  )
237
+
238
+ logger.info(f" Prompt: '{text_prompt}'")
239
+ logger.info(f" Frame: 0")
240
+ logger.info(f"✓ Step 3 completed in {time.time() - step_start:.2f}s")
241
+
242
+ except Exception as e:
243
+ logger.error(f"✗ Step 3 failed: {type(e).__name__}: {e}")
244
+ raise
245
 
246
+ # STEP 4: Propagate through video
247
+ logger.info("")
248
+ logger.info("STEP 4/9: Propagating segmentation through video...")
249
+ step_start = time.time()
250
+
251
+ try:
252
+ outputs_per_frame = {}
253
+ last_log_frame = -1
254
+ log_interval = 10 # Log every 10 frames
255
+
256
+ for stream_response in self.predictor.handle_stream_request(
257
+ request=dict(
258
+ type="propagate_in_video",
259
+ session_id=session_id,
260
+ )
261
+ ):
262
+ frame_idx = stream_response["frame_index"]
263
+ outputs_per_frame[frame_idx] = stream_response["outputs"]
264
+
265
+ # Log progress every N frames
266
+ if frame_idx - last_log_frame >= log_interval:
267
+ logger.info(f" Processing frame {frame_idx}...")
268
+ last_log_frame = frame_idx
269
+
270
+ logger.info(f" Total frames processed: {len(outputs_per_frame)}")
271
+ logger.info(f"✓ Step 4 completed in {time.time() - step_start:.2f}s")
272
+
273
+ except Exception as e:
274
+ logger.error(f"✗ Step 4 failed: {type(e).__name__}: {e}")
275
+ raise
276
 
277
+ # STEP 5: Save masks to PNG files
278
+ logger.info("")
279
+ logger.info("STEP 5/9: Saving masks to PNG files...")
280
+ step_start = time.time()
281
 
282
+ try:
283
+ masks_dir = tmpdir_path / "masks"
284
+ masks_dir.mkdir()
285
 
286
+ all_object_ids = set()
287
+ mask_count = 0
288
+
289
+ for frame_idx, frame_output in outputs_per_frame.items():
290
+ frame_masks = self._save_frame_masks(frame_output, masks_dir, frame_idx)
291
+ mask_count += frame_masks
 
 
292
 
293
+ # Collect object IDs
294
+ if "object_ids" in frame_output and frame_output["object_ids"] is not None:
295
+ obj_ids = frame_output["object_ids"]
296
+ if torch.is_tensor(obj_ids):
297
+ obj_ids = obj_ids.cpu().tolist()
298
+ elif isinstance(obj_ids, np.ndarray):
299
+ obj_ids = obj_ids.tolist()
300
+
301
+ if isinstance(obj_ids, list):
302
+ all_object_ids.update(obj_ids)
303
+ else:
304
+ all_object_ids.add(obj_ids)
305
+
306
+ logger.info(f" Masks directory: {masks_dir}")
307
+ logger.info(f" Total mask files: {mask_count}")
308
+ logger.info(f" Unique objects: {sorted(list(all_object_ids))}")
309
+ logger.info(f"✓ Step 5 completed in {time.time() - step_start:.2f}s")
310
+
311
+ except Exception as e:
312
+ logger.error(f"✗ Step 5 failed: {type(e).__name__}: {e}")
313
+ raise
314
 
315
+ # STEP 6: Create ZIP archive
316
+ logger.info("")
317
+ logger.info("STEP 6/9: Creating ZIP archive...")
318
+ step_start = time.time()
319
 
320
+ try:
321
+ zip_path = tmpdir_path / "masks.zip"
322
+ self._create_zip(masks_dir, zip_path)
323
+
324
+ zip_size_mb = zip_path.stat().st_size / 1e6
325
+
326
+ logger.info(f" ZIP path: {zip_path}")
327
+ logger.info(f" ZIP size: {zip_size_mb:.2f} MB")
328
+ logger.info(f" Compression ratio: {(1 - zip_size_mb / video_size_mb) * 100:.1f}%")
329
+ logger.info(f"✓ Step 6 completed in {time.time() - step_start:.2f}s")
330
+
331
+ except Exception as e:
332
+ logger.error(f"✗ Step 6 failed: {type(e).__name__}: {e}")
333
+ raise
334
 
335
+ # STEP 7: Get video metadata
336
+ logger.info("")
337
+ logger.info("STEP 7/9: Extracting video metadata...")
338
+ step_start = time.time()
339
+
340
+ try:
341
+ video_metadata = self._get_video_metadata(video_path)
342
+
343
+ for key, value in video_metadata.items():
344
+ logger.info(f" {key}: {value}")
345
+ logger.info(f"✓ Step 7 completed in {time.time() - step_start:.2f}s")
346
+
347
+ except Exception as e:
348
+ logger.warning(f"Step 7 partial failure: {e}")
349
+ video_metadata = {}
350
+
351
+ # STEP 8: Prepare response
352
+ logger.info("")
353
+ logger.info("STEP 8/9: Preparing response...")
354
+ step_start = time.time()
355
 
 
356
  response = {
357
  "frame_count": len(outputs_per_frame),
358
  "objects_detected": sorted(list(all_object_ids)) if all_object_ids else [],
 
361
  }
362
 
363
  if return_format == "download_url" and output_repo:
364
+ logger.info(f" Uploading to HuggingFace dataset: {output_repo}")
365
+ try:
366
+ download_url = self._upload_to_hf(zip_path, output_repo)
367
+ response["download_url"] = download_url
368
+ logger.info(f" ✓ Upload successful: {download_url}")
369
+ except Exception as e:
370
+ logger.error(f" ✗ Upload failed: {e}")
371
+ raise
372
 
373
  elif return_format == "base64":
374
+ logger.info(" Encoding ZIP to base64...")
375
+ try:
376
+ with open(zip_path, "rb") as f:
377
+ zip_bytes = f.read()
378
+ response["masks_zip_base64"] = base64.b64encode(zip_bytes).decode("utf-8")
379
+ logger.info(f" ✓ Encoded {len(response['masks_zip_base64'])} characters")
380
+ except Exception as e:
381
+ logger.error(f" ✗ Encoding failed: {e}")
382
+ raise
383
 
384
  else:
385
+ logger.info(" Returning metadata only (no mask data)")
 
386
 
387
+ logger.info(f"✓ Step 8 completed in {time.time() - step_start:.2f}s")
388
+
389
+ # STEP 9: Close session
390
+ logger.info("")
391
+ logger.info("STEP 9/9: Closing SAM3 session...")
392
+ step_start = time.time()
393
+
394
+ try:
395
+ self.predictor.handle_request(
396
+ request=dict(
397
+ type="close_session",
398
+ session_id=session_id,
399
+ )
400
  )
401
+ logger.info(f"✓ Step 9 completed in {time.time() - step_start:.2f}s")
402
+
403
+ except Exception as e:
404
+ logger.warning(f"Step 9 partial failure (non-critical): {e}")
405
+
406
+ # Final summary
407
+ total_time = time.time() - request_start
408
+ logger.info("")
409
+ logger.info("="*80)
410
+ logger.info("REQUEST COMPLETED SUCCESSFULLY")
411
+ logger.info(f"Total processing time: {total_time:.2f}s")
412
+ logger.info(f"Frames processed: {len(outputs_per_frame)}")
413
+ logger.info(f"Objects detected: {len(all_object_ids)}")
414
+ logger.info("="*80)
415
+ logger.info("")
416
 
417
  return response
418
 
419
  except Exception as e:
420
+ total_time = time.time() - request_start
421
+
422
+ logger.error("")
423
+ logger.error("="*80)
424
+ logger.error("REQUEST FAILED")
425
+ logger.error(f"Error type: {type(e).__name__}")
426
+ logger.error(f"Error message: {str(e)}")
427
+ logger.error(f"Time elapsed: {total_time:.2f}s")
428
+ logger.error("="*80)
429
+ logger.exception("Full traceback:")
430
+ logger.error("")
431
+
432
  return {
433
  "error": str(e),
434
  "error_type": type(e).__name__
 
439
  Ensure BPE tokenizer file exists. Download from HuggingFace if missing.
440
  Returns path to the BPE file.
441
  """
442
+ logger.info("Checking for BPE tokenizer file...")
443
+
444
  # Default expected path
445
  assets_dir = Path("/repository/assets")
446
  bpe_file = assets_dir / "bpe_simple_vocab_16e6.txt.gz"
447
 
448
  if bpe_file.exists():
449
+ logger.info(f" BPE file found: {bpe_file}")
450
  return str(bpe_file)
451
 
452
+ logger.warning(f" BPE file not found at {bpe_file}")
453
+ logger.info(" Downloading from HuggingFace...")
454
 
455
  # Create assets directory
456
  assets_dir.mkdir(parents=True, exist_ok=True)
457
 
458
+ # Try primary method: hf_hub_download
459
  try:
460
  from huggingface_hub import hf_hub_download
461
 
462
+ logger.info(" Attempting download via hf_hub_download...")
463
  downloaded_path = hf_hub_download(
464
  repo_id="facebook/sam3",
465
  filename="assets/bpe_simple_vocab_16e6.txt.gz",
 
467
  local_dir_use_symlinks=False
468
  )
469
 
470
+ logger.info(f" BPE file downloaded: {downloaded_path}")
471
  return downloaded_path
472
 
473
  except Exception as e:
474
+ logger.warning(f" Primary download failed: {e}")
475
+ logger.info(" Trying fallback download method...")
476
 
477
  # Fallback: download directly from raw URL
478
  import urllib.request
479
  url = "https://huggingface.co/facebook/sam3/resolve/main/assets/bpe_simple_vocab_16e6.txt.gz"
480
 
481
  try:
482
+ logger.info(f" Downloading from: {url}")
483
  urllib.request.urlretrieve(url, str(bpe_file))
484
+ logger.info(f" BPE file downloaded: {bpe_file}")
485
  return str(bpe_file)
486
+
487
  except Exception as e2:
488
+ logger.error(f" Fallback download failed: {e2}")
489
  raise ValueError(
490
  f"Could not download BPE tokenizer file. Please add assets/bpe_simple_vocab_16e6.txt.gz "
491
  f"to your repository. Download from: {url}"
 
494
  def _prepare_video(self, video_data: str, tmpdir: Path) -> Path:
495
  """Decode base64 video and save to file."""
496
  try:
497
+ logger.info(" Decoding base64 data...")
498
  video_bytes = base64.b64decode(video_data)
499
+ logger.info(f" Decoded {len(video_bytes)} bytes")
500
+
501
  except Exception as e:
502
+ logger.error(f" Base64 decode failed: {e}")
503
  raise ValueError(f"Failed to decode base64 video: {e}")
504
 
505
  video_path = tmpdir / "input_video.mp4"
 
507
 
508
  return video_path
509
 
510
+ def _save_frame_masks(self, frame_output: Dict, masks_dir: Path, frame_idx: int) -> int:
511
  """
512
  Save masks for a frame as PNG files.
513
  Each object gets its own mask file: frame_XXXX_obj_Y.png
514
+ Returns the number of masks saved.
515
  """
516
  if "masks" not in frame_output or frame_output["masks"] is None:
517
+ return 0
518
 
519
  masks = frame_output["masks"]
520
  object_ids = frame_output.get("object_ids", [])
 
533
 
534
  # Ensure masks is 3D array [num_objects, height, width]
535
  if len(masks.shape) == 4:
 
536
  masks = masks[0]
537
 
538
  # Save each object's mask
539
+ saved_count = 0
540
  for i, obj_id in enumerate(object_ids):
541
  if i < len(masks):
542
  mask = masks[i]
 
548
  mask_img = Image.fromarray(mask_binary)
549
  mask_filename = f"frame_{frame_idx:05d}_obj_{obj_id}.png"
550
  mask_img.save(masks_dir / mask_filename, compress_level=9)
551
+ saved_count += 1
552
+
553
+ return saved_count
554
 
555
  def _create_zip(self, masks_dir: Path, zip_path: Path):
556
  """Create ZIP archive of all mask PNGs."""
557
+ mask_files = sorted(masks_dir.glob("*.png"))
558
+ logger.info(f" Creating ZIP with {len(mask_files)} files...")
559
+
560
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as zipf:
561
+ for mask_file in mask_files:
562
  zipf.write(mask_file, mask_file.name)
563
 
564
  def _get_video_metadata(self, video_path: Path) -> Dict[str, Any]:
565
  """Extract video metadata using OpenCV."""
566
  try:
567
  cap = cv2.VideoCapture(str(video_path))
568
+
569
+ if not cap.isOpened():
570
+ logger.warning(f" Could not open video file: {video_path}")
571
+ return {}
572
+
573
  metadata = {
574
  "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
575
  "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
 
577
  "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
578
  }
579
  cap.release()
580
+
581
  return metadata
582
+
583
  except Exception as e:
584
+ logger.warning(f" Could not extract video metadata: {e}")
585
  return {}
586
 
587
  def _upload_to_hf(self, zip_path: Path, repo_id: str) -> str:
 
595
  timestamp = int(time.time())
596
  filename = f"masks_{timestamp}.zip"
597
 
598
+ logger.info(f" Uploading {zip_path.stat().st_size / 1e6:.2f} MB...")
599
+
600
  # Upload file
601
  url = self.hf_api.upload_file(
602
  path_or_fileobj=str(zip_path),
 
610
  return download_url
611
 
612
  except Exception as e:
613
+ logger.error(f" Upload error: {e}")
614
  raise ValueError(f"Failed to upload to HuggingFace: {e}")