Zhen Ye committed on
Commit
c97a5f9
Β·
1 Parent(s): 05bd36a

Eliminate redundant JPEG frame loading via shared frame store

Browse files
inference.py CHANGED
@@ -1209,14 +1209,18 @@ def _gsam2_render_frame(
1209
  height: int,
1210
  width: int,
1211
  masks_only: bool = False,
 
1212
  ) -> np.ndarray:
1213
  """Render a single GSAM2 tracking frame (masks + boxes). CPU-only.
1214
 
1215
  When *masks_only* is True, skip box rendering so the writer thread can
1216
  draw boxes later with enriched (GPT) labels.
1217
  """
1218
- frame_path = os.path.join(frame_dir, frame_names[frame_idx])
1219
- frame = cv2.imread(frame_path)
 
 
 
1220
  if frame is None:
1221
  return np.zeros((height, width, 3), dtype=np.uint8)
1222
 
@@ -1290,6 +1294,7 @@ def run_grounded_sam2_tracking(
1290
  from PIL import Image as PILImage
1291
 
1292
  from utils.video import extract_frames_to_jpeg_dir
 
1293
  from models.segmenters.grounded_sam2 import MaskDictionary, ObjectInfo, LazyFrameObjects
1294
 
1295
  active_segmenter = segmenter_name or "GSAM2-L"
@@ -1305,26 +1310,40 @@ def run_grounded_sam2_tracking(
1305
  active_segmenter, queries, step,
1306
  )
1307
 
1308
- # 1. Extract frames to JPEG directory
1309
- frame_dir = tempfile.mkdtemp(prefix="gsam2_frames_")
 
 
1310
  try:
1311
- if _perf_metrics is not None:
1312
- _t_e2e = time.perf_counter()
1313
- if torch.cuda.is_available():
1314
- torch.cuda.reset_peak_memory_stats()
1315
-
1316
- if _perf_metrics is not None:
1317
- _t_ext = time.perf_counter()
1318
-
 
 
 
 
 
 
 
1319
  frame_names, fps, width, height = extract_frames_to_jpeg_dir(
1320
  input_video_path, frame_dir, max_frames=max_frames,
1321
  )
 
1322
 
 
1323
  if _perf_metrics is not None:
 
 
 
1324
  _perf_metrics["frame_extraction_ms"] = (time.perf_counter() - _t_ext) * 1000.0
1325
- total_frames = len(frame_names)
1326
- _ttfs(f"frame_extraction done ({total_frames} frames)")
1327
- logging.info("Extracted %d frames to %s", total_frames, frame_dir)
1328
 
1329
  num_gpus = torch.cuda.device_count()
1330
 
@@ -1358,6 +1377,7 @@ def run_grounded_sam2_tracking(
1358
  frame_dir, frame_names, fidx, fobjs,
1359
  height, width,
1360
  masks_only=enable_gpt,
 
1361
  )
1362
 
1363
  if _perf_metrics is not None:
@@ -1824,6 +1844,7 @@ def run_grounded_sam2_tracking(
1824
  on_segment_output=_feed_segment_gpu,
1825
  _ttfs_t0=_ttfs_t0,
1826
  _ttfs_job_id=job_id,
 
1827
  )
1828
 
1829
  if _perf_metrics is not None:
@@ -1875,13 +1896,46 @@ def run_grounded_sam2_tracking(
1875
  if _perf_metrics is not None:
1876
  _t_init = time.perf_counter()
1877
 
1878
- def _init_seg_state(seg):
1879
- seg._ensure_models_loaded()
1880
- return seg._video_predictor.init_state(
1881
- video_path=frame_dir,
1882
- offload_video_to_cpu=True,
1883
- async_loading_frames=True,
1884
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1885
 
1886
  with ThreadPoolExecutor(max_workers=len(segmenters)) as pool:
1887
  futs = [pool.submit(_init_seg_state, seg) for seg in segmenters]
@@ -1937,11 +1991,14 @@ def run_grounded_sam2_tracking(
1937
  "GPU %d processing segment %d (frame %d)",
1938
  gpu_idx, seg_idx, start_idx,
1939
  )
1940
- img_path = os.path.join(
1941
- frame_dir, frame_names[start_idx]
1942
- )
1943
- with PILImage.open(img_path) as pil_img:
1944
- image = pil_img.convert("RGB")
 
 
 
1945
 
1946
  if job_id:
1947
  _check_cancellation(job_id)
 
1209
  height: int,
1210
  width: int,
1211
  masks_only: bool = False,
1212
+ frame_store=None,
1213
  ) -> np.ndarray:
1214
  """Render a single GSAM2 tracking frame (masks + boxes). CPU-only.
1215
 
1216
  When *masks_only* is True, skip box rendering so the writer thread can
1217
  draw boxes later with enriched (GPT) labels.
1218
  """
1219
+ if frame_store is not None:
1220
+ frame = frame_store.get_bgr(frame_idx).copy() # .copy() β€” render mutates
1221
+ else:
1222
+ frame_path = os.path.join(frame_dir, frame_names[frame_idx])
1223
+ frame = cv2.imread(frame_path)
1224
  if frame is None:
1225
  return np.zeros((height, width, 3), dtype=np.uint8)
1226
 
 
1294
  from PIL import Image as PILImage
1295
 
1296
  from utils.video import extract_frames_to_jpeg_dir
1297
+ from utils.frame_store import SharedFrameStore, MemoryBudgetExceeded
1298
  from models.segmenters.grounded_sam2 import MaskDictionary, ObjectInfo, LazyFrameObjects
1299
 
1300
  active_segmenter = segmenter_name or "GSAM2-L"
 
1310
  active_segmenter, queries, step,
1311
  )
1312
 
1313
+ # 1. Load frames β€” prefer in-memory SharedFrameStore, fall back to JPEG dir
1314
+ _use_frame_store = True
1315
+ frame_store = None
1316
+ _t_ext = time.perf_counter()
1317
  try:
1318
+ frame_store = SharedFrameStore(input_video_path, max_frames=max_frames)
1319
+ fps, width, height = frame_store.fps, frame_store.width, frame_store.height
1320
+ total_frames = len(frame_store)
1321
+ frame_names = [f"{i:06d}.jpg" for i in range(total_frames)]
1322
+
1323
+ # Write single dummy JPEG for init_state bootstrapping
1324
+ dummy_frame_dir = tempfile.mkdtemp(prefix="gsam2_dummy_")
1325
+ cv2.imwrite(os.path.join(dummy_frame_dir, "000000.jpg"), frame_store.get_bgr(0))
1326
+ frame_dir = dummy_frame_dir
1327
+ logging.info("SharedFrameStore: %d frames in memory (dummy dir: %s)", total_frames, frame_dir)
1328
+ except MemoryBudgetExceeded:
1329
+ logging.info("Memory budget exceeded, falling back to JPEG extraction")
1330
+ _use_frame_store = False
1331
+ frame_store = None
1332
+ frame_dir = tempfile.mkdtemp(prefix="gsam2_frames_")
1333
  frame_names, fps, width, height = extract_frames_to_jpeg_dir(
1334
  input_video_path, frame_dir, max_frames=max_frames,
1335
  )
1336
+ total_frames = len(frame_names)
1337
 
1338
+ try:
1339
  if _perf_metrics is not None:
1340
+ _t_e2e = time.perf_counter()
1341
+ if torch.cuda.is_available():
1342
+ torch.cuda.reset_peak_memory_stats()
1343
  _perf_metrics["frame_extraction_ms"] = (time.perf_counter() - _t_ext) * 1000.0
1344
+
1345
+ _ttfs(f"frame_extraction done ({total_frames} frames, in_memory={_use_frame_store})")
1346
+ logging.info("Loaded %d frames (in_memory=%s)", total_frames, _use_frame_store)
1347
 
1348
  num_gpus = torch.cuda.device_count()
1349
 
 
1377
  frame_dir, frame_names, fidx, fobjs,
1378
  height, width,
1379
  masks_only=enable_gpt,
1380
+ frame_store=frame_store,
1381
  )
1382
 
1383
  if _perf_metrics is not None:
 
1844
  on_segment_output=_feed_segment_gpu,
1845
  _ttfs_t0=_ttfs_t0,
1846
  _ttfs_job_id=job_id,
1847
+ frame_store=frame_store,
1848
  )
1849
 
1850
  if _perf_metrics is not None:
 
1896
  if _perf_metrics is not None:
1897
  _t_init = time.perf_counter()
1898
 
1899
+ if frame_store is not None:
1900
+ # Models are lazy-loaded; ensure at least one is ready so we
1901
+ # can read image_size. Phase 1 (load_segmenter_on_device)
1902
+ # only constructs the object β€” _video_predictor is still None.
1903
+ segmenters[0]._ensure_models_loaded()
1904
+ sam2_img_size = segmenters[0]._video_predictor.image_size
1905
+
1906
+ # Pre-create the shared adapter (validates memory budget)
1907
+ shared_adapter = frame_store.sam2_adapter(image_size=sam2_img_size)
1908
+
1909
+ _REQUIRED_KEYS = {"images", "num_frames", "video_height", "video_width", "cached_features"}
1910
+
1911
+ def _init_seg_state(seg):
1912
+ seg._ensure_models_loaded()
1913
+ state = seg._video_predictor.init_state(
1914
+ video_path=frame_dir, # dummy dir with 1 JPEG
1915
+ offload_video_to_cpu=True,
1916
+ async_loading_frames=False, # 1 dummy frame, instant
1917
+ )
1918
+ # Validate expected keys exist before patching
1919
+ missing = _REQUIRED_KEYS - set(state.keys())
1920
+ if missing:
1921
+ raise RuntimeError(f"SAM2 init_state missing expected keys: {missing}")
1922
+ # CRITICAL: Clear cached_features BEFORE patching images
1923
+ # init_state caches dummy frame 0's backbone features β€” must evict
1924
+ state["cached_features"] = {}
1925
+ # Patch in real frame data
1926
+ state["images"] = shared_adapter
1927
+ state["num_frames"] = total_frames
1928
+ state["video_height"] = height
1929
+ state["video_width"] = width
1930
+ return state
1931
+ else:
1932
+ def _init_seg_state(seg):
1933
+ seg._ensure_models_loaded()
1934
+ return seg._video_predictor.init_state(
1935
+ video_path=frame_dir,
1936
+ offload_video_to_cpu=True,
1937
+ async_loading_frames=True,
1938
+ )
1939
 
1940
  with ThreadPoolExecutor(max_workers=len(segmenters)) as pool:
1941
  futs = [pool.submit(_init_seg_state, seg) for seg in segmenters]
 
1991
  "GPU %d processing segment %d (frame %d)",
1992
  gpu_idx, seg_idx, start_idx,
1993
  )
1994
+ if frame_store is not None:
1995
+ image = frame_store.get_pil_rgb(start_idx)
1996
+ else:
1997
+ img_path = os.path.join(
1998
+ frame_dir, frame_names[start_idx]
1999
+ )
2000
+ with PILImage.open(img_path) as pil_img:
2001
+ image = pil_img.convert("RGB")
2002
 
2003
  if job_id:
2004
  _check_cancellation(job_id)
models/segmenters/grounded_sam2.py CHANGED
@@ -717,6 +717,7 @@ class GroundedSAM2Segmenter(Segmenter):
717
  on_segment_output: Optional[Callable[["SegmentOutput"], None]] = None,
718
  _ttfs_t0: Optional[float] = None,
719
  _ttfs_job_id: Optional[str] = None,
 
720
  ) -> Dict[int, Dict[int, ObjectInfo]]:
721
  """Run full Grounded-SAM-2 tracking pipeline on extracted JPEG frames.
722
 
@@ -758,11 +759,26 @@ class GroundedSAM2Segmenter(Segmenter):
758
  if _pm is not None:
759
  _t_init = time.perf_counter()
760
 
761
- inference_state = self._video_predictor.init_state(
762
- video_path=frame_dir,
763
- offload_video_to_cpu=True,
764
- async_loading_frames=True,
765
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
766
 
767
  if _pm is not None:
768
  _pl = getattr(self, '_perf_lock', None)
@@ -775,8 +791,10 @@ class GroundedSAM2Segmenter(Segmenter):
775
  for start_idx in range(0, total_frames, step):
776
  logging.info("Processing keyframe %d / %d", start_idx, total_frames)
777
 
778
- img_path = os.path.join(frame_dir, frame_names[start_idx])
779
- image = Image.open(img_path).convert("RGB")
 
 
780
 
781
  mask_dict = MaskDictionary()
782
 
 
717
  on_segment_output: Optional[Callable[["SegmentOutput"], None]] = None,
718
  _ttfs_t0: Optional[float] = None,
719
  _ttfs_job_id: Optional[str] = None,
720
+ frame_store=None,
721
  ) -> Dict[int, Dict[int, ObjectInfo]]:
722
  """Run full Grounded-SAM-2 tracking pipeline on extracted JPEG frames.
723
 
 
759
  if _pm is not None:
760
  _t_init = time.perf_counter()
761
 
762
+ if frame_store is not None:
763
+ inference_state = self._video_predictor.init_state(
764
+ video_path=frame_dir, # dummy dir with 1 JPEG
765
+ offload_video_to_cpu=True,
766
+ async_loading_frames=False,
767
+ )
768
+ # Clear cached_features (dummy frame 0's backbone features)
769
+ inference_state["cached_features"] = {}
770
+ # Patch in real frame data
771
+ img_size = self._video_predictor.image_size
772
+ inference_state["images"] = frame_store.sam2_adapter(image_size=img_size)
773
+ inference_state["num_frames"] = len(frame_store)
774
+ inference_state["video_height"] = frame_store.height
775
+ inference_state["video_width"] = frame_store.width
776
+ else:
777
+ inference_state = self._video_predictor.init_state(
778
+ video_path=frame_dir,
779
+ offload_video_to_cpu=True,
780
+ async_loading_frames=True,
781
+ )
782
 
783
  if _pm is not None:
784
  _pl = getattr(self, '_perf_lock', None)
 
791
  for start_idx in range(0, total_frames, step):
792
  logging.info("Processing keyframe %d / %d", start_idx, total_frames)
793
 
794
+ if frame_store is not None:
795
+ image = frame_store.get_pil_rgb(start_idx)
796
+ else:
797
+ image = Image.open(os.path.join(frame_dir, frame_names[start_idx])).convert("RGB")
798
 
799
  mask_dict = MaskDictionary()
800
 
utils/frame_store.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """In-memory shared frame store to eliminate redundant JPEG encoding/decoding.
2
+
3
+ Replaces the pipeline:
4
+ MP4 β†’ cv2 decode β†’ JPEG encode to disk β†’ N GPUs each decode all JPEGs back
5
+ With:
6
+ MP4 β†’ cv2 decode once β†’ SharedFrameStore in RAM β†’ all GPUs read from same memory
7
+ """
8
+
9
+ import logging
10
+ from typing import Optional
11
+
12
+ import cv2
13
+ import numpy as np
14
+ import torch
15
+ from PIL import Image
16
+
17
+
18
class MemoryBudgetExceeded(Exception):
    """Signal that the projected RAM footprint is above the allowed ceiling.

    Carries the raw byte estimate on ``estimated_bytes`` so callers can log
    it or choose a fallback (e.g. the JPEG-on-disk path).
    """

    def __init__(self, estimated_bytes: int):
        gib = estimated_bytes / 1024**3
        super().__init__(f"Estimated memory {gib:.1f} GiB exceeds budget")
        self.estimated_bytes = estimated_bytes
26
+
27
+
28
class SharedFrameStore:
    """Read-only in-memory store for decoded video frames (BGR uint8).

    Decodes the video exactly once via ``cv2.VideoCapture`` and keeps every
    frame in a plain list.  The list is never mutated after construction, so
    concurrent reads from multiple GPU worker threads are safe without
    locking.

    Raises:
        MemoryBudgetExceeded: BEFORE decoding, if the estimated footprint
            (raw frames plus worst-case SAM2 adapter tensors) exceeds
            ``MAX_BUDGET_BYTES``, giving callers a chance to fall back to
            the JPEG-extraction path.
        RuntimeError: if the video cannot be opened or yields no frames.
    """

    # Hard ceiling for raw BGR frames + cached SAM2 adapter tensors.
    MAX_BUDGET_BYTES = 12 * 1024**3  # 12 GiB ceiling

    def __init__(self, video_path: str, max_frames: Optional[int] = None):
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: {video_path}")

        # 0.0 / None from a broken container falls back to a sane default.
        self.fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        self.width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Adapter cache keyed by SAM2 image_size — created eagerly here
        # (not lazily in sam2_adapter) to avoid an attribute-creation race
        # when several GPU threads request adapters concurrently.
        self._adapters = {}

        # Estimate frame count BEFORE decoding to check memory budget.
        # CAP_PROP_FRAME_COUNT is container metadata and may be missing.
        reported_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if reported_count <= 0:
            reported_count = 10000  # conservative fallback
        est_frames = min(reported_count, max_frames) if max_frames else reported_count

        # Budget: raw BGR frames + worst-case SAM2 adapter tensors.  The
        # adapter caches one float32 (3, S, S) tensor per frame; assume the
        # largest common SAM2 input size (S = 1024).
        per_frame_raw = self.height * self.width * 3  # uint8 BGR
        per_frame_adapter = 3 * 1024 * 1024 * 4  # float32, worst-case 1024x1024
        total_est = est_frames * (per_frame_raw + per_frame_adapter)
        if total_est > self.MAX_BUDGET_BYTES:
            cap.release()
            logging.warning(
                "SharedFrameStore: estimated ~%.1f GiB for %d frames exceeds "
                "%.1f GiB budget; skipping in-memory path",
                total_est / 1024**3, est_frames, self.MAX_BUDGET_BYTES / 1024**3,
            )
            raise MemoryBudgetExceeded(total_est)

        frames = []
        while max_frames is None or len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
        cap.release()

        if not frames:
            raise RuntimeError(f"No frames decoded from: {video_path}")

        self.frames = frames
        logging.info(
            "SharedFrameStore: %d frames, %dx%d, %.1f fps",
            len(self.frames), self.width, self.height, self.fps,
        )

    def __len__(self) -> int:
        return len(self.frames)

    def get_bgr(self, idx: int) -> np.ndarray:
        """Return the BGR frame at *idx* (shared; caller must .copy() to mutate)."""
        return self.frames[idx]

    def get_pil_rgb(self, idx: int) -> "Image.Image":
        """Return the frame at *idx* as a PIL RGB image (converted from BGR)."""
        rgb = cv2.cvtColor(self.frames[idx], cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb)

    def sam2_adapter(self, image_size: int) -> "SAM2FrameAdapter":
        """Return the SAM2-compatible adapter for *image_size* (one per size, cached)."""
        if image_size not in self._adapters:
            self._adapters[image_size] = SAM2FrameAdapter(self, image_size)
        return self._adapters[image_size]
107
+
108
+
109
class SAM2FrameAdapter:
    """Drop-in replacement for SAM2's AsyncVideoFrameLoader.

    Matches the interface that SAM2's init_state / propagate_in_video expects:
      - ``__len__()``        -> number of frames
      - ``__getitem__(idx)`` -> normalized float32 tensor (3, S, S)
      - ``.images``          list (SAM2 accesses this directly in some paths)
      - ``.video_height`` / ``.video_width``
      - ``.exception``       (AsyncVideoFrameLoader compat; always None here)

    Transform parity: resizes with PIL and an EXPLICIT BICUBIC filter — the
    filter SAM2's ``_load_img_as_tensor`` gets implicitly for RGB images.
    Passing it explicitly guards against PIL's mode-dependent default
    (NEAREST for "1"/"P" images) and future default changes.
    """

    def __init__(self, store: "SharedFrameStore", image_size: int):
        self._store = store
        self._image_size = image_size
        # Lazily filled per-frame tensor cache; SAM2 reads .images directly.
        self.images = [None] * len(store)
        self.video_height = store.height
        self.video_width = store.width
        self.exception = None  # AsyncVideoFrameLoader compat

        # ImageNet normalization constants (must match SAM2's loader).
        self._mean = torch.tensor([0.485, 0.456, 0.406]).reshape(3, 1, 1)
        self._std = torch.tensor([0.229, 0.224, 0.225]).reshape(3, 1, 1)

    def __len__(self) -> int:
        return len(self._store)

    def __getitem__(self, idx: int) -> torch.Tensor:
        cached = self.images[idx]
        if cached is not None:
            return cached

        # TRANSFORM PARITY: must match SAM2's _load_img_as_tensor exactly:
        # PIL RGB -> resize((S, S), BICUBIC) -> /255 -> CHW -> normalize.
        bgr = self._store.get_bgr(idx)
        pil_img = Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
        pil_resized = pil_img.resize(
            (self._image_size, self._image_size), resample=Image.BICUBIC
        )
        img_np = np.array(pil_resized) / 255.0
        img = torch.from_numpy(img_np).permute(2, 0, 1).float()
        img = (img - self._mean) / self._std
        # Benign race: two threads may compute the same idx; the result is
        # identical, so the last writer simply wins.
        self.images[idx] = img
        return img