Spaces:

nlml
/

sheap

Running on Zero

App Files Files Community

liamsch committed on Dec 1, 2025

Commit

8b1eedf

1 Parent(s): 8455092

speed up video processing by putting frame loading on bg thread

Browse files

Files changed (2) hide show

gradio_demo.py +2 -11
video_demo.py +87 -36

gradio_demo.py CHANGED Viewed

@@ -26,6 +26,7 @@ import torch.hub
 import torchvision.transforms.functional as TF
 from PIL import Image
 from torch.utils.data import DataLoader
 try:
     import spaces
@@ -41,13 +42,7 @@ except ImportError:
 from demo import create_rendering_image
 from sheap import load_sheap_model
 from sheap.tiny_flame import TinyFlame, pose_components_to_rotmats
-try:
-    import face_alignment
-except ImportError:
-    raise ImportError(
-        "The 'face_alignment' package is required. Please install it via 'pip install face-alignment'."
-    )
 from sheap.fa_landmark_utils import detect_face_and_crop
 # Global variables for models (load once)
@@ -148,10 +143,6 @@ def process_image(image: np.ndarray) -> Image.Image:
     return combined
-# --- Import video utilities from video_demo.py ---
-from video_demo import RenderingThread, VideoFrameDataset, _tensor_to_numpy_image
 @spaces.GPU
 def process_video(video_path: str, progress=gr.Progress()) -> str:
     """

 import torchvision.transforms.functional as TF
 from PIL import Image
 from torch.utils.data import DataLoader
+import face_alignment
 try:
     import spaces
 from demo import create_rendering_image
 from sheap import load_sheap_model
 from sheap.tiny_flame import TinyFlame, pose_components_to_rotmats
+from video_demo import RenderingThread, VideoFrameDataset, _tensor_to_numpy_image
 from sheap.fa_landmark_utils import detect_face_and_crop
 # Global variables for models (load once)
     return combined
 @spaces.GPU
 def process_video(video_path: str, progress=gr.Progress()) -> str:
     """

video_demo.py CHANGED Viewed

@@ -106,13 +106,17 @@ class RenderingThread(threading.Thread):
 class VideoFrameDataset(IterableDataset):
-    """Iterable dataset for streaming video frames with face detection and cropping."""
     def __init__(
         self,
         video_path: str,
         fa_model: face_alignment.FaceAlignment,
         smoothing_alpha: float = 0.3,
     ):
         """
         Initialize video frame dataset.
@@ -122,11 +126,13 @@ class VideoFrameDataset(IterableDataset):
             fa_model: FaceAlignment model instance for face detection
             smoothing_alpha: Smoothing factor for bounding box (0=no smoothing, 1=no change).
                            Lower values = more smoothing
         """
         super().__init__()
         self.video_path = video_path
         self.fa_model = fa_model
         self.smoothing_alpha = smoothing_alpha
         self.prev_bbox: Optional[Tuple[int, int, int, int]] = None
         # Get video metadata (don't keep capture open)
@@ -144,9 +150,43 @@ class VideoFrameDataset(IterableDataset):
             f"Video info: {self.num_frames} frames, {self.fps:.2f} fps, {self.width}x{self.height}"
         )
     def __iter__(self):
         """
         Iterate through video frames sequentially.
         Yields:
             Dictionary containing frame_idx, processed image, and bounding box
@@ -154,48 +194,59 @@ class VideoFrameDataset(IterableDataset):
         # Reset smoothing state for new iteration
         self.prev_bbox = None
-        # Open video capture for this iteration
-        cap = cv2.VideoCapture(self.video_path)
-        if not cap.isOpened():
-            raise RuntimeError(f"Could not open video file: {self.video_path}")
-        frame_idx = 0
-        while True:
-            # Read frame
-            ret, frame_bgr = cap.read()
-            if not ret:
-                break
-            # Convert BGR to RGB
-            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
-            # Convert to torch tensor (C, H, W) with values in [0, 1]
-            image = torch.from_numpy(frame_rgb).permute(2, 0, 1).float() / 255.0
-            # Detect face and crop
-            bbox = detect_face_and_crop(image, self.fa_model, margin=0.9, shift_up=0.5)
-            # Apply smoothing using exponential moving average
-            bbox = self._smooth_bbox(bbox)
-            x0, y0, x1, y1 = bbox
-            cropped = image[:, y0:y1, x0:x1]
-            # Resize to 224x224 for SHEAP model
-            cropped_resized = TF.resize(cropped, [224, 224], antialias=True)
-            cropped_for_render = TF.resize(cropped, [512, 512], antialias=True)
-            yield {
-                "frame_idx": frame_idx,
-                "image": cropped_resized,
-                "bbox": bbox,
-                "original_frame": frame_rgb,  # Keep original for reference (as numpy array)
-                "cropped_frame": cropped_for_render,  # Cropped region resized to 512x512
-            }
-            frame_idx += 1
-        cap.release()
     def _smooth_bbox(self, bbox: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]:
         """Apply exponential moving average smoothing to bounding box."""

 class VideoFrameDataset(IterableDataset):
+    """Iterable dataset for streaming video frames with face detection and cropping.
+    Uses a background thread for video frame loading while face detection runs in the main thread.
+    """
     def __init__(
         self,
         video_path: str,
         fa_model: face_alignment.FaceAlignment,
         smoothing_alpha: float = 0.3,
+        frame_buffer_size: int = 32,
     ):
         """
         Initialize video frame dataset.
             fa_model: FaceAlignment model instance for face detection
             smoothing_alpha: Smoothing factor for bounding box (0=no smoothing, 1=no change).
                            Lower values = more smoothing
+            frame_buffer_size: Size of the frame buffer queue for the background thread
         """
         super().__init__()
         self.video_path = video_path
         self.fa_model = fa_model
         self.smoothing_alpha = smoothing_alpha
+        self.frame_buffer_size = frame_buffer_size
         self.prev_bbox: Optional[Tuple[int, int, int, int]] = None
         # Get video metadata (don't keep capture open)
             f"Video info: {self.num_frames} frames, {self.fps:.2f} fps, {self.width}x{self.height}"
         )
+    def _video_reader_thread(self, frame_queue: Queue, stop_event: threading.Event):
+        """Background thread that reads video frames and puts them in a queue.
+        Args:
+            frame_queue: Queue to put (frame_idx, frame_rgb) tuples
+            stop_event: Event to signal thread to stop
+        """
+        cap = cv2.VideoCapture(self.video_path)
+        if not cap.isOpened():
+            frame_queue.put(("error", f"Could not open video file: {self.video_path}"))
+            return
+        frame_idx = 0
+        try:
+            while not stop_event.is_set():
+                ret, frame_bgr = cap.read()
+                if not ret:
+                    break
+                # Convert BGR to RGB
+                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+                # Put frame in queue (blocks if queue is full)
+                frame_queue.put((frame_idx, frame_rgb))
+                frame_idx += 1
+        finally:
+            cap.release()
+            # Signal end of video
+            frame_queue.put(None)
     def __iter__(self):
         """
         Iterate through video frames sequentially.
+        Video frame loading happens in a background thread, while face detection
+        and processing happen in the main thread.
         Yields:
             Dictionary containing frame_idx, processed image, and bounding box
         # Reset smoothing state for new iteration
         self.prev_bbox = None
+        # Create queue and start background thread for video reading
+        frame_queue = Queue(maxsize=self.frame_buffer_size)
+        stop_event = threading.Event()
+        reader_thread = threading.Thread(
+            target=self._video_reader_thread,
+            args=(frame_queue, stop_event),
+            daemon=True
+        )
+        reader_thread.start()
+        try:
+            while True:
+                # Get frame from background thread
+                item = frame_queue.get()
+                # Check for end of video
+                if item is None:
+                    break
+                # Check for error
+                if isinstance(item, tuple) and len(item) == 2 and item[0] == "error":
+                    raise RuntimeError(item[1])
+                frame_idx, frame_rgb = item
+                # Convert to torch tensor (C, H, W) with values in [0, 1]
+                image = torch.from_numpy(frame_rgb).permute(2, 0, 1).float() / 255.0
+                # Detect face and crop (runs in main thread, can use GPU)
+                bbox = detect_face_and_crop(image, self.fa_model, margin=0.9, shift_up=0.5)
+                # Apply smoothing using exponential moving average
+                bbox = self._smooth_bbox(bbox)
+                x0, y0, x1, y1 = bbox
+                cropped = image[:, y0:y1, x0:x1]
+                # Resize to 224x224 for SHEAP model
+                cropped_resized = TF.resize(cropped, [224, 224], antialias=True)
+                cropped_for_render = TF.resize(cropped, [512, 512], antialias=True)
+                yield {
+                    "frame_idx": frame_idx,
+                    "image": cropped_resized,
+                    "bbox": bbox,
+                    "original_frame": frame_rgb,  # Keep original for reference (as numpy array)
+                    "cropped_frame": cropped_for_render,  # Cropped region resized to 512x512
+                }
+        finally:
+            # Clean up background thread
+            stop_event.set()
+            reader_thread.join(timeout=1.0)
     def _smooth_bbox(self, bbox: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]:
         """Apply exponential moving average smoothing to bounding box."""