Spaces:

devkunalnaik
/

Swapper

Running

App Files Files Community

devkunalnaik committed on May 18

Commit

6faf48e

1 Parent(s): d5e1b6d

Perf: source face detected once, target faces cached every 5 frames, 720p video processing

Browse files

Files changed (2) hide show

processors/face_swap.py +61 -0
processors/video_processor.py +51 -19

processors/face_swap.py CHANGED Viewed

@@ -228,3 +228,64 @@ class FaceSwapper:
         except Exception as exc:
             return None, f"Face swap error: {exc}"

         except Exception as exc:
             return None, f"Face swap error: {exc}"
+    def get_source_face(self, source_bgr: np.ndarray):
+        """
+        Detect and return the first face in *source_bgr*.
+        Call once before a video loop and reuse the result in swap_frame().
+        Returns:
+            face object or None
+        """
+        self._init()
+        faces = self._app.get(source_bgr)
+        return faces[0] if faces else None
+    def swap_frame(
+        self,
+        target_bgr: np.ndarray,
+        source_face,
+        cached_target_faces=None,
+        enhance: bool = False,
+    ):
+        """
+        Fast path for video — reuses a pre-computed source_face and optionally
+        cached target faces (re-detection skipped when supplied).
+        Returns:
+            (result_bgr, target_faces_used)
+        """
+        self._init()
+        # Cap video frames at 720p for speed; quality still good for motion
+        MAX_VIDEO_DIM = 720
+        orig_h, orig_w = target_bgr.shape[:2]
+        scale_down = 1.0
+        if max(orig_h, orig_w) > MAX_VIDEO_DIM:
+            scale_down = MAX_VIDEO_DIM / max(orig_h, orig_w)
+            target_bgr = cv2.resize(
+                target_bgr,
+                (int(orig_w * scale_down), int(orig_h * scale_down)),
+                interpolation=cv2.INTER_LINEAR,
+            )
+        if cached_target_faces is None:
+            target_faces = self._app.get(target_bgr)
+        else:
+            target_faces = cached_target_faces
+        if not target_faces:
+            return None, []
+        result = target_bgr.copy()
+        for tgt_face in target_faces:
+            result = self._swapper.get(result, tgt_face, source_face, paste_back=True)
+        if enhance:
+            result = self._enhance_opencv(result, target_faces)
+        # Scale back up to original frame size
+        if scale_down < 1.0:
+            result = cv2.resize(result, (orig_w, orig_h), interpolation=cv2.INTER_LINEAR)
+        return result, target_faces

processors/video_processor.py CHANGED Viewed

@@ -2,8 +2,14 @@
 Video processor — extracts frames from an input video, applies face or body
 swap to each frame, then re-encodes the result with FFmpeg (audio preserved).
-A hard cap of MAX_FRAMES is enforced to keep processing times reasonable on
-free GPU tiers.
 """
 import cv2
@@ -12,7 +18,8 @@ import tempfile
 import numpy as np
 from pathlib import Path
-MAX_FRAMES = 600  # ~20 s at 30 fps — raise for paid/GPU tiers
 class VideoProcessor:
@@ -45,9 +52,9 @@ class VideoProcessor:
         if not cap.isOpened():
             return None, "Could not open video file."
-        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         if total_frames > MAX_FRAMES:
@@ -58,14 +65,23 @@ class VideoProcessor:
                 "Please trim the video and try again."
             )
         # Temp file for raw processed frames (mp4v codec)
         raw_out_path = tempfile.mktemp(suffix="_raw.mp4")
-        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-        writer = cv2.VideoWriter(raw_out_path, fourcc, fps, (width, height))
-        frame_idx = 0
-        processed = 0
-        errors = 0
         while True:
             ret, frame = cap.read()
@@ -78,10 +94,19 @@ class VideoProcessor:
                     f"Processing frame {frame_idx + 1} / {total_frames}",
                 )
-            result_frame = self._process_frame(
-                source_bgr, frame, mode, enhance, blend_strength
             )
             if result_frame is not None:
                 writer.write(result_frame)
                 processed += 1
@@ -97,7 +122,6 @@ class VideoProcessor:
         # Re-encode with H.264 and merge original audio via FFmpeg
         final_path = self._ffmpeg_encode(video_path, raw_out_path)
-        # Clean up raw file
         try:
             os.unlink(raw_out_path)
         except OSError:
@@ -119,19 +143,27 @@ class VideoProcessor:
         mode: str,
         enhance: bool,
         blend_strength: float,
-    ) -> np.ndarray | None:
         try:
             if mode == "face" and self.face_swapper:
-                result, _ = self.face_swapper.swap(source_bgr, frame, enhance=enhance)
-                return result
             elif mode == "body" and self.body_swapper:
                 result, _ = self.body_swapper.swap(
                     source_bgr, frame, blend_strength=blend_strength
                 )
-                return result
         except Exception as e:
             print(f"[VideoProcessor] Frame error: {e}")
-        return None
     @staticmethod
     def _ffmpeg_encode(original_video_path: str, processed_raw_path: str) -> str:

 Video processor — extracts frames from an input video, applies face or body
 swap to each frame, then re-encodes the result with FFmpeg (audio preserved).
+Speed optimisations
+-------------------
+* Source face is detected **once** before the loop (never per-frame).
+* Target face detection is cached and reused for DET_INTERVAL frames — faces
+  don't move much between consecutive frames at normal frame rates.
+* Video frames are capped at 720p for processing (upscaled back for writing).
+* A hard cap of MAX_FRAMES is enforced to keep processing times reasonable on
+  free CPU tiers.
 """
 import cv2
 import numpy as np
 from pathlib import Path
+MAX_FRAMES   = 600   # ~20 s at 30 fps
+DET_INTERVAL = 5     # re-detect target faces every N frames
 class VideoProcessor:
         if not cap.isOpened():
             return None, "Could not open video file."
+        fps          = cap.get(cv2.CAP_PROP_FPS) or 25.0
+        width        = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height       = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         if total_frames > MAX_FRAMES:
                 "Please trim the video and try again."
             )
+        # ── Pre-compute source face once (big win for face-swap mode) ─────────
+        source_face = None
+        if mode == "face" and self.face_swapper:
+            source_face = self.face_swapper.get_source_face(source_bgr)
+            if source_face is None:
+                cap.release()
+                return None, "No face detected in source image."
         # Temp file for raw processed frames (mp4v codec)
         raw_out_path = tempfile.mktemp(suffix="_raw.mp4")
+        fourcc       = cv2.VideoWriter_fourcc(*"mp4v")
+        writer       = cv2.VideoWriter(raw_out_path, fourcc, fps, (width, height))
+        frame_idx        = 0
+        processed        = 0
+        errors           = 0
+        cached_tgt_faces = None   # reused across DET_INTERVAL frames
         while True:
             ret, frame = cap.read()
                     f"Processing frame {frame_idx + 1} / {total_frames}",
                 )
+            # Only re-detect target faces every DET_INTERVAL frames
+            use_cache = (mode == "face") and (frame_idx % DET_INTERVAL != 0) and (cached_tgt_faces is not None)
+            result_frame, new_faces = self._process_frame(
+                source_bgr, frame, mode, enhance, blend_strength,
+                source_face=source_face,
+                cached_target_faces=cached_tgt_faces if use_cache else None,
             )
+            # Refresh cache after a detection frame
+            if mode == "face" and new_faces is not None:
+                cached_tgt_faces = new_faces if new_faces else cached_tgt_faces
             if result_frame is not None:
                 writer.write(result_frame)
                 processed += 1
         # Re-encode with H.264 and merge original audio via FFmpeg
         final_path = self._ffmpeg_encode(video_path, raw_out_path)
         try:
             os.unlink(raw_out_path)
         except OSError:
         mode: str,
         enhance: bool,
         blend_strength: float,
+        source_face=None,
+        cached_target_faces=None,
+    ):
+        """Returns (result_frame_or_None, detected_faces_or_None)."""
         try:
             if mode == "face" and self.face_swapper:
+                result, faces = self.face_swapper.swap_frame(
+                    frame,
+                    source_face,
+                    cached_target_faces=cached_target_faces,
+                    enhance=enhance,
+                )
+                return result, faces
             elif mode == "body" and self.body_swapper:
                 result, _ = self.body_swapper.swap(
                     source_bgr, frame, blend_strength=blend_strength
                 )
+                return result, None
         except Exception as e:
             print(f"[VideoProcessor] Frame error: {e}")
+        return None, None
     @staticmethod
     def _ffmpeg_encode(original_video_path: str, processed_raw_path: str) -> str: