gyrus2 committed on
Commit
df17517
·
verified ·
1 Parent(s): fcb16b9

Add fallback lip-sync algorithm using amplitude-driven mouth animation and update README accordingly

Browse files
Files changed (2) hide show
  1. app.py +31 -25
  2. requirements.txt +4 -2
app.py CHANGED
@@ -232,13 +232,14 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
232
  """
233
  Create a basic talking head animation without neural networks.
234
 
235
- The fallback algorithm estimates speech activity from the audio's RMS
236
- amplitude and animates the avatar by vertically scaling the mouth region
237
- accordingly. The mouth is approximated as a box located in the lower
238
- portion of the image. Each frame is generated by resizing this region
239
- based on the normalised amplitude for that time slice. The resulting
240
- frames are compiled into a video using MoviePy and the original audio is
241
- attached.
 
242
 
243
  Parameters
244
  ----------
@@ -254,14 +255,15 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
254
  Path
255
  Path to the generated video file.
256
  """
257
- import cv2 # imported here to avoid mandatory dependency for users who provide Wav2Lip models
258
  import moviepy.editor as mpy
259
 
260
- # Load avatar image (BGR)
261
- img = cv2.imread(str(image_path))
262
- if img is None:
 
263
  raise RuntimeError("Failed to load the avatar image. Please ensure the file is a valid image.")
264
- height, width, _ = img.shape
265
  # Approximate mouth bounding box (tune proportions if necessary)
266
  mouth_w = int(width * 0.6)
267
  mouth_h = int(height * 0.15)
@@ -294,32 +296,36 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
294
  for amp in amplitudes:
295
  # Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
296
  factor = 1.0 + amp * 0.6
297
- frame_bgr = img.copy()
298
- # Extract mouth ROI
299
- roi = frame_bgr[mouth_y : mouth_y + mouth_h, mouth_x : mouth_x + mouth_w]
 
300
  # Scale ROI vertically
301
  new_h = max(1, int(mouth_h * factor))
302
- scaled = cv2.resize(roi, (mouth_w, new_h), interpolation=cv2.INTER_LINEAR)
303
- # Determine overlay region bounds (ensure we don't write outside image)
304
- end_y = min(height, mouth_y + new_h)
305
- overlay = scaled[: end_y - mouth_y, :, :]
306
- frame_bgr[mouth_y:end_y, mouth_x : mouth_x + mouth_w] = overlay
307
- # Convert to RGB for MoviePy
308
- frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
309
- frames.append(frame_rgb)
 
 
 
 
310
 
311
  # Use MoviePy to assemble the video and attach audio
312
  outputs_dir = Path("outputs")
313
  outputs_dir.mkdir(exist_ok=True)
314
  output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
315
  clip = mpy.ImageSequenceClip(frames, fps=fps)
316
- # Attach audio
317
  audio_clip = mpy.AudioFileClip(str(audio_path))
318
  # Trim audio to match video length if necessary
319
  min_duration = min(clip.duration, audio_clip.duration)
320
  clip = clip.set_audio(audio_clip.subclip(0, min_duration))
321
  clip = clip.set_duration(min_duration)
322
- # Write out using H.264 codec and AAC audio. Use preset ultrafast to reduce CPU usage.
323
  clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
324
  return output_path
325
 
 
232
  """
233
  Create a basic talking head animation without neural networks.
234
 
235
+ This fallback implementation avoids heavy dependencies such as OpenCV by
236
+ relying on Pillow to manipulate the avatar image. It estimates speech
237
+ activity from the audio's RMS amplitude and animates the avatar by
238
+ vertically stretching the mouth region. Each frame is generated by
239
+ resizing this region using Pillow and then compiled into a video via
240
+ MoviePy. Because MoviePy uses a bundled FFmpeg binary via
241
+ ``imageio-ffmpeg``, this should work even if system FFmpeg is not
242
+ installed.
243
 
244
  Parameters
245
  ----------
 
255
  Path
256
  Path to the generated video file.
257
  """
258
+ from PIL import Image # Pillow for image manipulation
259
  import moviepy.editor as mpy
260
 
261
+ # Load avatar image (RGB)
262
+ try:
263
+ img = Image.open(str(image_path)).convert("RGB")
264
+ except Exception:
265
  raise RuntimeError("Failed to load the avatar image. Please ensure the file is a valid image.")
266
+ width, height = img.size
267
  # Approximate mouth bounding box (tune proportions if necessary)
268
  mouth_w = int(width * 0.6)
269
  mouth_h = int(height * 0.15)
 
296
  for amp in amplitudes:
297
  # Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
298
  factor = 1.0 + amp * 0.6
299
+ # Start from a copy of the base image
300
+ frame_img = img.copy()
301
+ # Crop mouth region
302
+ roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
303
  # Scale ROI vertically
304
  new_h = max(1, int(mouth_h * factor))
305
+ scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
306
+ # Compute overlay height (do not exceed image bounds)
307
+ end_y = mouth_y + new_h
308
+ if end_y > height:
309
+ # Trim scaled ROI if it would overflow beyond the image bottom
310
+ trim_h = height - mouth_y
311
+ scaled = scaled.crop((0, 0, mouth_w, trim_h))
312
+ end_y = height
313
+ # Paste scaled ROI onto frame
314
+ frame_img.paste(scaled, (mouth_x, mouth_y))
315
+ # Convert to numpy array for MoviePy (RGB)
316
+ frames.append(np.array(frame_img))
317
 
318
  # Use MoviePy to assemble the video and attach audio
319
  outputs_dir = Path("outputs")
320
  outputs_dir.mkdir(exist_ok=True)
321
  output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
322
  clip = mpy.ImageSequenceClip(frames, fps=fps)
 
323
  audio_clip = mpy.AudioFileClip(str(audio_path))
324
  # Trim audio to match video length if necessary
325
  min_duration = min(clip.duration, audio_clip.duration)
326
  clip = clip.set_audio(audio_clip.subclip(0, min_duration))
327
  clip = clip.set_duration(min_duration)
328
+ # Write out using H.264 codec and AAC audio. MoviePy will use imageio-ffmpeg's bundled FFmpeg.
329
  clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
330
  return output_path
331
 
requirements.txt CHANGED
@@ -21,5 +21,7 @@ tqdm
21
  # FFmpeg bindings (used by moviepy/pydub). Note: the FFmpeg binary is provided by the Spaces environment.
22
  ffmpeg-python
23
 
24
- # Optional: OpenCV for future enhancements (not strictly required by the current app but lightweight)
25
- opencv-python
 
 
 
21
  # FFmpeg bindings (used by moviepy/pydub). Note: the FFmpeg binary is provided by the Spaces environment.
22
  ffmpeg-python
23
 
24
+ # Pillow is used for image processing in the fallback lip-sync implementation.
25
+ pillow
26
+
27
+ # OpenCV was removed because the fallback algorithm now uses Pillow exclusively.