Spaces:

gyrus2
/

lip-sync-generator

Running

App Files Community

gyrus2 committed on Nov 13, 2025

Commit

be38427

verified ·

1 Parent(s): df17517

Use system ffmpeg for fallback lip-sync; generate frames and encode via ffmpeg

Browse files

Files changed (1) hide show

app.py +64 -42

app.py CHANGED Viewed

@@ -232,14 +232,14 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
     """
     Create a basic talking head animation without neural networks.
-    This fallback implementation avoids heavy dependencies such as OpenCV by
-    relying on Pillow to manipulate the avatar image.  It estimates speech
-    activity from the audio's RMS amplitude and animates the avatar by
-    vertically stretching the mouth region.  Each frame is generated by
-    resizing this region using Pillow and then compiled into a video via
-    MoviePy.  Because MoviePy uses a bundled FFmpeg binary via
-    ``imageio-ffmpeg``, this should work even if system FFmpeg is not
-    installed.
     Parameters
     ----------
@@ -256,7 +256,6 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
         Path to the generated video file.
     """
     from PIL import Image  # Pillow for image manipulation
-    import moviepy.editor as mpy
     # Load avatar image (RGB)
     try:
@@ -278,7 +277,7 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
         samples = samples.reshape((-1, audio.channels)).mean(axis=1)
     frame_size = int(audio.frame_rate / fps)
     n_frames = max(int(len(samples) / frame_size), 1)
-    amplitudes = []
     for i in range(n_frames):
         segment = samples[i * frame_size : (i + 1) * frame_size]
         if segment.size == 0:
@@ -290,43 +289,66 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
     max_amp = max(amplitudes) if amplitudes else 1.0
     if max_amp == 0:
         max_amp = 1.0
     amplitudes = [amp / max_amp for amp in amplitudes]
-    frames = []
-    for amp in amplitudes:
-        # Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
-        factor = 1.0 + amp * 0.6
-        # Start from a copy of the base image
-        frame_img = img.copy()
-        # Crop mouth region
-        roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
-        # Scale ROI vertically
-        new_h = max(1, int(mouth_h * factor))
-        scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
-        # Compute overlay height (do not exceed image bounds)
-        end_y = mouth_y + new_h
-        if end_y > height:
-            # Trim scaled ROI if it would overflow beyond the image bottom
-            trim_h = height - mouth_y
-            scaled = scaled.crop((0, 0, mouth_w, trim_h))
-            end_y = height
-        # Paste scaled ROI onto frame
-        frame_img.paste(scaled, (mouth_x, mouth_y))
-        # Convert to numpy array for MoviePy (RGB)
-        frames.append(np.array(frame_img))
-    # Use MoviePy to assemble the video and attach audio
     outputs_dir = Path("outputs")
     outputs_dir.mkdir(exist_ok=True)
     output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
-    clip = mpy.ImageSequenceClip(frames, fps=fps)
-    audio_clip = mpy.AudioFileClip(str(audio_path))
-    # Trim audio to match video length if necessary
-    min_duration = min(clip.duration, audio_clip.duration)
-    clip = clip.set_audio(audio_clip.subclip(0, min_duration))
-    clip = clip.set_duration(min_duration)
-    # Write out using H.264 codec and AAC audio.  MoviePy will use imageio-ffmpeg's bundled FFmpeg.
-    clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
     return output_path

     """
     Create a basic talking head animation without neural networks.
+    This fallback implementation estimates speech activity from the audio's
+    root‑mean‑square (RMS) amplitude and stretches the mouth region of the
+    avatar image accordingly.  Frames are saved to a temporary directory and
+    then stitched together with the original audio via the system ``ffmpeg``
+    binary.  This avoids heavy Python dependencies (like OpenCV and
+    MoviePy) and works in network‑restricted environments as long as
+    ``ffmpeg`` is available (it is installed by default on Hugging Face
+    Spaces CPU images).
     Parameters
     ----------
         Path to the generated video file.
     """
     from PIL import Image  # Pillow for image manipulation
     # Load avatar image (RGB)
     try:
         samples = samples.reshape((-1, audio.channels)).mean(axis=1)
     frame_size = int(audio.frame_rate / fps)
     n_frames = max(int(len(samples) / frame_size), 1)
+    amplitudes: list[float] = []
     for i in range(n_frames):
         segment = samples[i * frame_size : (i + 1) * frame_size]
         if segment.size == 0:
     max_amp = max(amplitudes) if amplitudes else 1.0
     if max_amp == 0:
         max_amp = 1.0
+    # Normalise amplitudes to [0, 1]
     amplitudes = [amp / max_amp for amp in amplitudes]
+    # Prepare output paths
     outputs_dir = Path("outputs")
     outputs_dir.mkdir(exist_ok=True)
     output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
+    # Create temporary directory for frames
+    with tempfile.TemporaryDirectory() as tmpdir:
+        frames_dir = Path(tmpdir)
+        # Generate each frame
+        for idx, amp in enumerate(amplitudes):
+            # Scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
+            factor = 1.0 + amp * 0.6
+            # Start from a copy of the base image
+            frame_img = img.copy()
+            # Crop mouth region from the base image
+            roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
+            # Scale ROI vertically
+            new_h = max(1, int(mouth_h * factor))
+            scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
+            # Compute overlay height (do not exceed image bounds)
+            end_y = mouth_y + new_h
+            if end_y > height:
+                # Trim scaled ROI if it would overflow beyond the image bottom
+                trim_h = height - mouth_y
+                scaled = scaled.crop((0, 0, mouth_w, trim_h))
+                end_y = height
+            # Paste scaled ROI onto frame
+            frame_img.paste(scaled, (mouth_x, mouth_y))
+            # Save frame as PNG
+            frame_filename = frames_dir / f"frame_{idx:04d}.png"
+            frame_img.save(frame_filename)
+        # Assemble video using ffmpeg.  The -shortest flag ensures that the
+        # output ends when the shorter of the audio or video streams ends.  Use
+        # -loglevel error to suppress verbose output.
+        cmd = [
+            "ffmpeg",
+            "-y",  # overwrite existing file
+            "-loglevel", "error",
+            "-framerate", str(fps),
+            "-i", str(frames_dir / "frame_%04d.png"),
+            "-i", str(audio_path),
+            "-c:v", "libx264",
+            "-pix_fmt", "yuv420p",
+            "-c:a", "aac",
+            "-shortest",
+            str(output_path),
+        ]
+        try:
+            subprocess.run(cmd, check=True)
+        except Exception as e:
+            # If ffmpeg fails (e.g. missing binary), raise a user‑visible error
+            raise RuntimeError(
+                f"Failed to assemble video with ffmpeg: {e}. "
+                "Ensure that the ffmpeg binary is available in the environment."
+            )
     return output_path