gyrus2 committed on
Commit
df17517
·
verified ·
1 Parent(s): fcb16b9

Add fallback lip-sync algorithm using amplitude-driven mouth animation and update README accordingly

Browse files
Files changed (2) hide show
  1. app.py +31 -25
  2. requirements.txt +4 -2
app.py CHANGED
@@ -232,13 +232,14 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
232
  """
233
  Create a basic talking head animation without neural networks.
234
 
235
- The fallback algorithm estimates speech activity from the audio's RMS
236
- amplitude and animates the avatar by vertically scaling the mouth region
237
- accordingly. The mouth is approximated as a box located in the lower
238
- portion of the image. Each frame is generated by resizing this region
239
- based on the normalised amplitude for that time slice. The resulting
240
- frames are compiled into a video using MoviePy and the original audio is
241
- attached.
 
242
 
243
  Parameters
244
  ----------
@@ -254,14 +255,15 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
254
  Path
255
  Path to the generated video file.
256
  """
257
- import cv2 # imported here to avoid mandatory dependency for users who provide Wav2Lip models
258
  import moviepy.editor as mpy
259
 
260
- # Load avatar image (BGR)
261
- img = cv2.imread(str(image_path))
262
- if img is None:
 
263
  raise RuntimeError("Failed to load the avatar image. Please ensure the file is a valid image.")
264
- height, width, _ = img.shape
265
  # Approximate mouth bounding box (tune proportions if necessary)
266
  mouth_w = int(width * 0.6)
267
  mouth_h = int(height * 0.15)
@@ -294,32 +296,36 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
294
  for amp in amplitudes:
295
  # Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
296
  factor = 1.0 + amp * 0.6
297
- frame_bgr = img.copy()
298
- # Extract mouth ROI
299
- roi = frame_bgr[mouth_y : mouth_y + mouth_h, mouth_x : mouth_x + mouth_w]
 
300
  # Scale ROI vertically
301
  new_h = max(1, int(mouth_h * factor))
302
- scaled = cv2.resize(roi, (mouth_w, new_h), interpolation=cv2.INTER_LINEAR)
303
- # Determine overlay region bounds (ensure we don't write outside image)
304
- end_y = min(height, mouth_y + new_h)
305
- overlay = scaled[: end_y - mouth_y, :, :]
306
- frame_bgr[mouth_y:end_y, mouth_x : mouth_x + mouth_w] = overlay
307
- # Convert to RGB for MoviePy
308
- frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
309
- frames.append(frame_rgb)
 
 
 
 
310
 
311
  # Use MoviePy to assemble the video and attach audio
312
  outputs_dir = Path("outputs")
313
  outputs_dir.mkdir(exist_ok=True)
314
  output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
315
  clip = mpy.ImageSequenceClip(frames, fps=fps)
316
- # Attach audio
317
  audio_clip = mpy.AudioFileClip(str(audio_path))
318
  # Trim audio to match video length if necessary
319
  min_duration = min(clip.duration, audio_clip.duration)
320
  clip = clip.set_audio(audio_clip.subclip(0, min_duration))
321
  clip = clip.set_duration(min_duration)
322
- # Write out using H.264 codec and AAC audio. Use preset ultrafast to reduce CPU usage.
323
  clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
324
  return output_path
325
 
 
232
  """
233
  Create a basic talking head animation without neural networks.
234
 
235
+ This fallback implementation avoids heavy dependencies such as OpenCV by
236
+ relying on Pillow to manipulate the avatar image. It estimates speech
237
+ activity from the audio's RMS amplitude and animates the avatar by
238
+ vertically stretching the mouth region. Each frame is generated by
239
+ resizing this region using Pillow and then compiled into a video via
240
+ MoviePy. Because MoviePy uses a bundled FFmpeg binary via
241
+ ``imageio-ffmpeg``, this should work even if system FFmpeg is not
242
+ installed.
243
 
244
  Parameters
245
  ----------
 
255
  Path
256
  Path to the generated video file.
257
  """
258
+ from PIL import Image # Pillow for image manipulation
259
  import moviepy.editor as mpy
260
 
261
+ # Load avatar image (RGB)
262
+ try:
263
+ img = Image.open(str(image_path)).convert("RGB")
264
+ except Exception:
265
  raise RuntimeError("Failed to load the avatar image. Please ensure the file is a valid image.")
266
+ width, height = img.size
267
  # Approximate mouth bounding box (tune proportions if necessary)
268
  mouth_w = int(width * 0.6)
269
  mouth_h = int(height * 0.15)
 
296
  for amp in amplitudes:
297
  # Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
298
  factor = 1.0 + amp * 0.6
299
+ # Start from a copy of the base image
300
+ frame_img = img.copy()
301
+ # Crop mouth region
302
+ roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
303
  # Scale ROI vertically
304
  new_h = max(1, int(mouth_h * factor))
305
+ scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
306
+ # Compute overlay height (do not exceed image bounds)
307
+ end_y = mouth_y + new_h
308
+ if end_y > height:
309
+ # Trim scaled ROI if it would overflow beyond the image bottom
310
+ trim_h = height - mouth_y
311
+ scaled = scaled.crop((0, 0, mouth_w, trim_h))
312
+ end_y = height
313
+ # Paste scaled ROI onto frame
314
+ frame_img.paste(scaled, (mouth_x, mouth_y))
315
+ # Convert to numpy array for MoviePy (RGB)
316
+ frames.append(np.array(frame_img))
317
 
318
  # Use MoviePy to assemble the video and attach audio
319
  outputs_dir = Path("outputs")
320
  outputs_dir.mkdir(exist_ok=True)
321
  output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
322
  clip = mpy.ImageSequenceClip(frames, fps=fps)
 
323
  audio_clip = mpy.AudioFileClip(str(audio_path))
324
  # Trim audio to match video length if necessary
325
  min_duration = min(clip.duration, audio_clip.duration)
326
  clip = clip.set_audio(audio_clip.subclip(0, min_duration))
327
  clip = clip.set_duration(min_duration)
328
+ # Write out using H.264 codec and AAC audio. MoviePy will use imageio-ffmpeg's bundled FFmpeg.
329
  clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
330
  return output_path
331
 
requirements.txt CHANGED
@@ -21,5 +21,7 @@ tqdm
21
  # FFmpeg bindings (used by moviepy/pydub). Note: the FFmpeg binary is provided by the Spaces environment.
22
  ffmpeg-python
23
 
24
- # Optional: OpenCV for future enhancements (not strictly required by the current app but lightweight)
25
- opencv-python
 
 
 
21
  # FFmpeg bindings (used by moviepy/pydub). Note: the FFmpeg binary is provided by the Spaces environment.
22
  ffmpeg-python
23
 
24
+ # Pillow is used for image processing in the fallback lip-sync implementation.
25
+ pillow
26
+
27
+ # OpenCV was removed because the fallback algorithm now uses Pillow exclusively.