Spaces:
Running
Running
Add fallback lip-sync algorithm using amplitude-driven mouth animation and update README accordingly
Browse files- app.py +31 -25
- requirements.txt +4 -2
app.py
CHANGED
|
@@ -232,13 +232,14 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
|
|
| 232 |
"""
|
| 233 |
Create a basic talking head animation without neural networks.
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
|
|
|
| 242 |
|
| 243 |
Parameters
|
| 244 |
----------
|
|
@@ -254,14 +255,15 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
|
|
| 254 |
Path
|
| 255 |
Path to the generated video file.
|
| 256 |
"""
|
| 257 |
-
import
|
| 258 |
import moviepy.editor as mpy
|
| 259 |
|
| 260 |
-
# Load avatar image (
|
| 261 |
-
|
| 262 |
-
|
|
|
|
| 263 |
raise RuntimeError("Failed to load the avatar image. Please ensure the file is a valid image.")
|
| 264 |
-
|
| 265 |
# Approximate mouth bounding box (tune proportions if necessary)
|
| 266 |
mouth_w = int(width * 0.6)
|
| 267 |
mouth_h = int(height * 0.15)
|
|
@@ -294,32 +296,36 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
|
|
| 294 |
for amp in amplitudes:
|
| 295 |
# Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
|
| 296 |
factor = 1.0 + amp * 0.6
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
| 300 |
# Scale ROI vertically
|
| 301 |
new_h = max(1, int(mouth_h * factor))
|
| 302 |
-
scaled =
|
| 303 |
-
#
|
| 304 |
-
end_y =
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
# Use MoviePy to assemble the video and attach audio
|
| 312 |
outputs_dir = Path("outputs")
|
| 313 |
outputs_dir.mkdir(exist_ok=True)
|
| 314 |
output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
|
| 315 |
clip = mpy.ImageSequenceClip(frames, fps=fps)
|
| 316 |
-
# Attach audio
|
| 317 |
audio_clip = mpy.AudioFileClip(str(audio_path))
|
| 318 |
# Trim both audio and video to the shorter of the two durations
|
| 319 |
min_duration = min(clip.duration, audio_clip.duration)
|
| 320 |
clip = clip.set_audio(audio_clip.subclip(0, min_duration))
|
| 321 |
clip = clip.set_duration(min_duration)
|
| 322 |
-
# Write out using H.264 codec and AAC audio.
|
| 323 |
clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
|
| 324 |
return output_path
|
| 325 |
|
|
|
|
| 232 |
"""
|
| 233 |
Create a basic talking head animation without neural networks.
|
| 234 |
|
| 235 |
+
This fallback implementation avoids heavy dependencies such as OpenCV by
|
| 236 |
+
relying on Pillow to manipulate the avatar image. It estimates speech
|
| 237 |
+
activity from the audio's RMS amplitude and animates the avatar by
|
| 238 |
+
vertically stretching the mouth region. Each frame is generated by
|
| 239 |
+
resizing this region using Pillow and then compiled into a video via
|
| 240 |
+
MoviePy. Because MoviePy uses a bundled FFmpeg binary via
|
| 241 |
+
``imageio-ffmpeg``, this should work even if system FFmpeg is not
|
| 242 |
+
installed.
|
| 243 |
|
| 244 |
Parameters
|
| 245 |
----------
|
|
|
|
| 255 |
Path
|
| 256 |
Path to the generated video file.
|
| 257 |
"""
|
| 258 |
+
from PIL import Image # Pillow for image manipulation
|
| 259 |
import moviepy.editor as mpy
|
| 260 |
|
| 261 |
+
# Load avatar image (RGB)
|
| 262 |
+
try:
|
| 263 |
+
img = Image.open(str(image_path)).convert("RGB")
|
| 264 |
+
except Exception:
|
| 265 |
raise RuntimeError("Failed to load the avatar image. Please ensure the file is a valid image.")
|
| 266 |
+
width, height = img.size
|
| 267 |
# Approximate mouth bounding box (tune proportions if necessary)
|
| 268 |
mouth_w = int(width * 0.6)
|
| 269 |
mouth_h = int(height * 0.15)
|
|
|
|
| 296 |
for amp in amplitudes:
|
| 297 |
# Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
|
| 298 |
factor = 1.0 + amp * 0.6
|
| 299 |
+
# Start from a copy of the base image
|
| 300 |
+
frame_img = img.copy()
|
| 301 |
+
# Crop mouth region
|
| 302 |
+
roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
|
| 303 |
# Scale ROI vertically
|
| 304 |
new_h = max(1, int(mouth_h * factor))
|
| 305 |
+
scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
|
| 306 |
+
# Compute the bottom edge of the overlay (it must not exceed the image bounds)
|
| 307 |
+
end_y = mouth_y + new_h
|
| 308 |
+
if end_y > height:
|
| 309 |
+
# Trim scaled ROI if it would overflow beyond the image bottom
|
| 310 |
+
trim_h = height - mouth_y
|
| 311 |
+
scaled = scaled.crop((0, 0, mouth_w, trim_h))
|
| 312 |
+
end_y = height
|
| 313 |
+
# Paste scaled ROI onto frame
|
| 314 |
+
frame_img.paste(scaled, (mouth_x, mouth_y))
|
| 315 |
+
# Convert to numpy array for MoviePy (RGB)
|
| 316 |
+
frames.append(np.array(frame_img))
|
| 317 |
|
| 318 |
# Use MoviePy to assemble the video and attach audio
|
| 319 |
outputs_dir = Path("outputs")
|
| 320 |
outputs_dir.mkdir(exist_ok=True)
|
| 321 |
output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
|
| 322 |
clip = mpy.ImageSequenceClip(frames, fps=fps)
|
|
|
|
| 323 |
audio_clip = mpy.AudioFileClip(str(audio_path))
|
| 324 |
# Trim both audio and video to the shorter of the two durations
|
| 325 |
min_duration = min(clip.duration, audio_clip.duration)
|
| 326 |
clip = clip.set_audio(audio_clip.subclip(0, min_duration))
|
| 327 |
clip = clip.set_duration(min_duration)
|
| 328 |
+
# Write out using H.264 codec and AAC audio. MoviePy will use imageio-ffmpeg's bundled FFmpeg.
|
| 329 |
clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
|
| 330 |
return output_path
|
| 331 |
|
requirements.txt
CHANGED
|
@@ -21,5 +21,7 @@ tqdm
|
|
| 21 |
# FFmpeg bindings (used by moviepy/pydub). Note: the FFmpeg binary is provided by the Spaces environment.
|
| 22 |
ffmpeg-python
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
|
|
|
|
|
|
|
|
| 21 |
# FFmpeg bindings (used by moviepy/pydub). Note: the FFmpeg binary is provided by the Spaces environment.
|
| 22 |
ffmpeg-python
|
| 23 |
|
| 24 |
+
# Pillow is used for image processing in the fallback lip-sync implementation.
|
| 25 |
+
pillow
|
| 26 |
+
|
| 27 |
+
# OpenCV is intentionally not listed: the fallback algorithm now uses Pillow exclusively.
|