Spaces:
Running
Running
Use system ffmpeg for fallback lip-sync; generate frames and encode via ffmpeg
Browse files
app.py
CHANGED
|
@@ -232,14 +232,14 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
|
|
| 232 |
"""
|
| 233 |
Create a basic talking head animation without neural networks.
|
| 234 |
|
| 235 |
-
This fallback implementation
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
MoviePy
|
| 241 |
-
``
|
| 242 |
-
|
| 243 |
|
| 244 |
Parameters
|
| 245 |
----------
|
|
@@ -256,7 +256,6 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
|
|
| 256 |
Path to the generated video file.
|
| 257 |
"""
|
| 258 |
from PIL import Image # Pillow for image manipulation
|
| 259 |
-
import moviepy.editor as mpy
|
| 260 |
|
| 261 |
# Load avatar image (RGB)
|
| 262 |
try:
|
|
@@ -278,7 +277,7 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
|
|
| 278 |
samples = samples.reshape((-1, audio.channels)).mean(axis=1)
|
| 279 |
frame_size = int(audio.frame_rate / fps)
|
| 280 |
n_frames = max(int(len(samples) / frame_size), 1)
|
| 281 |
-
amplitudes = []
|
| 282 |
for i in range(n_frames):
|
| 283 |
segment = samples[i * frame_size : (i + 1) * frame_size]
|
| 284 |
if segment.size == 0:
|
|
@@ -290,43 +289,66 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
|
|
| 290 |
max_amp = max(amplitudes) if amplitudes else 1.0
|
| 291 |
if max_amp == 0:
|
| 292 |
max_amp = 1.0
|
|
|
|
| 293 |
amplitudes = [amp / max_amp for amp in amplitudes]
|
| 294 |
|
| 295 |
-
|
| 296 |
-
for amp in amplitudes:
|
| 297 |
-
# Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
|
| 298 |
-
factor = 1.0 + amp * 0.6
|
| 299 |
-
# Start from a copy of the base image
|
| 300 |
-
frame_img = img.copy()
|
| 301 |
-
# Crop mouth region
|
| 302 |
-
roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
|
| 303 |
-
# Scale ROI vertically
|
| 304 |
-
new_h = max(1, int(mouth_h * factor))
|
| 305 |
-
scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
|
| 306 |
-
# Compute overlay height (do not exceed image bounds)
|
| 307 |
-
end_y = mouth_y + new_h
|
| 308 |
-
if end_y > height:
|
| 309 |
-
# Trim scaled ROI if it would overflow beyond the image bottom
|
| 310 |
-
trim_h = height - mouth_y
|
| 311 |
-
scaled = scaled.crop((0, 0, mouth_w, trim_h))
|
| 312 |
-
end_y = height
|
| 313 |
-
# Paste scaled ROI onto frame
|
| 314 |
-
frame_img.paste(scaled, (mouth_x, mouth_y))
|
| 315 |
-
# Convert to numpy array for MoviePy (RGB)
|
| 316 |
-
frames.append(np.array(frame_img))
|
| 317 |
-
|
| 318 |
-
# Use MoviePy to assemble the video and attach audio
|
| 319 |
outputs_dir = Path("outputs")
|
| 320 |
outputs_dir.mkdir(exist_ok=True)
|
| 321 |
output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
return output_path
|
| 331 |
|
| 332 |
|
|
|
|
| 232 |
"""
|
| 233 |
Create a basic talking head animation without neural networks.
|
| 234 |
|
| 235 |
+
This fallback implementation estimates speech activity from the audio's
|
| 236 |
+
root-mean-square (RMS) amplitude and stretches the mouth region of the
|
| 237 |
+
avatar image accordingly. Frames are saved to a temporary directory and
|
| 238 |
+
then stitched together with the original audio via the system ``ffmpeg``
|
| 239 |
+
binary. This avoids heavy Python dependencies (like OpenCV and
|
| 240 |
+
MoviePy) and works in network-restricted environments as long as
|
| 241 |
+
``ffmpeg`` is available (it is installed by default on Hugging Face
|
| 242 |
+
Spaces CPU images).
|
| 243 |
|
| 244 |
Parameters
|
| 245 |
----------
|
|
|
|
| 256 |
Path to the generated video file.
|
| 257 |
"""
|
| 258 |
from PIL import Image # Pillow for image manipulation
|
|
|
|
| 259 |
|
| 260 |
# Load avatar image (RGB)
|
| 261 |
try:
|
|
|
|
| 277 |
samples = samples.reshape((-1, audio.channels)).mean(axis=1)
|
| 278 |
frame_size = int(audio.frame_rate / fps)
|
| 279 |
n_frames = max(int(len(samples) / frame_size), 1)
|
| 280 |
+
amplitudes: list[float] = []
|
| 281 |
for i in range(n_frames):
|
| 282 |
segment = samples[i * frame_size : (i + 1) * frame_size]
|
| 283 |
if segment.size == 0:
|
|
|
|
| 289 |
max_amp = max(amplitudes) if amplitudes else 1.0
|
| 290 |
if max_amp == 0:
|
| 291 |
max_amp = 1.0
|
| 292 |
+
# Normalise amplitudes to [0, 1]
|
| 293 |
amplitudes = [amp / max_amp for amp in amplitudes]
|
| 294 |
|
| 295 |
+
# Prepare output paths
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
outputs_dir = Path("outputs")
|
| 297 |
outputs_dir.mkdir(exist_ok=True)
|
| 298 |
output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
|
| 299 |
+
|
| 300 |
+
# Create temporary directory for frames
|
| 301 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 302 |
+
frames_dir = Path(tmpdir)
|
| 303 |
+
# Generate each frame
|
| 304 |
+
for idx, amp in enumerate(amplitudes):
|
| 305 |
+
# Scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
|
| 306 |
+
factor = 1.0 + amp * 0.6
|
| 307 |
+
# Start from a copy of the base image
|
| 308 |
+
frame_img = img.copy()
|
| 309 |
+
# Crop mouth region from the base image
|
| 310 |
+
roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
|
| 311 |
+
# Scale ROI vertically
|
| 312 |
+
new_h = max(1, int(mouth_h * factor))
|
| 313 |
+
scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
|
| 314 |
+
# Compute overlay height (do not exceed image bounds)
|
| 315 |
+
end_y = mouth_y + new_h
|
| 316 |
+
if end_y > height:
|
| 317 |
+
# Trim scaled ROI if it would overflow beyond the image bottom
|
| 318 |
+
trim_h = height - mouth_y
|
| 319 |
+
scaled = scaled.crop((0, 0, mouth_w, trim_h))
|
| 320 |
+
end_y = height
|
| 321 |
+
# Paste scaled ROI onto frame
|
| 322 |
+
frame_img.paste(scaled, (mouth_x, mouth_y))
|
| 323 |
+
# Save frame as PNG
|
| 324 |
+
frame_filename = frames_dir / f"frame_{idx:04d}.png"
|
| 325 |
+
frame_img.save(frame_filename)
|
| 326 |
+
|
| 327 |
+
# Assemble video using ffmpeg. The -shortest flag ensures that the
|
| 328 |
+
# output ends when the shorter of the audio or video streams ends. Use
|
| 329 |
+
# -loglevel error to suppress verbose output.
|
| 330 |
+
cmd = [
|
| 331 |
+
"ffmpeg",
|
| 332 |
+
"-y", # overwrite existing file
|
| 333 |
+
"-loglevel", "error",
|
| 334 |
+
"-framerate", str(fps),
|
| 335 |
+
"-i", str(frames_dir / "frame_%04d.png"),
|
| 336 |
+
"-i", str(audio_path),
|
| 337 |
+
"-c:v", "libx264",
|
| 338 |
+
"-pix_fmt", "yuv420p",
|
| 339 |
+
"-c:a", "aac",
|
| 340 |
+
"-shortest",
|
| 341 |
+
str(output_path),
|
| 342 |
+
]
|
| 343 |
+
try:
|
| 344 |
+
subprocess.run(cmd, check=True)
|
| 345 |
+
except Exception as e:
|
| 346 |
+
# If ffmpeg fails (e.g. missing binary), raise a user-visible error
|
| 347 |
+
raise RuntimeError(
|
| 348 |
+
f"Failed to assemble video with ffmpeg: {e}. "
|
| 349 |
+
"Ensure that the ffmpeg binary is available in the environment."
|
| 350 |
+
)
|
| 351 |
+
|
| 352 |
return output_path
|
| 353 |
|
| 354 |
|