gyrus2 committed on
Commit
be38427
·
verified ·
1 Parent(s): df17517

Use system ffmpeg for fallback lip-sync; generate frames and encode via ffmpeg

Browse files
Files changed (1) hide show
  1. app.py +64 -42
app.py CHANGED
@@ -232,14 +232,14 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
232
  """
233
  Create a basic talking head animation without neural networks.
234
 
235
- This fallback implementation avoids heavy dependencies such as OpenCV by
236
- relying on Pillow to manipulate the avatar image. It estimates speech
237
- activity from the audio's RMS amplitude and animates the avatar by
238
- vertically stretching the mouth region. Each frame is generated by
239
- resizing this region using Pillow and then compiled into a video via
240
- MoviePy. Because MoviePy uses a bundled FFmpeg binary via
241
- ``imageio-ffmpeg``, this should work even if system FFmpeg is not
242
- installed.
243
 
244
  Parameters
245
  ----------
@@ -256,7 +256,6 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
256
  Path to the generated video file.
257
  """
258
  from PIL import Image # Pillow for image manipulation
259
- import moviepy.editor as mpy
260
 
261
  # Load avatar image (RGB)
262
  try:
@@ -278,7 +277,7 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
278
  samples = samples.reshape((-1, audio.channels)).mean(axis=1)
279
  frame_size = int(audio.frame_rate / fps)
280
  n_frames = max(int(len(samples) / frame_size), 1)
281
- amplitudes = []
282
  for i in range(n_frames):
283
  segment = samples[i * frame_size : (i + 1) * frame_size]
284
  if segment.size == 0:
@@ -290,43 +289,66 @@ def simple_lip_sync(image_path: Path, audio_path: Path, fps: int = 25) -> Path:
290
  max_amp = max(amplitudes) if amplitudes else 1.0
291
  if max_amp == 0:
292
  max_amp = 1.0
 
293
  amplitudes = [amp / max_amp for amp in amplitudes]
294
 
295
- frames = []
296
- for amp in amplitudes:
297
- # Compute scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
298
- factor = 1.0 + amp * 0.6
299
- # Start from a copy of the base image
300
- frame_img = img.copy()
301
- # Crop mouth region
302
- roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
303
- # Scale ROI vertically
304
- new_h = max(1, int(mouth_h * factor))
305
- scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
306
- # Compute overlay height (do not exceed image bounds)
307
- end_y = mouth_y + new_h
308
- if end_y > height:
309
- # Trim scaled ROI if it would overflow beyond the image bottom
310
- trim_h = height - mouth_y
311
- scaled = scaled.crop((0, 0, mouth_w, trim_h))
312
- end_y = height
313
- # Paste scaled ROI onto frame
314
- frame_img.paste(scaled, (mouth_x, mouth_y))
315
- # Convert to numpy array for MoviePy (RGB)
316
- frames.append(np.array(frame_img))
317
-
318
- # Use MoviePy to assemble the video and attach audio
319
  outputs_dir = Path("outputs")
320
  outputs_dir.mkdir(exist_ok=True)
321
  output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
322
- clip = mpy.ImageSequenceClip(frames, fps=fps)
323
- audio_clip = mpy.AudioFileClip(str(audio_path))
324
- # Trim audio to match video length if necessary
325
- min_duration = min(clip.duration, audio_clip.duration)
326
- clip = clip.set_audio(audio_clip.subclip(0, min_duration))
327
- clip = clip.set_duration(min_duration)
328
- # Write out using H.264 codec and AAC audio. MoviePy will use imageio-ffmpeg's bundled FFmpeg.
329
- clip.write_videofile(str(output_path), codec="libx264", audio_codec="aac", fps=fps, preset="ultrafast")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  return output_path
331
 
332
 
 
232
  """
233
  Create a basic talking head animation without neural networks.
234
 
235
+ This fallback implementation estimates speech activity from the audio's
236
+ root‑mean‑square (RMS) amplitude and stretches the mouth region of the
237
+ avatar image accordingly. Frames are saved to a temporary directory and
238
+ then stitched together with the original audio via the system ``ffmpeg``
239
+ binary. This avoids heavy Python dependencies (like OpenCV and
240
+ MoviePy) and works in network‑restricted environments as long as
241
+ ``ffmpeg`` is available (it is installed by default on Hugging Face
242
+ Spaces CPU images).
243
 
244
  Parameters
245
  ----------
 
256
  Path to the generated video file.
257
  """
258
  from PIL import Image # Pillow for image manipulation
 
259
 
260
  # Load avatar image (RGB)
261
  try:
 
277
  samples = samples.reshape((-1, audio.channels)).mean(axis=1)
278
  frame_size = int(audio.frame_rate / fps)
279
  n_frames = max(int(len(samples) / frame_size), 1)
280
+ amplitudes: list[float] = []
281
  for i in range(n_frames):
282
  segment = samples[i * frame_size : (i + 1) * frame_size]
283
  if segment.size == 0:
 
289
  max_amp = max(amplitudes) if amplitudes else 1.0
290
  if max_amp == 0:
291
  max_amp = 1.0
292
+ # Normalise amplitudes to [0, 1]
293
  amplitudes = [amp / max_amp for amp in amplitudes]
294
 
295
+ # Prepare output paths
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  outputs_dir = Path("outputs")
297
  outputs_dir.mkdir(exist_ok=True)
298
  output_path = outputs_dir / f"simple_{image_path.stem}.mp4"
299
+
300
+ # Create temporary directory for frames
301
+ with tempfile.TemporaryDirectory() as tmpdir:
302
+ frames_dir = Path(tmpdir)
303
+ # Generate each frame
304
+ for idx, amp in enumerate(amplitudes):
305
+ # Scaling factor between 1.0 (mouth closed) and 1.6 (fully open)
306
+ factor = 1.0 + amp * 0.6
307
+ # Start from a copy of the base image
308
+ frame_img = img.copy()
309
+ # Crop mouth region from the base image
310
+ roi = img.crop((mouth_x, mouth_y, mouth_x + mouth_w, mouth_y + mouth_h))
311
+ # Scale ROI vertically
312
+ new_h = max(1, int(mouth_h * factor))
313
+ scaled = roi.resize((mouth_w, new_h), Image.BILINEAR)
314
+ # Compute overlay height (do not exceed image bounds)
315
+ end_y = mouth_y + new_h
316
+ if end_y > height:
317
+ # Trim scaled ROI if it would overflow beyond the image bottom
318
+ trim_h = height - mouth_y
319
+ scaled = scaled.crop((0, 0, mouth_w, trim_h))
320
+ end_y = height
321
+ # Paste scaled ROI onto frame
322
+ frame_img.paste(scaled, (mouth_x, mouth_y))
323
+ # Save frame as PNG
324
+ frame_filename = frames_dir / f"frame_{idx:04d}.png"
325
+ frame_img.save(frame_filename)
326
+
327
+ # Assemble video using ffmpeg. The -shortest flag ensures that the
328
+ # output ends when the shorter of the audio or video streams ends. Use
329
+ # -loglevel error to suppress verbose output.
330
+ cmd = [
331
+ "ffmpeg",
332
+ "-y", # overwrite existing file
333
+ "-loglevel", "error",
334
+ "-framerate", str(fps),
335
+ "-i", str(frames_dir / "frame_%04d.png"),
336
+ "-i", str(audio_path),
337
+ "-c:v", "libx264",
338
+ "-pix_fmt", "yuv420p",
339
+ "-c:a", "aac",
340
+ "-shortest",
341
+ str(output_path),
342
+ ]
343
+ try:
344
+ subprocess.run(cmd, check=True)
345
+ except Exception as e:
346
+ # If ffmpeg fails (e.g. missing binary), raise a user‑visible error
347
+ raise RuntimeError(
348
+ f"Failed to assemble video with ffmpeg: {e}. "
349
+ "Ensure that the ffmpeg binary is available in the environment."
350
+ )
351
+
352
  return output_path
353
 
354