# Translator/utils/lipsync.py
"""
Lip-Sync Animation Module
=========================
Generates animated GIFs with lip-sync based on audio amplitude.
Functions:
- audio_to_rms_chunks: Extract amplitude data from audio
- generate_lipsync_gif: Create lip-sync animation GIF
"""
from PIL import Image
import imageio
from pathlib import Path
import time
from typing import List, Optional
import os
def audio_to_rms_chunks(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Compute per-chunk RMS (Root Mean Square) amplitude values for an audio file.

    The audio is divided into segments of ``chunk_ms`` milliseconds each and
    the RMS ("loudness") of every segment is collected.

    Args:
        audio_path: Path to the audio file (MP3)
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        One RMS value per non-empty chunk ([0] if no chunks), or a canned
        fallback sequence if audio processing fails for any reason.
    """
    try:
        from pydub import AudioSegment
        from pydub.utils import make_chunks

        # Fail fast with a clear message rather than a decoder error
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        segment = AudioSegment.from_file(audio_path)
        pieces = make_chunks(segment, chunk_ms)
        amplitudes = [piece.rms for piece in pieces if len(piece) > 0]
        return amplitudes or [0]
    except Exception as e:
        # Best-effort by design: any failure (missing pydub/ffmpeg, bad file)
        # degrades to a canned sequence so the animation still works.
        print(f"Error processing audio: {e}")
        return [100, 200, 150, 300, 250, 100, 200, 150]  # Fallback animation
def audio_to_rms_chunks_simple(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Fallback generator of synthetic RMS values derived from file size.

    Used when pydub/ffmpeg is unavailable or fails; produces a smooth
    wave-like amplitude pattern so the lip-sync still looks natural.

    Args:
        audio_path: Path to the audio file
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of simulated RMS values
    """
    import math

    try:
        size_bytes = os.path.getsize(audio_path)
        # Rough duration estimate: MP3 at 128 kbps is ~16 KB per second
        duration_sec = size_bytes / 16000
        count = max(int(duration_sec * 1000 / chunk_ms), 10)
        # Superimpose two sine waves for organic-looking mouth motion,
        # clamped to a minimum of 50 so the mouth never fully flatlines.
        return [
            max(50, 150 + 100 * math.sin(i * 0.5) + 50 * math.sin(i * 1.2))
            for i in range(count)
        ]
    except Exception:
        # File missing/unreadable: canned default pattern
        return [100, 200, 150, 300, 250, 100, 200, 150, 100, 200]
def _load_avatar_assets(avatars_dir: Path, avatar_name: str):
    """Load the base face image and all mouth frames for an avatar.

    Returns (base_image, mouth_frames): the RGBA base and the RGBA mouth
    overlays resized to match the base.

    Raises:
        FileNotFoundError: If base.png or any mouth_*.png frames are missing.
    """
    avatar_folder = avatars_dir / avatar_name
    base_path = avatar_folder / "base.png"
    mouth_frames_paths = sorted(avatar_folder.glob("mouth_*.png"))
    if not base_path.exists():
        raise FileNotFoundError(f"Base image not found: {base_path}")
    if not mouth_frames_paths:
        raise FileNotFoundError(f"No mouth frames found in: {avatar_folder}")
    base_image = Image.open(base_path).convert("RGBA")
    size = base_image.size
    # Resize mouths so alpha_composite gets matching dimensions
    mouth_frames = [
        Image.open(path).convert("RGBA").resize(size)
        for path in mouth_frames_paths
    ]
    return base_image, mouth_frames


def _extract_rms(audio_path: str, chunk_ms: int) -> List[float]:
    """Get per-chunk RMS values, degrading to synthetic values on failure."""
    try:
        rms_values = audio_to_rms_chunks(audio_path, chunk_ms=chunk_ms)
    except Exception as e:
        print(f"Primary audio processing failed: {e}")
        print("Using fallback animation method...")
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)
    # Silent or empty audio would pin the mouth shut; use synthetic motion
    if not rms_values or all(v == 0 for v in rms_values):
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)
    return rms_values


def _render_frames(base_image, mouth_frames, rms_values) -> list:
    """Composite one RGB frame per RMS chunk; mouth openness tracks amplitude."""
    # Compute the normalizer once (original called max() twice)
    max_rms = max(rms_values)
    if max_rms <= 0:
        max_rms = 1  # avoid division by zero on silent input
    num_mouth_frames = len(mouth_frames)
    frames = []
    for rms in rms_values:
        # Map normalized loudness (0..1) onto the mouth frame index
        ratio = rms / max_rms
        mouth_index = int(ratio * (num_mouth_frames - 1))
        mouth_index = max(0, min(mouth_index, num_mouth_frames - 1))
        frame = Image.alpha_composite(base_image, mouth_frames[mouth_index])
        # GIF has no alpha; flatten onto white using the alpha channel as mask
        frame_rgb = Image.new("RGB", frame.size, (255, 255, 255))
        frame_rgb.paste(frame, mask=frame.split()[-1] if frame.mode == 'RGBA' else None)
        frames.append(frame_rgb)
    return frames


def generate_lipsync_gif(
    avatar_name: str,
    audio_path: str,
    avatars_dir: Path,
    output_dir: Path,
    fps: int = 12,
    output_path: Optional[str] = None
) -> str:
    """
    Generate a lip-sync animated GIF from avatar images and audio.

    The animation works by:
    1. Analyzing audio amplitude (RMS) over time
    2. Selecting a mouth frame per chunk based on amplitude level
    3. Compositing the mouth frame onto the base avatar image
    4. Combining all frames into an animated GIF

    Args:
        avatar_name: Name of avatar folder (e.g., 'sample')
        audio_path: Path to the audio file to sync with
        avatars_dir: Base directory containing avatar folders
        output_dir: Directory to save the output GIF
        fps: Frames per second for the animation
        output_path: Optional custom output path

    Returns:
        Path to the generated GIF file

    Raises:
        FileNotFoundError: If avatar base.png or mouth frames not found
        ValueError: If no animation frames could be produced
    """
    base_image, mouth_frames = _load_avatar_assets(avatars_dir, avatar_name)

    # One audio chunk per animation frame at the target FPS
    chunk_ms = int(1000 / fps)
    rms_values = _extract_rms(audio_path, chunk_ms)

    frames = _render_frames(base_image, mouth_frames, rms_values)
    if not frames:
        raise ValueError("No frames generated for animation")

    # Ensure output directory exists and pick a timestamped default name
    output_dir.mkdir(parents=True, exist_ok=True)
    if output_path is None:
        timestamp = int(time.time() * 1000)
        output_path = str(output_dir / f"lipsync_{timestamp}.gif")

    # NOTE(review): imageio v3 deprecates fps= for GIF writing; this relies
    # on the legacy v2-style API — confirm the pinned imageio version.
    imageio.mimsave(
        output_path,
        frames,
        fps=fps,
        loop=0  # Loop forever
    )
    return output_path