"""
Lip-Sync Animation Module
=========================
Generates animated GIFs with lip-sync based on audio amplitude.

Functions:
- audio_to_rms_chunks: Extract amplitude data from audio
- audio_to_rms_chunks_simple: Simulate amplitude data from the file size
- generate_lipsync_gif: Create lip-sync animation GIF
"""

import math
import os
import re
import time
from pathlib import Path
from typing import List, Optional, Tuple

import imageio
from PIL import Image

# Canned amplitude sequences returned when nothing better can be computed.
# Callers always receive a copy so the module-level lists are never mutated.
_DEFAULT_RMS = [100, 200, 150, 300, 250, 100, 200, 150]
_DEFAULT_RMS_SIMPLE = [100, 200, 150, 300, 250, 100, 200, 150, 100, 200]

# Splits a file name into alternating text / digit-run pieces.
_DIGIT_RUN = re.compile(r"(\d+)")


def _read_rms_chunks(audio_path: str, chunk_ms: int) -> List[float]:
    """
    Decode the audio file and return one RMS value per chunk.

    Unlike ``audio_to_rms_chunks`` this helper lets every failure propagate,
    so a caller can choose its own fallback strategy.

    Args:
        audio_path: Path to the audio file
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of RMS values, one per non-empty chunk, or ``[0]`` when the
        audio yields no non-empty chunk

    Raises:
        ImportError: If pydub is not installed
        FileNotFoundError: If ``audio_path`` does not exist
        Exception: Whatever pydub raises while decoding or chunking
    """
    # pydub is imported lazily so the module itself loads without it.
    from pydub import AudioSegment
    from pydub.utils import make_chunks

    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    audio = AudioSegment.from_file(audio_path)
    chunks = make_chunks(audio, chunk_ms)

    rms_values = [chunk.rms for chunk in chunks if len(chunk) > 0]
    return rms_values if rms_values else [0]


def audio_to_rms_chunks(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Extract RMS (Root Mean Square) amplitude values from audio.

    Splits audio into chunks and calculates the RMS value for each,
    which represents the "loudness" of that segment.

    This function never raises: on any failure (missing file, missing
    pydub, decode error) it prints the error and returns a fixed default
    sequence instead.

    Args:
        audio_path: Path to the audio file (MP3)
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of RMS values, one per chunk
    """
    try:
        return _read_rms_chunks(audio_path, chunk_ms)
    except Exception as e:
        # Deliberate best-effort: callers get a short default animation
        # rather than an exception.
        print(f"Error processing audio: {e}")
        return list(_DEFAULT_RMS)


def audio_to_rms_chunks_simple(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Simple fallback method to generate fake RMS values based on file size.
    Used when pydub/ffmpeg fails.

    Args:
        audio_path: Path to the audio file
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of simulated RMS values (at least 10 entries, each >= 50), or
        a fixed 10-value sequence if the file size cannot be read
    """
    try:
        # Estimate duration based on file size (rough approximation):
        # MP3 at 128kbps is about 16KB per second.
        file_size = os.path.getsize(audio_path)
        estimated_duration_sec = file_size / 16000

        num_chunks = max(int(estimated_duration_sec * 1000 / chunk_ms), 10)

        # Two superimposed sine waves give a natural-looking open/close
        # rhythm; the floor of 50 keeps every value strictly positive.
        return [
            max(50, 150 + 100 * math.sin(i * 0.5) + 50 * math.sin(i * 1.2))
            for i in range(num_chunks)
        ]
    except Exception:
        # Last-resort fallback: this function is itself called from error
        # handlers, so it must not raise.
        return list(_DEFAULT_RMS_SIMPLE)


def _natural_sort_key(path: Path):
    """
    Build a sort key that orders digit runs in a file name numerically.

    With this key ``mouth_2.png`` sorts before ``mouth_10.png``; a plain
    string sort would put ``mouth_10.png`` first.
    """
    # re.split with a capturing group alternates text (even index) and
    # digit runs (odd index), so keys compare str-with-str and int-with-int.
    pieces = _DIGIT_RUN.split(path.name)
    key = [int(piece) if i % 2 else piece for i, piece in enumerate(pieces)]
    # The raw name breaks ties such as "mouth_1.png" vs "mouth_01.png".
    return key, path.name


def _load_rgba(path: Path, size: Optional[Tuple[int, int]] = None) -> Image.Image:
    """
    Load an image as RGBA, optionally resized, closing the file afterwards.
    """
    with Image.open(path) as source:
        rgba = source.convert("RGBA")
    return rgba if size is None else rgba.resize(size)


def _composite_frame(base_image: Image.Image, mouth: Image.Image) -> Image.Image:
    """
    Overlay a mouth image on the base avatar and flatten onto white RGB.
    """
    frame = Image.alpha_composite(base_image, mouth)
    # Paste through the alpha channel so transparent areas become white.
    frame_rgb = Image.new("RGB", frame.size, (255, 255, 255))
    frame_rgb.paste(frame, mask=frame.split()[-1])
    return frame_rgb


def generate_lipsync_gif(
    avatar_name: str,
    audio_path: str,
    avatars_dir: Path,
    output_dir: Path,
    fps: int = 12,
    output_path: Optional[str] = None
) -> str:
    """
    Generate a lip-sync animated GIF from avatar images and audio.

    The animation works by:
    1. Analyzing audio amplitude (RMS) over time
    2. Selecting mouth frame based on amplitude level
    3. Compositing mouth frame onto base avatar image
    4. Combining all frames into an animated GIF

    Mouth frames are ordered by natural sort of their file names, so the
    lowest-numbered frame is used for the quietest chunks and the
    highest-numbered frame for the loudest.

    If the audio cannot be decoded, or decodes to silence, amplitude values
    are simulated from the file size so an animation is still produced.

    Args:
        avatar_name: Name of avatar folder (e.g., 'sample')
        audio_path: Path to the audio file to sync with
        avatars_dir: Base directory containing avatar folders
        output_dir: Directory to save the output GIF
        fps: Frames per second for the animation
        output_path: Optional custom output path

    Returns:
        Path to the generated GIF file

    Raises:
        FileNotFoundError: If avatar base.png or mouth frames not found
        ValueError: If no frames were generated for the animation
    """
    # Locate avatar folder and files
    avatar_folder = avatars_dir / avatar_name
    base_path = avatar_folder / "base.png"
    mouth_frames_paths = sorted(
        avatar_folder.glob("mouth_*.png"), key=_natural_sort_key
    )

    # Validate avatar files exist
    if not base_path.exists():
        raise FileNotFoundError(f"Base image not found: {base_path}")
    if not mouth_frames_paths:
        raise FileNotFoundError(f"No mouth frames found in: {avatar_folder}")

    # Load the avatar face and every mouth frame scaled to the same size
    base_image = _load_rgba(base_path)
    size = base_image.size
    mouth_frames = [_load_rgba(path, size) for path in mouth_frames_paths]

    # One audio chunk per animation frame
    chunk_ms = int(1000 / fps)

    # Use the raising helper here so that a decode failure reaches the
    # file-size based fallback below.
    try:
        rms_values = _read_rms_chunks(audio_path, chunk_ms)
    except Exception as e:
        print(f"Primary audio processing failed: {e}")
        print("Using fallback animation method...")
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Handle edge case of empty or silent audio
    if not rms_values or all(v == 0 for v in rms_values):
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Normalisation divisor; guard against dividing by zero
    max_rms = max(rms_values)
    if max_rms <= 0:
        max_rms = 1

    # Only len(mouth_frames) distinct pictures can appear in the animation,
    # so composite each one once and reuse it for every matching chunk.
    composited = [_composite_frame(base_image, mouth) for mouth in mouth_frames]
    last_index = len(composited) - 1

    frames = []
    for rms in rms_values:
        # Map loudness ratio (0 to 1) onto a mouth frame index
        mouth_index = int(rms / max_rms * last_index)
        mouth_index = max(0, min(mouth_index, last_index))
        frames.append(composited[mouth_index])

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Generate output filename
    if output_path is None:
        timestamp = int(time.time() * 1000)
        output_path = str(output_dir / f"lipsync_{timestamp}.gif")

    if not frames:
        raise ValueError("No frames generated for animation")

    # Save as animated GIF, looping forever
    imageio.mimsave(output_path, frames, fps=fps, loop=0)

    return output_path