Spaces:
Sleeping
Sleeping
| """ | |
| Lip-Sync Animation Module | |
| ========================= | |
| Generates animated GIFs with lip-sync based on audio amplitude. | |
| Functions: | |
| - audio_to_rms_chunks: Extract amplitude data from audio | |
| - audio_to_rms_chunks_simple: Simulate amplitude data from the file size (fallback) | |
| - generate_lipsync_gif: Create lip-sync animation GIF | |
| """ | |
| from PIL import Image | |
| import imageio | |
| from pathlib import Path | |
| import time | |
| from typing import List, Optional | |
| import os | |
def audio_to_rms_chunks(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Measure the loudness of an audio file in fixed-length slices.

    The file is decoded with pydub, cut into consecutive slices of
    ``chunk_ms`` milliseconds, and the RMS (Root Mean Square) amplitude of
    every non-empty slice is collected.

    This function never raises: any failure (pydub not importable, file
    missing, decoding error) is printed and a fixed placeholder sequence is
    returned instead so the caller can still animate something.

    Args:
        audio_path: Path to the audio file (MP3)
        chunk_ms: Duration of each slice in milliseconds

    Returns:
        One RMS value per slice; ``[0]`` if the audio produced no slices;
        the placeholder sequence if processing failed.
    """
    # Canned amplitude pattern used when the real audio cannot be analysed.
    placeholder_levels = [100, 200, 150, 300, 250, 100, 200, 150]

    try:
        # Imported lazily so a missing pydub ends up in the fallback path
        # below rather than breaking import of this module.
        from pydub import AudioSegment
        from pydub.utils import make_chunks

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        segment = AudioSegment.from_file(audio_path)

        levels = []
        for piece in make_chunks(segment, chunk_ms):
            # Skip zero-length slices.
            if len(piece) > 0:
                levels.append(piece.rms)
    except Exception as e:
        print(f"Error processing audio: {e}")
        return placeholder_levels

    return levels or [0]
def audio_to_rms_chunks_simple(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Fabricate plausible RMS values without decoding the audio.

    Intended as a fallback when pydub/ffmpeg cannot be used. The clip length
    is guessed from the file size, and a smooth wave pattern of matching
    length is produced so the mouth still appears to move.

    Args:
        audio_path: Path to the audio file
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of simulated RMS values (at least 10 entries, none below 50).
        If anything goes wrong (e.g. the file cannot be stat'ed), a fixed
        10-element sequence is returned instead.
    """
    import math

    try:
        # Rough guess: an MP3 at 128 kbps takes about 16 KB per second.
        seconds = os.path.getsize(audio_path) / 16000
        # Always produce at least 10 chunks, even for tiny files.
        chunk_count = max(int(seconds * 1000 / chunk_ms), 10)
        # Two superimposed sine waves centred on 150, clamped at 50.
        return [
            max(50, 150 + 100 * math.sin(step * 0.5) + 50 * math.sin(step * 1.2))
            for step in range(chunk_count)
        ]
    except Exception:
        return [100, 200, 150, 300, 250, 100, 200, 150, 100, 200]
def _mouth_frame_sort_key(path):
    """Sort key that orders ``mouth_<N>.png`` files by the number N."""
    suffix = path.stem[len("mouth_"):]
    if suffix.isdecimal():
        # Numeric suffixes sort numerically, so mouth_10 comes after mouth_9
        # instead of between mouth_1 and mouth_2.
        return (0, int(suffix), path.name)
    # Non-numeric suffixes keep plain name ordering, after the numeric ones.
    return (1, 0, path.name)


def _load_rgba(path, size=None):
    """Load an image as RGBA (optionally resized) and close its file handle."""
    with Image.open(path) as source:
        # convert() returns an independent in-memory copy, so the file can be
        # closed as soon as the conversion is done.
        rgba = source.convert("RGBA")
    return rgba if size is None else rgba.resize(size)


def _flatten_on_white(base_image, mouth):
    """Composite a mouth overlay onto the base image and flatten it to RGB."""
    frame = Image.alpha_composite(base_image, mouth)
    # GIF has no partial transparency: paste onto an opaque white canvas,
    # using the frame's own alpha channel as the mask.
    flattened = Image.new("RGB", frame.size, (255, 255, 255))
    flattened.paste(frame, mask=frame.split()[-1])
    return flattened


def generate_lipsync_gif(
    avatar_name: str,
    audio_path: str,
    avatars_dir: Path,
    output_dir: Path,
    fps: int = 12,
    output_path: Optional[str] = None
) -> str:
    """
    Generate a lip-sync animated GIF from avatar images and audio.

    The animation works by:
    1. Analyzing audio amplitude (RMS) over time, one value per output frame
    2. Selecting a mouth frame based on amplitude relative to the loudest chunk
    3. Compositing that mouth frame onto the base avatar image
    4. Combining all frames into a looping animated GIF

    Mouth frames are the ``mouth_*.png`` files in the avatar folder, ordered
    by numeric suffix; the first is used for the quietest chunks and the last
    for the loudest.

    Args:
        avatar_name: Name of avatar folder (e.g., 'sample')
        audio_path: Path to the audio file to sync with
        avatars_dir: Base directory containing avatar folders
        output_dir: Directory to save the output GIF (created if missing)
        fps: Frames per second for the animation; must be positive
        output_path: Optional custom output path; when omitted, a
            timestamped ``lipsync_<ms>.gif`` inside ``output_dir`` is used

    Returns:
        Path to the generated GIF file

    Raises:
        ValueError: If ``fps`` is not positive, or no frames were generated
        FileNotFoundError: If avatar base.png or mouth frames not found
    """
    if fps <= 0:
        raise ValueError(f"fps must be a positive number, got {fps!r}")

    # Accept plain strings as well as Path objects.
    avatars_dir = Path(avatars_dir)
    output_dir = Path(output_dir)

    # Locate avatar folder and files
    avatar_folder = avatars_dir / avatar_name
    base_path = avatar_folder / "base.png"
    mouth_frames_paths = sorted(
        avatar_folder.glob("mouth_*.png"), key=_mouth_frame_sort_key
    )

    # Validate avatar files exist before doing any image or audio work
    if not base_path.exists():
        raise FileNotFoundError(f"Base image not found: {base_path}")
    if not mouth_frames_paths:
        raise FileNotFoundError(f"No mouth frames found in: {avatar_folder}")

    # Load the base face, then every mouth overlay scaled to the same size
    # (alpha_composite requires identically sized images).
    base_image = _load_rgba(base_path)
    size = base_image.size
    mouth_frames = [_load_rgba(path, size) for path in mouth_frames_paths]

    # One audio chunk per animation frame; never let the chunk length hit 0
    # for very high frame rates.
    chunk_ms = max(1, int(1000 / fps))

    # Try to extract audio amplitude data. The primary extractor handles its
    # own errors, so this except is only a safety net.
    try:
        rms_values = audio_to_rms_chunks(audio_path, chunk_ms=chunk_ms)
    except Exception as e:
        print(f"Primary audio processing failed: {e}")
        print("Using fallback animation method...")
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Empty or completely silent audio would give a frozen mouth; use the
    # simulated amplitude pattern instead.
    if not rms_values or all(v == 0 for v in rms_values):
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Amplitudes are scaled against the loudest chunk (guarding against 0).
    peak = max(rms_values)
    max_rms = peak if peak > 0 else 1

    # Only len(mouth_frames) distinct pictures can ever appear, so build each
    # composited RGB frame once and reuse it for every matching chunk.
    rendered = [_flatten_on_white(base_image, mouth) for mouth in mouth_frames]
    top_index = len(rendered) - 1

    frames = []
    for rms in rms_values:
        # Mouth openness ratio (0 to 1) mapped to a frame index and clamped.
        ratio = rms / max_rms
        mouth_index = max(0, min(int(ratio * top_index), top_index))
        frames.append(rendered[mouth_index])

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    if output_path is None:
        timestamp = int(time.time() * 1000)
        output_path = str(output_dir / f"lipsync_{timestamp}.gif")
    else:
        # A custom path may live outside output_dir; make sure it is writable.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    if not frames:
        raise ValueError("No frames generated for animation")

    # Save as animated GIF
    imageio.mimsave(
        output_path,
        frames,
        fps=fps,
        loop=0  # Loop forever
    )
    return output_path