# Translator/utils/lipsync.py
"""
Lip-Sync Animation Module
=========================
Generates animated GIFs with lip-sync based on audio amplitude.
Functions:
- audio_to_rms_chunks: Extract amplitude data from audio
- generate_lipsync_gif: Create lip-sync animation GIF
"""
from PIL import Image
import imageio
from pathlib import Path
import time
from typing import List, Optional
import os
def audio_to_rms_chunks(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Compute per-chunk RMS (Root Mean Square) amplitude values for an audio file.

    The audio is divided into segments of ``chunk_ms`` milliseconds each and
    the RMS ("loudness") of every segment is collected.

    Args:
        audio_path: Path to the audio file (MP3)
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        One RMS value per non-empty chunk ([0] if no chunks), or a canned
        fallback sequence if audio processing fails for any reason.
    """
    try:
        from pydub import AudioSegment
        from pydub.utils import make_chunks

        # Fail fast with a clear message rather than a decoder error
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        segment = AudioSegment.from_file(audio_path)
        pieces = make_chunks(segment, chunk_ms)
        amplitudes = [piece.rms for piece in pieces if len(piece) > 0]
        return amplitudes or [0]
    except Exception as e:
        # Best-effort by design: any failure (missing pydub/ffmpeg, bad file)
        # degrades to a canned sequence so the animation still works.
        print(f"Error processing audio: {e}")
        return [100, 200, 150, 300, 250, 100, 200, 150]  # Fallback animation
def audio_to_rms_chunks_simple(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Fallback generator of synthetic RMS values derived from file size.

    Used when pydub/ffmpeg is unavailable or fails; produces a smooth
    wave-like amplitude pattern so the lip-sync still looks natural.

    Args:
        audio_path: Path to the audio file
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of simulated RMS values
    """
    import math

    try:
        size_bytes = os.path.getsize(audio_path)
        # Rough duration estimate: MP3 at 128 kbps is ~16 KB per second
        duration_sec = size_bytes / 16000
        count = max(int(duration_sec * 1000 / chunk_ms), 10)
        # Superimpose two sine waves for organic-looking mouth motion,
        # clamped to a minimum of 50 so the mouth never fully flatlines.
        return [
            max(50, 150 + 100 * math.sin(i * 0.5) + 50 * math.sin(i * 1.2))
            for i in range(count)
        ]
    except Exception:
        # File missing/unreadable: canned default pattern
        return [100, 200, 150, 300, 250, 100, 200, 150, 100, 200]
def _load_avatar_assets(avatars_dir: Path, avatar_name: str):
    """Load the base face image and all mouth frames for an avatar.

    Returns (base_image, mouth_frames): the RGBA base and the RGBA mouth
    overlays resized to match the base.

    Raises:
        FileNotFoundError: If base.png or any mouth_*.png frames are missing.
    """
    avatar_folder = avatars_dir / avatar_name
    base_path = avatar_folder / "base.png"
    mouth_frames_paths = sorted(avatar_folder.glob("mouth_*.png"))
    if not base_path.exists():
        raise FileNotFoundError(f"Base image not found: {base_path}")
    if not mouth_frames_paths:
        raise FileNotFoundError(f"No mouth frames found in: {avatar_folder}")
    base_image = Image.open(base_path).convert("RGBA")
    size = base_image.size
    # Resize mouths so alpha_composite gets matching dimensions
    mouth_frames = [
        Image.open(path).convert("RGBA").resize(size)
        for path in mouth_frames_paths
    ]
    return base_image, mouth_frames


def _extract_rms(audio_path: str, chunk_ms: int) -> List[float]:
    """Get per-chunk RMS values, degrading to synthetic values on failure."""
    try:
        rms_values = audio_to_rms_chunks(audio_path, chunk_ms=chunk_ms)
    except Exception as e:
        print(f"Primary audio processing failed: {e}")
        print("Using fallback animation method...")
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)
    # Silent or empty audio would pin the mouth shut; use synthetic motion
    if not rms_values or all(v == 0 for v in rms_values):
        rms_values = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)
    return rms_values


def _render_frames(base_image, mouth_frames, rms_values) -> list:
    """Composite one RGB frame per RMS chunk; mouth openness tracks amplitude."""
    # Compute the normalizer once (original called max() twice)
    max_rms = max(rms_values)
    if max_rms <= 0:
        max_rms = 1  # avoid division by zero on silent input
    num_mouth_frames = len(mouth_frames)
    frames = []
    for rms in rms_values:
        # Map normalized loudness (0..1) onto the mouth frame index
        ratio = rms / max_rms
        mouth_index = int(ratio * (num_mouth_frames - 1))
        mouth_index = max(0, min(mouth_index, num_mouth_frames - 1))
        frame = Image.alpha_composite(base_image, mouth_frames[mouth_index])
        # GIF has no alpha; flatten onto white using the alpha channel as mask
        frame_rgb = Image.new("RGB", frame.size, (255, 255, 255))
        frame_rgb.paste(frame, mask=frame.split()[-1] if frame.mode == 'RGBA' else None)
        frames.append(frame_rgb)
    return frames


def generate_lipsync_gif(
    avatar_name: str,
    audio_path: str,
    avatars_dir: Path,
    output_dir: Path,
    fps: int = 12,
    output_path: Optional[str] = None
) -> str:
    """
    Generate a lip-sync animated GIF from avatar images and audio.

    The animation works by:
    1. Analyzing audio amplitude (RMS) over time
    2. Selecting a mouth frame per chunk based on amplitude level
    3. Compositing the mouth frame onto the base avatar image
    4. Combining all frames into an animated GIF

    Args:
        avatar_name: Name of avatar folder (e.g., 'sample')
        audio_path: Path to the audio file to sync with
        avatars_dir: Base directory containing avatar folders
        output_dir: Directory to save the output GIF
        fps: Frames per second for the animation
        output_path: Optional custom output path

    Returns:
        Path to the generated GIF file

    Raises:
        FileNotFoundError: If avatar base.png or mouth frames not found
        ValueError: If no animation frames could be produced
    """
    base_image, mouth_frames = _load_avatar_assets(avatars_dir, avatar_name)

    # One audio chunk per animation frame at the target FPS
    chunk_ms = int(1000 / fps)
    rms_values = _extract_rms(audio_path, chunk_ms)

    frames = _render_frames(base_image, mouth_frames, rms_values)
    if not frames:
        raise ValueError("No frames generated for animation")

    # Ensure output directory exists and pick a timestamped default name
    output_dir.mkdir(parents=True, exist_ok=True)
    if output_path is None:
        timestamp = int(time.time() * 1000)
        output_path = str(output_dir / f"lipsync_{timestamp}.gif")

    # NOTE(review): imageio v3 deprecates fps= for GIF writing; this relies
    # on the legacy v2-style API — confirm the pinned imageio version.
    imageio.mimsave(
        output_path,
        frames,
        fps=fps,
        loop=0  # Loop forever
    )
    return output_path