File size: 6,888 Bytes
85c18a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""

Lip-Sync Animation Module

=========================

Generates animated GIFs with lip-sync based on audio amplitude.



Functions:

    - audio_to_rms_chunks: Extract amplitude data from audio

    - generate_lipsync_gif: Create lip-sync animation GIF

"""

from PIL import Image
import imageio
from pathlib import Path
import time
from typing import List, Optional
import os


def audio_to_rms_chunks(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """Split an audio file into fixed-length chunks and return their RMS levels.

    Each chunk's RMS (Root Mean Square) amplitude approximates how loud
    that slice of audio is; the lip-sync renderer maps louder chunks to
    wider mouth frames.

    Args:
        audio_path: Path to the audio file (MP3)
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of RMS values, one per chunk
    """
    try:
        from pydub import AudioSegment
        from pydub.utils import make_chunks

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        segment = AudioSegment.from_file(audio_path)

        # One RMS reading per non-empty chunk.
        loudness = []
        for piece in make_chunks(segment, chunk_ms):
            if len(piece) > 0:
                loudness.append(piece.rms)

        return loudness or [0]

    except Exception as e:
        # Best-effort: any failure (missing pydub/ffmpeg, unreadable or
        # missing file) degrades to a canned amplitude pattern so the
        # caller can still animate something instead of crashing.
        print(f"Error processing audio: {e}")
        return [100, 200, 150, 300, 250, 100, 200, 150]  # Fallback animation

def audio_to_rms_chunks_simple(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """Fabricate plausible RMS values from the file size alone.

    Fallback for when pydub/ffmpeg cannot decode the audio: duration is
    estimated from the byte count (assuming ~128 kbps MP3) and a smooth
    two-sine waveform is sampled to mimic natural speech amplitude.

    Args:
        audio_path: Path to the audio file
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of simulated RMS values
    """
    import math

    try:
        size_bytes = os.path.getsize(audio_path)

        # Approximate: MP3 at 128kbps = 16KB per second
        duration_sec = size_bytes / 16000

        # At least 10 chunks so even tiny files yield a visible animation.
        count = max(int(duration_sec * 1000 / chunk_ms), 10)

        # Two superimposed sine waves, clamped to >= 50, look organic.
        return [
            max(50, 150 + 100 * math.sin(i * 0.5) + 50 * math.sin(i * 1.2))
            for i in range(count)
        ]

    except Exception:
        # File unreadable/missing: hand back a short canned pattern.
        return [100, 200, 150, 300, 250, 100, 200, 150, 100, 200]


def generate_lipsync_gif(
    avatar_name: str,
    audio_path: str,
    avatars_dir: Path,
    output_dir: Path,
    fps: int = 12,
    output_path: Optional[str] = None
) -> str:
    """Render a lip-sync animated GIF for an avatar driven by an audio file.

    Pipeline:
    1. Measure audio loudness (RMS) once per animation frame.
    2. Map each loudness level to one of the avatar's mouth images.
    3. Composite that mouth over the base face.
    4. Write all composited frames out as a looping GIF.

    Args:
        avatar_name: Name of avatar folder (e.g., 'sample')
        audio_path: Path to the audio file to sync with
        avatars_dir: Base directory containing avatar folders
        output_dir: Directory to save the output GIF
        fps: Frames per second for the animation
        output_path: Optional custom output path

    Returns:
        Path to the generated GIF file

    Raises:
        FileNotFoundError: If avatar base.png or mouth frames not found
        ValueError: If no animation frames could be produced
    """
    folder = avatars_dir / avatar_name
    face_path = folder / "base.png"
    mouth_paths = sorted(folder.glob("mouth_*.png"))

    # Fail fast when the avatar assets are incomplete.
    if not face_path.exists():
        raise FileNotFoundError(f"Base image not found: {face_path}")
    if not mouth_paths:
        raise FileNotFoundError(f"No mouth frames found in: {folder}")

    # Base avatar face, kept in RGBA for alpha compositing.
    face = Image.open(face_path).convert("RGBA")

    # Mouth overlays are resized to the face so the composite lines up.
    mouths = [
        Image.open(p).convert("RGBA").resize(face.size)
        for p in mouth_paths
    ]

    # One audio chunk per animation frame keeps sound and video in step.
    chunk_ms = int(1000 / fps)

    try:
        amplitudes = audio_to_rms_chunks(audio_path, chunk_ms=chunk_ms)
    except Exception as e:
        print(f"Primary audio processing failed: {e}")
        print("Using fallback animation method...")
        amplitudes = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # A silent or empty track would freeze the mouth; synthesize motion instead.
    if not amplitudes or all(a == 0 for a in amplitudes):
        amplitudes = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Peak amplitude for normalization; guard against division by zero.
    peak = max(amplitudes) if max(amplitudes) > 0 else 1

    last_mouth = len(mouths) - 1
    frames = []
    for amplitude in amplitudes:
        # Louder chunk -> wider mouth: scale 0..1 loudness onto frame indices,
        # then clamp to the valid range.
        idx = int((amplitude / peak) * last_mouth)
        idx = max(0, min(idx, last_mouth))

        composite = Image.alpha_composite(face, mouths[idx])

        # GIF has no alpha channel; flatten onto a white background.
        flat = Image.new("RGB", composite.size, (255, 255, 255))
        flat.paste(composite, mask=composite.split()[-1] if composite.mode == 'RGBA' else None)
        frames.append(flat)

    output_dir.mkdir(parents=True, exist_ok=True)

    if output_path is None:
        # Millisecond timestamp keeps successive renders from colliding.
        stamp = int(time.time() * 1000)
        output_path = str(output_dir / f"lipsync_{stamp}.gif")

    if not frames:
        raise ValueError("No frames generated for animation")

    # Use imageio to save GIF
    imageio.mimsave(
        output_path,
        frames,
        fps=fps,
        loop=0  # Loop forever
    )

    return output_path