File size: 6,816 Bytes
3c67f4c
 
2bcec7c
 
 
 
 
 
 
 
8b65f54
1a8b8ad
2bcec7c
1a8b8ad
2bcec7c
 
 
8b65f54
2bcec7c
 
4c48c35
 
 
 
eb63d21
4c48c35
 
eb63d21
4c48c35
eb63d21
4c48c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb63d21
8b65f54
2bcec7c
4c48c35
2bcec7c
d47e052
8b65f54
1a8b8ad
d47e052
4c48c35
 
eb63d21
4c48c35
 
2bcec7c
 
8b65f54
2bcec7c
 
4c48c35
2bcec7c
1a8b8ad
4c48c35
8b65f54
2bcec7c
 
1a8b8ad
2bcec7c
 
 
 
 
 
 
 
 
 
 
 
1a8b8ad
eb63d21
 
 
 
4c48c35
eb63d21
4c48c35
eb63d21
 
 
 
4c48c35
 
eb63d21
 
 
 
4c48c35
eb63d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c48c35
eb63d21
 
 
4c48c35
eb63d21
4c48c35
eb63d21
 
4c48c35
eb63d21
 
4c48c35
2bcec7c
4c48c35
 
eb63d21
 
 
 
 
 
 
 
 
 
 
 
d47e052
1a8b8ad
eb63d21
 
 
4c48c35
eb63d21
 
 
4c48c35
eb63d21
 
 
4c48c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d47e052
4c48c35
d47e052
eb63d21
 
d47e052
eb63d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bcec7c
eb63d21
 
4c48c35
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import os
os.environ["OMP_NUM_THREADS"] = "1"
import subprocess
from pathlib import Path
from datetime import datetime
import gradio as gr
from huggingface_hub import snapshot_download

# All working directories hang off the folder that contains this script.
ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT.joinpath("LatentSync")        # checkout of the LatentSync repository
CKPT_DIR = REPO_DIR.joinpath("checkpoints")   # downloaded model weights
TEMP_DIR = REPO_DIR.joinpath("temp")          # intermediate and result videos

# Hugging Face Hub repo holding the checkpoints; 1.5 is the one to use on a T4 16GB.
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"

def run(cmd, cwd=None):
    """Echo a command to stdout, then execute it.

    Args:
        cmd: Sequence of command-line arguments; each item is stringified
            for the echoed line and the sequence is passed to subprocess as-is.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    echoed = " ".join(str(part) for part in cmd)
    print(echoed)
    subprocess.check_call(cmd, cwd=cwd)

def create_mask_file():
    """Create ``latentsync/utils/mask.png`` inside the repo checkout if it is missing.

    The mask is a 256x256 8-bit grayscale image: white (255) inside an ellipse
    centred at (128, 180) with radii 90 (x) and 64 (y), covering the lower-face /
    mouth region to inpaint, and black (0) everywhere else.

    Creation is best-effort: any failure (e.g. numpy or Pillow not importable,
    or the file cannot be written) is reported as a warning on stdout and
    otherwise ignored. Does nothing when the file already exists.
    """
    mask_dir = REPO_DIR / "latentsync" / "utils"
    mask_path = mask_dir / "mask.png"

    if mask_path.exists():
        return

    mask_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Imported lazily so a missing dependency only produces the warning below.
        import numpy as np
        from PIL import Image

        size = 256
        center_x, center_y = 128, 180
        radius_x, radius_y = 90, 64

        # Open coordinate grids broadcast to a (size, size) array, evaluating the
        # ellipse test ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1 for every pixel at once
        # instead of looping over all 65,536 pixels in Python.
        ys, xs = np.ogrid[:size, :size]
        inside = (
            ((xs - center_x) / radius_x) ** 2
            + ((ys - center_y) / radius_y) ** 2
        ) <= 1
        mask = np.where(inside, 255, 0).astype(np.uint8)

        # A 2-D uint8 array is read as mode "L" (8-bit grayscale) automatically.
        Image.fromarray(mask).save(str(mask_path))
        print(f"✓ Created mask at {mask_path}")
    except Exception as e:
        # Deliberately non-fatal: report the problem and let the caller carry on.
        print(f"Warning: Could not create mask: {e}")

def setup():
    """Prepare everything inference needs on disk.

    Steps, in order:
      1. Shallow-clone the LatentSync repository into ``REPO_DIR`` unless that
         directory already exists.
      2. Ensure the checkpoint and temp directories exist.
      3. Create the mask image via ``create_mask_file()``.
      4. Download the checkpoint snapshot ``HF_CKPT_REPO`` into ``CKPT_DIR``.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` fails.
        Exception: Whatever ``snapshot_download`` raises on download failure.
    """
    if not REPO_DIR.exists():
        print("Cloning LatentSync repository...")
        run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])

    CKPT_DIR.mkdir(parents=True, exist_ok=True)
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    # The mask must be in place before inference runs.
    create_mask_file()

    # Fetch the model weights as real files inside the checkout's checkpoints dir.
    print("Downloading model checkpoints...")
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        local_dir_use_symlinks=False,
    )
    print("✓ Setup complete")

def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
    """Turn a single picture and an audio track into an MP4 using ffmpeg.

    The image is looped as the video stream, scaled to cover 256x256 and then
    cropped to exactly that size; video is encoded with libx264 (yuv420p) and
    audio with AAC. The file is written to ``TEMP_DIR`` under a name stamped
    with the current date and time (second resolution).

    Args:
        image_path: Path of the still image.
        audio_path: Path of the audio file.
        fps: Output frame rate.

    Returns:
        The path of the generated video, as a string.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    target = TEMP_DIR / f"still_{stamp}.mp4"

    sources = ["-loop", "1", "-i", image_path, "-i", audio_path]
    frame_filter = "scale=256:256:force_original_aspect_ratio=increase,crop=256:256"
    encoding = [
        "-shortest",
        "-r", str(fps),
        "-vf", frame_filter,
        "-pix_fmt", "yuv420p",
        "-c:v", "libx264",
        "-c:a", "aac",
    ]

    run(["ffmpeg", "-y", *sources, *encoding, str(target)])
    return str(target)

def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
    """Gradio click handler: produce a lip-synced video from an image and audio.

    Args:
        avatar_img: File path of the uploaded avatar image, or None if missing.
        audio_wav: File path of the uploaded audio, or None if missing.
        steps: Number of inference steps (converted to int for the CLI).
        guidance: Guidance scale (converted to float for the CLI).
        seed: Random seed (converted to int for the CLI).
        use_deepcache: When truthy, ``--enable_deepcache`` is passed to inference.

    Returns:
        A ``(video_path, status_message)`` tuple. ``video_path`` is the result
        file path on success and None on any failure; errors are reported via
        the status message rather than raised.
    """
    import sys  # function-scope: only needed to locate the running interpreter

    # Validate first: setup() clones the repo and downloads the checkpoints,
    # which is pointless (and slow) when the request cannot be served anyway.
    if avatar_img is None:
        return None, "❌ Please upload an avatar image!"
    if audio_wav is None:
        return None, "❌ Please upload an audio file!"

    video_path = None
    try:
        setup()

        img_path = str(Path(avatar_img).resolve())
        wav_path = str(Path(audio_wav).resolve())

        # Inference takes a video as input, so wrap the still image into one.
        print("Creating input video...")
        video_path = make_still_video(img_path, wav_path, fps=25)

        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

        # Config/checkpoint paths are relative to REPO_DIR, the subprocess cwd.
        # Use the interpreter running this app so the child sees the same
        # installed packages; fall back to PATH lookup if it is unknown.
        cmd = [
            sys.executable or "python", "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/stage2.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--video_path", video_path,
            "--audio_path", wav_path,
            "--video_out_path", str(out_path),
            "--inference_steps", str(int(steps)),
            "--guidance_scale", str(float(guidance)),
            "--seed", str(int(seed)),
            "--temp_dir", "temp",
        ]

        if use_deepcache:
            cmd.append("--enable_deepcache")

        print("Generating lip-synced video...")
        run(cmd, cwd=str(REPO_DIR))

        if out_path.exists():
            return str(out_path), "✅ Video generated successfully!"
        return None, "❌ Video generation failed - output file not created"

    except subprocess.CalledProcessError as e:
        return None, f"❌ Command failed with return code {e.returncode}"
    except Exception as e:
        # Top-level UI boundary: surface the problem in the status box.
        return None, f"❌ Error: {str(e)}"
    finally:
        # The still video is only an input to inference; remove it so repeated
        # runs do not fill the temp directory. Cleanup is best-effort.
        if video_path is not None:
            try:
                Path(video_path).unlink()
            except OSError:
                pass

# Gradio Interface - Compatible with Gradio 4.44.1
# Layout: intro text, a row with inputs (left) and settings (right), then the
# generate button, a status box, the result video and closing notes.
with gr.Blocks(title="LatentSync Lip Sync") as demo:
    gr.Markdown(
        """
        # 🎬 LatentSync 1.5 - AI Lip Sync Generator

        Upload an avatar image and audio file to generate a lip-synced video!

        **Tips:**
        - Use clear frontal face images for best results
        - Keep audio under 30 seconds for faster processing
        - Higher inference steps = better quality but slower
        """
    )

    with gr.Row():
        # Left column: the two uploads, both handed to generate() as file paths.
        with gr.Column():
            avatar = gr.Image(
                type="filepath",
                label="📷 Avatar Image (JPG/PNG)"
            )
            audio = gr.Audio(
                type="filepath",
                label="🎵 Audio File (WAV)"
            )

        # Right column: inference parameters forwarded to generate().
        with gr.Column():
            gr.Markdown("### ⚙️ Generation Settings")
            steps = gr.Slider(
                10, 40, value=20, step=1,
                label="Inference Steps (Higher = Better Quality)"
            )
            guidance = gr.Slider(
                0.8, 2.0, value=1.0, step=0.1,
                label="Guidance Scale (Higher = Stronger Lip Sync)"
            )
            seed = gr.Number(
                value=1247, precision=0,
                label="Seed (For Reproducibility)"
            )
            deepcache = gr.Checkbox(
                value=True,
                label="Enable DeepCache (Faster - Recommended for T4)"
            )

    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary")

    status = gr.Textbox(label="Status", interactive=False)
    out = gr.Video(label="Generated Video")

    # generate() returns (video_path, status_message), matching the outputs order.
    btn.click(
        generate,
        inputs=[avatar, audio, steps, guidance, seed, deepcache],
        outputs=[out, status]
    )

    gr.Markdown(
        """
        ---
        ### 📝 Notes:
        - First run will download models (~7GB) - this may take a few minutes
        - Generation takes 30-90 seconds depending on settings
        - Works best with T4 GPU (16GB)
        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
        """
    )

if __name__ == "__main__":
    # Script entry point: configure the request queue (max_size=3), then serve.
    queue_options = {"max_size": 3}
    demo.queue(**queue_options)
    demo.launch()