# model-testing / app_LatentSync.py
# banao-tech's picture
# Rename app.py to app_LatentSync.py
# 940bb2b verified
import os
os.environ["OMP_NUM_THREADS"] = "1"
import subprocess
from pathlib import Path
from datetime import datetime
import gradio as gr
from huggingface_hub import snapshot_download
# Filesystem layout: the LatentSync checkout sits next to this script, and the
# checkpoint and scratch directories live inside that checkout.
ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT.joinpath("LatentSync")
CKPT_DIR = REPO_DIR.joinpath("checkpoints")
TEMP_DIR = REPO_DIR.joinpath("temp")

# Hugging Face repo that holds the model weights.
# Use 1.5 on T4 16GB.
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
def run(cmd, cwd=None):
    """Echo a command to stdout, then execute it and wait for completion.

    Args:
        cmd: Sequence of command-line parts; each part is shown via ``str()``.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    printable = " ".join(str(part) for part in cmd)
    print(printable)
    subprocess.check_call(cmd, cwd=cwd)
def create_mask_file():
    """Create the missing ``mask.png`` file, unless it already exists.

    Writes a 256x256 single-channel PNG to
    ``REPO_DIR/latentsync/utils/mask.png``. Pixels inside an ellipse centred
    at (128, 180) with radii 90 (horizontal) and 64 (vertical) are 255
    (white = inpaint mouth area); all other pixels are 0 (black = keep).

    Mask generation is best-effort: any failure inside it (for example numpy
    or Pillow not being installed) is printed as a warning and not raised.
    """
    mask_dir = REPO_DIR / "latentsync" / "utils"
    mask_path = mask_dir / "mask.png"
    if mask_path.exists():
        return
    mask_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Imported lazily so a missing dependency degrades to the warning
        # below instead of breaking module import.
        import numpy as np
        from PIL import Image

        size = 256
        center_x, center_y = 128, 180
        radius_x, radius_y = 90, 64

        # Evaluate the ellipse test ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1 for
        # every pixel at once; the open grids broadcast to (size, size).
        ys, xs = np.ogrid[:size, :size]
        inside = (
            ((xs - center_x) / radius_x) ** 2
            + ((ys - center_y) / radius_y) ** 2
        ) <= 1

        mask = np.zeros((size, size), dtype=np.uint8)
        mask[inside] = 255

        # A 2-D uint8 array is interpreted as 8-bit greyscale ("L") without
        # passing the deprecated ``mode`` argument.
        Image.fromarray(mask).save(str(mask_path))
        print(f"✓ Created mask at {mask_path}")
    except Exception as e:
        print(f"Warning: Could not create mask: {e}")
def setup():
    """Prepare everything inference needs: checkout, folders, mask, weights.

    Steps, in order:
      1. Shallow-clone the LatentSync repository into ``REPO_DIR`` when that
         directory does not exist yet.
      2. Make sure ``CKPT_DIR`` and ``TEMP_DIR`` exist.
      3. Create the mask image via ``create_mask_file()``.
      4. Fetch the ``HF_CKPT_REPO`` snapshot into ``CKPT_DIR``.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits non-zero.
        Any error raised by ``snapshot_download`` propagates unchanged.
    """
    if not REPO_DIR.exists():
        print("Cloning LatentSync repository...")
        run([
            "git", "clone", "--depth", "1",
            "https://github.com/bytedance/LatentSync.git",
            str(REPO_DIR),
        ])

    CKPT_DIR.mkdir(parents=True, exist_ok=True)
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    # Create mask file before running inference
    create_mask_file()

    # Download checkpoints
    print("Downloading model checkpoints...")
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        # NOTE(review): newer huggingface_hub releases appear to deprecate
        # this flag - confirm against the pinned version before removing it.
        local_dir_use_symlinks=False,
    )
    print("✓ Setup complete")
def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
    """Build an MP4 that shows one still image for the length of an audio clip.

    The ffmpeg invocation loops the image, scales and centre-crops it to
    256x256, and encodes H.264 video (yuv420p) with AAC audio. The file is
    written to ``TEMP_DIR`` under a name stamped to the current second.

    Args:
        image_path: Path of the still image.
        audio_path: Path of the audio track.
        fps: Output frame rate handed to ffmpeg's ``-r`` option.

    Returns:
        Path of the generated video, as a string.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    target = TEMP_DIR / f"still_{stamp}.mp4"

    # Argument order is significant to ffmpeg, so the groups are concatenated
    # in the exact order inputs -> output options -> output file.
    source_args = ["-loop", "1", "-i", image_path, "-i", audio_path]
    output_args = [
        "-shortest",
        "-r", str(fps),
        "-vf", "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
        "-pix_fmt", "yuv420p",
        "-c:v", "libx264",
        "-c:a", "aac",
    ]
    run(["ffmpeg", "-y", *source_args, *output_args, str(target)])
    return str(target)
def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
    """Produce a lip-synced video from one avatar image and one audio clip.

    Args:
        avatar_img: Path of the avatar image, or None when nothing was
            uploaded.
        audio_wav: Path of the audio file, or None when nothing was uploaded.
        steps: Number of inference steps (cast to ``int``).
        guidance: Guidance scale (cast to ``float``).
        seed: Random seed (cast to ``int``).
        use_deepcache: When truthy, ``--enable_deepcache`` is passed to the
            inference script.

    Returns:
        A ``(video_path, status_message)`` tuple. ``video_path`` is None when
        an input is missing or generation fails; failures are reported through
        ``status_message`` rather than raised.
    """
    # Function-scope import, matching the lazy-import style used elsewhere in
    # this file.
    import sys

    # Check the inputs first so a missing upload is reported immediately,
    # without triggering the repository clone and checkpoint download.
    if avatar_img is None:
        return None, "❌ Please upload an avatar image!"
    if audio_wav is None:
        return None, "❌ Please upload an audio file!"

    try:
        setup()

        img_path = str(Path(avatar_img).resolve())
        wav_path = str(Path(audio_wav).resolve())

        # Create video from image + audio
        print("Creating input video...")
        video_path = make_still_video(img_path, wav_path, fps=25)

        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

        # Fixed config path for LatentSync 1.5. The config, checkpoint and
        # temp paths are relative because the command runs with cwd=REPO_DIR.
        cmd = [
            # Use the interpreter running this app so the child process sees
            # the same installed packages, whatever "python" is on PATH.
            sys.executable, "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/stage2.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--video_path", video_path,
            "--audio_path", wav_path,
            "--video_out_path", str(out_path),
            "--inference_steps", str(int(steps)),
            "--guidance_scale", str(float(guidance)),
            "--seed", str(int(seed)),
            "--temp_dir", "temp",
        ]
        if use_deepcache:
            cmd.append("--enable_deepcache")

        print("Generating lip-synced video...")
        run(cmd, cwd=str(REPO_DIR))

        # A zero exit status alone does not prove the output file was written.
        if out_path.exists():
            return str(out_path), "✅ Video generated successfully!"
        return None, "❌ Video generation failed - output file not created"
    except subprocess.CalledProcessError as e:
        error_msg = f"❌ Command failed with return code {e.returncode}"
        return None, error_msg
    except Exception as e:
        # UI boundary: surface the problem in the status box instead of
        # raising.
        return None, f"❌ Error: {str(e)}"
# Gradio Interface - Compatible with Gradio 4.44.1
# Layout: an intro blurb, a row with the two uploads next to the generation
# settings, then the trigger button, a status box and the resulting video.
with gr.Blocks(title="LatentSync Lip Sync") as demo:
    gr.Markdown(
        """
        # 🎬 LatentSync 1.5 - AI Lip Sync Generator
        Upload an avatar image and audio file to generate a lip-synced video!
        **Tips:**
        - Use clear frontal face images for best results
        - Keep audio under 30 seconds for faster processing
        - Higher inference steps = better quality but slower
        """
    )

    with gr.Row():
        # Left column: user uploads. Both components hand file paths (not
        # decoded data) to generate().
        with gr.Column():
            avatar = gr.Image(
                type="filepath",
                label="📷 Avatar Image (JPG/PNG)"
            )
            audio = gr.Audio(
                type="filepath",
                label="🎡 Audio File (WAV)"
            )

        # Right column: knobs forwarded to the inference script.
        with gr.Column():
            gr.Markdown("### ⚙️ Generation Settings")
            steps = gr.Slider(
                10, 40, value=20, step=1,
                label="Inference Steps (Higher = Better Quality)"
            )
            guidance = gr.Slider(
                0.8, 2.0, value=1.0, step=0.1,
                label="Guidance Scale (Higher = Stronger Lip Sync)"
            )
            seed = gr.Number(
                value=1247, precision=0,
                label="Seed (For Reproducibility)"
            )
            deepcache = gr.Checkbox(
                value=True,
                label="Enable DeepCache (Faster - Recommended for T4)"
            )

    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary")
    status = gr.Textbox(label="Status", interactive=False)
    out = gr.Video(label="Generated Video")

    # Input order must match generate()'s parameter order; the outputs match
    # its (video_path, status_message) return tuple.
    btn.click(
        generate,
        inputs=[avatar, audio, steps, guidance, seed, deepcache],
        outputs=[out, status]
    )

    gr.Markdown(
        """
        ---
        ### 📝 Notes:
        - First run will download models (~7GB) - this may take a few minutes
        - Generation takes 30-90 seconds depending on settings
        - Works best with T4 GPU (16GB)
        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
        """
    )
if __name__ == "__main__":
    # Script entry point: enable the request queue (max_size=3), then start
    # serving the interface.
    demo.queue(max_size=3).launch()