Spaces:

banao-tech
/

model-testing

Build error

App Files Files Community

model-testing / app_LatentSync.py

banao-tech

Rename app.py to app_LatentSync.py

940bb2b verified 18 days ago

raw

history blame contribute delete

6.82 kB

	import os
	os.environ["OMP_NUM_THREADS"] = "1"
	import subprocess
	from pathlib import Path
	from datetime import datetime
	import gradio as gr
	from huggingface_hub import snapshot_download

	# Directory containing this script; every other path below is derived from it.
	ROOT = Path(__file__).parent.resolve()
	# Location the LatentSync repository is cloned into by setup().
	REPO_DIR = ROOT / "LatentSync"
	# Download target for the model checkpoints (see snapshot_download in setup()).
	CKPT_DIR = REPO_DIR / "checkpoints"
	# Scratch directory that receives the intermediate and result .mp4 files.
	TEMP_DIR = REPO_DIR / "temp"

	# Use 1.5 on T4 16GB
	# Hugging Face Hub repo id the checkpoints are downloaded from.
	HF_CKPT_REPO = "ByteDance/LatentSync-1.5"

	def run(cmd, cwd=None):
	    """Echo a command, then execute it and fail on a non-zero exit status.

	    Args:
	        cmd: Sequence of program arguments. Items are converted with ``str``
	            for the echo only; the sequence itself is handed to
	            ``subprocess.check_call`` unchanged.
	        cwd: Optional working directory for the child process.

	    Raises:
	        subprocess.CalledProcessError: If the command exits with a non-zero
	            return code.
	    """
	    import shlex

	    # Shell-quote every argument so paths containing spaces stay unambiguous
	    # and the logged line can be pasted into a shell as-is. Flush so the echo
	    # is written before the child process starts producing its own output.
	    print(shlex.join(map(str, cmd)), flush=True)
	    subprocess.check_call(cmd, cwd=cwd)

	def create_mask_file():
	    """Create ``latentsync/utils/mask.png`` under ``REPO_DIR`` if it is missing.

	    The mask is a 256x256 8-bit grayscale image: pixels inside an ellipse
	    centred at (128, 180) with radii 90 (horizontal) and 64 (vertical) are
	    white (255, inpaint mouth area), all others are black (0, keep).
	    An existing file is never overwritten.

	    Building and saving the image is best-effort: any failure there (for
	    example numpy or Pillow not being installed) is reported with a printed
	    warning instead of being raised.
	    """
	    mask_dir = REPO_DIR / "latentsync" / "utils"
	    mask_path = mask_dir / "mask.png"

	    if mask_path.exists():
	        return

	    mask_dir.mkdir(parents=True, exist_ok=True)

	    size = 256
	    center_x, center_y = 128, 180
	    radius_x, radius_y = 90, 64

	    try:
	        # Imported here so a missing optional dependency only produces the
	        # warning below rather than breaking the import of this module.
	        import numpy as np
	        from PIL import Image

	        # Ellipse: ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1, evaluated for the whole
	        # pixel grid at once instead of looping over 256*256 coordinates.
	        ys, xs = np.ogrid[:size, :size]
	        inside = (
	            ((xs - center_x) / radius_x) ** 2
	            + ((ys - center_y) / radius_y) ** 2
	        ) <= 1
	        mask = np.where(inside, 255, 0).astype(np.uint8)

	        # A 2-D uint8 array is read as mode "L"; the explicit ``mode``
	        # argument is deprecated in recent Pillow releases.
	        Image.fromarray(mask).save(str(mask_path))
	        print(f"✓ Created mask at {mask_path}")
	    except Exception as e:
	        print(f"Warning: Could not create mask: {e}")

	def setup():
	    """Prepare everything inference needs on disk.

	    Clones the LatentSync repository when ``REPO_DIR`` does not exist yet,
	    ensures the checkpoint and temp directories exist, creates the mask
	    image if it is missing, and downloads the checkpoints from
	    ``HF_CKPT_REPO`` into ``CKPT_DIR``.
	    """
	    repo_present = REPO_DIR.exists()
	    if not repo_present:
	        print("Cloning LatentSync repository...")
	        clone_cmd = [
	            "git", "clone", "--depth", "1",
	            "https://github.com/bytedance/LatentSync.git",
	            str(REPO_DIR),
	        ]
	        run(clone_cmd)

	    for directory in (CKPT_DIR, TEMP_DIR):
	        directory.mkdir(parents=True, exist_ok=True)

	    # The mask has to be in place before inference is started.
	    create_mask_file()

	    print("Downloading model checkpoints...")
	    download_options = {
	        "repo_id": HF_CKPT_REPO,
	        "local_dir": str(CKPT_DIR),
	        "local_dir_use_symlinks": False,
	    }
	    snapshot_download(**download_options)
	    print("✓ Setup complete")

	def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
	    """Render a still image plus an audio track into an .mp4 via ffmpeg.

	    The image is looped, scaled and centre-cropped to 256x256, and the
	    output stops with the shorter input (``-shortest``). The file is written
	    to ``TEMP_DIR`` under a timestamped ``still_*.mp4`` name.

	    Args:
	        image_path: Path of the image to loop.
	        audio_path: Path of the audio file.
	        fps: Output frame rate.

	    Returns:
	        The path of the created video as a string.
	    """
	    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
	    target = str(TEMP_DIR / f"still_{stamp}.mp4")

	    source_args = ["-loop", "1", "-i", image_path, "-i", audio_path]
	    output_args = [
	        "-shortest",
	        "-r", str(fps),
	        "-vf", "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
	        "-pix_fmt", "yuv420p",
	        "-c:v", "libx264",
	        "-c:a", "aac",
	    ]

	    run(["ffmpeg", "-y", *source_args, *output_args, target])
	    return target

	def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
	    """Gradio callback: turn an avatar image and an audio file into a lip-synced video.

	    Args:
	        avatar_img: File path of the uploaded image, or ``None``.
	        audio_wav: File path of the uploaded audio, or ``None``.
	        steps: Number of inference steps (converted with ``int``).
	        guidance: Guidance scale (converted with ``float``).
	        seed: Random seed (converted with ``int``).
	        use_deepcache: When truthy, ``--enable_deepcache`` is passed to the
	            inference script.

	    Returns:
	        A ``(video_path, status_message)`` tuple. ``video_path`` is ``None``
	        whenever validation or generation fails; errors are reported through
	        the status message rather than raised.
	    """
	    # Validate before setup(): setup may clone a repository and download the
	    # model checkpoints, which must not happen just to report a missing input.
	    if avatar_img is None:
	        return None, "❌ Please upload an avatar image!"
	    if audio_wav is None:
	        return None, "❌ Please upload an audio file!"

	    try:
	        setup()

	        img_path = str(Path(avatar_img).resolve())
	        wav_path = str(Path(audio_wav).resolve())

	        # Create video from image + audio
	        print("Creating input video...")
	        video_path = make_still_video(img_path, wav_path, fps=25)

	        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

	        # Fixed config path for LatentSync 1.5. The config, checkpoint and
	        # temp paths are relative because the command runs with cwd=REPO_DIR.
	        cmd = [
	            "python", "-m", "scripts.inference",
	            "--unet_config_path", "configs/unet/stage2.yaml",
	            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
	            "--video_path", video_path,
	            "--audio_path", wav_path,
	            "--video_out_path", str(out_path),
	            "--inference_steps", str(int(steps)),
	            "--guidance_scale", str(float(guidance)),
	            "--seed", str(int(seed)),
	            "--temp_dir", "temp",
	        ]

	        if use_deepcache:
	            cmd.append("--enable_deepcache")

	        print("Generating lip-synced video...")
	        run(cmd, cwd=str(REPO_DIR))

	        # A zero exit code alone does not prove the file was written.
	        if not out_path.exists():
	            return None, "❌ Video generation failed - output file not created"
	        return str(out_path), "✅ Video generated successfully!"

	    except subprocess.CalledProcessError as e:
	        error_msg = f"❌ Command failed with return code {e.returncode}"
	        return None, error_msg
	    except Exception as e:
	        # UI boundary: surface any other failure in the status box.
	        return None, f"❌ Error: {str(e)}"

	# Gradio Interface - Compatible with Gradio 4.44.1
	# Layout: intro text, a two-column row (inputs | settings), the generate
	# button, the status box and video output, then closing notes.
	with gr.Blocks(title="LatentSync Lip Sync") as demo:
	    gr.Markdown(
	        """
	# 🎬 LatentSync 1.5 - AI Lip Sync Generator

	Upload an avatar image and audio file to generate a lip-synced video!

	Tips:
	- Use clear frontal face images for best results
	- Keep audio under 30 seconds for faster processing
	- Higher inference steps = better quality but slower
	"""
	    )

	    with gr.Row():
	        # Left column: the two uploads, both handed to the callback as file paths.
	        with gr.Column():
	            avatar = gr.Image(type="filepath", label="📷 Avatar Image (JPG/PNG)")
	            audio = gr.Audio(type="filepath", label="🎵 Audio File (WAV)")

	        # Right column: generation parameters.
	        with gr.Column():
	            gr.Markdown("### ⚙️ Generation Settings")
	            steps = gr.Slider(
	                minimum=10,
	                maximum=40,
	                value=20,
	                step=1,
	                label="Inference Steps (Higher = Better Quality)",
	            )
	            guidance = gr.Slider(
	                minimum=0.8,
	                maximum=2.0,
	                value=1.0,
	                step=0.1,
	                label="Guidance Scale (Higher = Stronger Lip Sync)",
	            )
	            seed = gr.Number(value=1247, precision=0, label="Seed (For Reproducibility)")
	            deepcache = gr.Checkbox(
	                value=True,
	                label="Enable DeepCache (Faster - Recommended for T4)",
	            )

	    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary")

	    status = gr.Textbox(label="Status", interactive=False)
	    out = gr.Video(label="Generated Video")

	    # generate() returns (video_path, status_message), hence the output order.
	    btn.click(
	        fn=generate,
	        inputs=[avatar, audio, steps, guidance, seed, deepcache],
	        outputs=[out, status],
	    )

	    gr.Markdown(
	        """
	---
	### 📝 Notes:
	- First run will download models (~7GB) - this may take a few minutes
	- Generation takes 30-90 seconds depending on settings
	- Works best with T4 GPU (16GB)
	- Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
	"""
	    )

	if __name__ == "__main__":
	    # Enable the request queue (at most 3 waiting jobs), then start the server.
	    demo.queue(max_size=3).launch()