import os

# Pin OpenMP to one thread BEFORE any numeric library is imported;
# avoids CPU thread oversubscription on small shared instances (e.g. a T4 Space).
os.environ["OMP_NUM_THREADS"] = "1"

import subprocess
import sys
from datetime import datetime
from pathlib import Path

import gradio as gr
from huggingface_hub import snapshot_download

ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT / "LatentSync"
CKPT_DIR = REPO_DIR / "checkpoints"
TEMP_DIR = REPO_DIR / "temp"

# Use 1.5 on T4 16GB
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"

# Run-once guard: setup() is invoked from generate() on every click, but the
# clone / checkpoint download only needs to happen once per process.
_SETUP_DONE = False


def run(cmd, cwd=None):
    """Echo *cmd* (space-joined) and execute it.

    Raises subprocess.CalledProcessError on a non-zero exit status.
    """
    print(" ".join(map(str, cmd)))
    subprocess.check_call(cmd, cwd=cwd)


def create_mask_file():
    """Create the missing mask.png file"""
    mask_dir = REPO_DIR / "latentsync" / "utils"
    mask_path = mask_dir / "mask.png"
    if mask_path.exists():
        return
    mask_dir.mkdir(parents=True, exist_ok=True)
    # Create mask using numpy and PIL
    try:
        import numpy as np
        from PIL import Image

        # 256x256 mask (white = inpaint mouth area, black = keep).
        # Ellipse for the mouth region (lower face):
        #   ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1
        # Vectorized with ogrid — same pixels as the original per-pixel loop,
        # but one C-level pass instead of 65k Python iterations.
        center_x, center_y = 128, 180
        yy, xx = np.ogrid[:256, :256]
        inside = ((xx - center_x) / 90) ** 2 + ((yy - center_y) / 64) ** 2 <= 1
        mask = np.where(inside, 255, 0).astype(np.uint8)
        Image.fromarray(mask, mode='L').save(str(mask_path))
        print(f"✓ Created mask at {mask_path}")
    except Exception as e:
        # Best-effort: log and continue; inference may still work without it.
        print(f"Warning: Could not create mask: {e}")


def setup():
    """Clone the LatentSync repo, create dirs/mask, and download checkpoints.

    Idempotent and cheap after the first successful call (module-level guard).
    """
    global _SETUP_DONE
    if _SETUP_DONE:
        return
    if not REPO_DIR.exists():
        print("Cloning LatentSync repository...")
        run(["git", "clone", "--depth", "1",
             "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
    CKPT_DIR.mkdir(parents=True, exist_ok=True)
    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    # Create mask file before running inference
    create_mask_file()
    # Download checkpoints
    print("Downloading model checkpoints...")
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        # NOTE: deprecated (ignored) on recent huggingface_hub; kept so older
        # versions still materialize real files instead of cache symlinks.
        local_dir_use_symlinks=False,
    )
    print("✓ Setup complete")
    _SETUP_DONE = True


def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
    """Convert static image + audio to video.

    Loops the image for the duration of the audio, center-crops to 256x256,
    and returns the path of the resulting H.264/AAC mp4 in TEMP_DIR.
    """
    out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
    cmd = [
        "ffmpeg", "-y",
        "-loop", "1",          # repeat the single image as video frames
        "-i", image_path,
        "-i", audio_path,
        "-shortest",           # stop when the audio ends
        "-r", str(fps),
        "-vf", "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
        "-pix_fmt", "yuv420p",
        "-c:v", "libx264",
        "-c:a", "aac",
        str(out_path),
    ]
    run(cmd)
    return str(out_path)


def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
    """Gradio callback: produce a lip-synced video from an image + audio.

    Returns a (video_path_or_None, status_message) tuple for the UI outputs.
    """
    try:
        setup()
        if avatar_img is None:
            return None, "❌ Please upload an avatar image!"
        if audio_wav is None:
            return None, "❌ Please upload an audio file!"

        img_path = str(Path(avatar_img).resolve())
        wav_path = str(Path(audio_wav).resolve())

        # Create video from image + audio
        print("Creating input video...")
        video_path = make_still_video(img_path, wav_path, fps=25)

        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

        # Fixed config path for LatentSync 1.5.
        # sys.executable (not bare "python") guarantees the subprocess uses the
        # same interpreter/venv as this app.
        cmd = [
            sys.executable, "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/stage2.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--video_path", video_path,
            "--audio_path", wav_path,
            "--video_out_path", str(out_path),
            "--inference_steps", str(int(steps)),
            "--guidance_scale", str(float(guidance)),
            "--seed", str(int(seed)),
            "--temp_dir", "temp",
        ]
        if use_deepcache:
            cmd.append("--enable_deepcache")

        print("Generating lip-synced video...")
        run(cmd, cwd=str(REPO_DIR))

        if out_path.exists():
            return str(out_path), "✅ Video generated successfully!"
        else:
            return None, "❌ Video generation failed - output file not created"

    except subprocess.CalledProcessError as e:
        error_msg = f"❌ Command failed with return code {e.returncode}"
        return None, error_msg
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# Gradio Interface - Compatible with Gradio 4.44.1
with gr.Blocks(title="LatentSync Lip Sync") as demo:
    gr.Markdown(
        """
        # 🎬 LatentSync 1.5 - AI Lip Sync Generator

        Upload an avatar image and audio file to generate a lip-synced video!

        **Tips:**
        - Use clear frontal face images for best results
        - Keep audio under 30 seconds for faster processing
        - Higher inference steps = better quality but slower
        """
    )

    with gr.Row():
        with gr.Column():
            avatar = gr.Image(
                type="filepath",
                label="📷 Avatar Image (JPG/PNG)"
            )
            audio = gr.Audio(
                type="filepath",
                label="🎵 Audio File (WAV)"
            )

        with gr.Column():
            gr.Markdown("### ⚙️ Generation Settings")
            steps = gr.Slider(
                10, 40, value=20, step=1,
                label="Inference Steps (Higher = Better Quality)"
            )
            guidance = gr.Slider(
                0.8, 2.0, value=1.0, step=0.1,
                label="Guidance Scale (Higher = Stronger Lip Sync)"
            )
            seed = gr.Number(
                value=1247, precision=0,
                label="Seed (For Reproducibility)"
            )
            deepcache = gr.Checkbox(
                value=True,
                label="Enable DeepCache (Faster - Recommended for T4)"
            )

    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary")
    status = gr.Textbox(label="Status", interactive=False)
    out = gr.Video(label="Generated Video")

    btn.click(
        generate,
        inputs=[avatar, audio, steps, guidance, seed, deepcache],
        outputs=[out, status]
    )

    gr.Markdown(
        """
        ---
        ### 📝 Notes:
        - First run will download models (~7GB) - this may take a few minutes
        - Generation takes 30-90 seconds depending on settings
        - Works best with T4 GPU (16GB)
        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
        """
    )

if __name__ == "__main__":
    demo.queue(max_size=3)
    demo.launch()