# LatentSync 1.5 lip-sync demo app for Hugging Face Spaces.
import os

# Cap OpenMP threads BEFORE importing any library that spins up a thread
# pool (numpy/torch are pulled in transitively below) — the env var is only
# read at library initialisation time.
os.environ["OMP_NUM_THREADS"] = "1"

import subprocess
from datetime import datetime
from pathlib import Path

import gradio as gr
from huggingface_hub import snapshot_download

# Directory layout: the LatentSync repo is cloned as a sibling of this script.
ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT / "LatentSync"
CKPT_DIR = REPO_DIR / "checkpoints"
TEMP_DIR = REPO_DIR / "temp"

# Use 1.5 on T4 16GB
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
def run(cmd, cwd=None):
    """Echo *cmd* to stdout and execute it, failing loudly on error.

    Args:
        cmd: Command and arguments as a list (executed without a shell,
            so no injection risk from interpolated paths).
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    # str-map so Path objects in cmd print cleanly in the echoed line.
    print(" ".join(map(str, cmd)))
    subprocess.check_call(cmd, cwd=cwd)
def create_mask_file(mask_path=None):
    """Create the mouth-region mask PNG that LatentSync inference expects.

    Writes a 256x256 grayscale image where a white ellipse over the lower
    face marks the region to inpaint and black pixels are kept. Best-effort:
    a no-op if the file already exists, and failures are logged rather than
    raised so setup can continue.

    Args:
        mask_path: Optional output path override. Defaults to
            ``REPO_DIR/latentsync/utils/mask.png`` (the location the
            LatentSync code expects).
    """
    if mask_path is None:
        mask_path = REPO_DIR / "latentsync" / "utils" / "mask.png"
    mask_path = Path(mask_path)
    if mask_path.exists():
        return
    mask_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        import numpy as np
        from PIL import Image

        # Vectorized ellipse test (replaces a 256x256 pure-Python loop):
        # points with ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1 form the mouth
        # region centred at (128, 180) with radii (90, 64).
        yy, xx = np.ogrid[:256, :256]
        inside = ((xx - 128) / 90) ** 2 + ((yy - 180) / 64) ** 2 <= 1
        mask = np.where(inside, 255, 0).astype(np.uint8)
        Image.fromarray(mask, mode="L").save(str(mask_path))
        print(f"✅ Created mask at {mask_path}")
    except Exception as e:
        # Deliberate best-effort: inference may still work if the repo
        # ships its own mask; just surface the problem in the logs.
        print(f"Warning: Could not create mask: {e}")
def setup():
    """Clone the LatentSync repo, prepare dirs/mask, and fetch checkpoints.

    Idempotent: the git clone is skipped when the repo directory already
    exists, directory creation uses ``exist_ok``, the mask is only written
    if missing, and ``snapshot_download`` skips files already on disk.
    """
    if not REPO_DIR.exists():
        print("Cloning LatentSync repository...")
        run([
            "git", "clone", "--depth", "1",
            "https://github.com/bytedance/LatentSync.git", str(REPO_DIR),
        ])
    CKPT_DIR.mkdir(parents=True, exist_ok=True)
    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    # The inference code expects latentsync/utils/mask.png to exist.
    create_mask_file()
    print("Downloading model checkpoints...")
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        # Real files (not symlinks into the HF cache) so the relative
        # checkpoint path passed to scripts.inference resolves.
        local_dir_use_symlinks=False,
    )
    print("✅ Setup complete")
def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
    """Render a static image into an MP4 lasting as long as the audio.

    The image is scaled/cropped to 256x256 (LatentSync's working
    resolution) and looped for the duration of the audio track.

    Args:
        image_path: Path to the still image (JPG/PNG).
        audio_path: Path to the audio file.
        fps: Output frame rate.

    Returns:
        Path (as str) of the generated MP4 inside TEMP_DIR.

    Raises:
        subprocess.CalledProcessError: If ffmpeg fails.
    """
    # %f (microseconds) prevents filename collisions when two requests
    # land within the same second.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    out_path = TEMP_DIR / f"still_{stamp}.mp4"
    cmd = [
        "ffmpeg", "-y",
        "-loop", "1", "-i", image_path,   # loop the still frame
        "-i", audio_path,
        "-shortest",                       # stop when the audio ends
        "-r", str(fps),
        # Cover-fit to 256x256: upscale to fill, then centre-crop.
        "-vf", "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
        "-pix_fmt", "yuv420p",
        "-c:v", "libx264",
        "-c:a", "aac",
        str(out_path),
    ]
    run(cmd)
    return str(out_path)
def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
    """Gradio callback: produce a lip-synced video from an image + audio.

    Args:
        avatar_img: Filepath of the uploaded avatar image (or None).
        audio_wav: Filepath of the uploaded audio (or None).
        steps: Diffusion inference steps (coerced to int).
        guidance: Guidance scale (coerced to float).
        seed: RNG seed for reproducibility (coerced to int).
        use_deepcache: Whether to pass --enable_deepcache for speed.

    Returns:
        Tuple of (video path or None, status message) matching the two
        UI outputs wired to this callback.
    """
    try:
        # Validate inputs BEFORE setup() so a missing upload does not
        # trigger the multi-GB checkpoint download.
        if avatar_img is None:
            return None, "❌ Please upload an avatar image!"
        if audio_wav is None:
            return None, "❌ Please upload an audio file!"
        setup()
        img_path = str(Path(avatar_img).resolve())
        wav_path = str(Path(audio_wav).resolve())
        # LatentSync operates on video, so first loop the still image
        # into a clip matching the audio duration.
        print("Creating input video...")
        video_path = make_still_video(img_path, wav_path, fps=25)
        out_name = f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
        out_path = TEMP_DIR / out_name
        # Config/checkpoint paths are relative to REPO_DIR, which is the
        # cwd of the subprocess below (LatentSync 1.5 layout).
        cmd = [
            "python", "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/stage2.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--video_path", video_path,
            "--audio_path", wav_path,
            "--video_out_path", str(out_path),
            "--inference_steps", str(int(steps)),
            "--guidance_scale", str(float(guidance)),
            "--seed", str(int(seed)),
            "--temp_dir", "temp",
        ]
        if use_deepcache:
            cmd.append("--enable_deepcache")
        print("Generating lip-synced video...")
        run(cmd, cwd=str(REPO_DIR))
        if out_path.exists():
            return str(out_path), "✅ Video generated successfully!"
        return None, "❌ Video generation failed - output file not created"
    except subprocess.CalledProcessError as e:
        return None, f"❌ Command failed with return code {e.returncode}"
    except Exception as e:
        return None, f"❌ Error: {e}"
# Gradio Interface - Compatible with Gradio 4.44.1
with gr.Blocks(title="LatentSync Lip Sync") as demo:
    gr.Markdown(
        """
        # 🎬 LatentSync 1.5 - AI Lip Sync Generator
        Upload an avatar image and audio file to generate a lip-synced video!
        **Tips:**
        - Use clear frontal face images for best results
        - Keep audio under 30 seconds for faster processing
        - Higher inference steps = better quality but slower
        """
    )
    with gr.Row():
        with gr.Column():
            # Both inputs are handed to generate() as filepaths.
            avatar = gr.Image(
                type="filepath",
                label="📷 Avatar Image (JPG/PNG)",
            )
            audio = gr.Audio(
                type="filepath",
                label="🎵 Audio File (WAV)",
            )
        with gr.Column():
            gr.Markdown("### ⚙️ Generation Settings")
            steps = gr.Slider(
                10, 40, value=20, step=1,
                label="Inference Steps (Higher = Better Quality)",
            )
            guidance = gr.Slider(
                0.8, 2.0, value=1.0, step=0.1,
                label="Guidance Scale (Higher = Stronger Lip Sync)",
            )
            seed = gr.Number(
                value=1247, precision=0,
                label="Seed (For Reproducibility)",
            )
            deepcache = gr.Checkbox(
                value=True,
                label="Enable DeepCache (Faster - Recommended for T4)",
            )
    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary")
    status = gr.Textbox(label="Status", interactive=False)
    out = gr.Video(label="Generated Video")
    btn.click(
        generate,
        inputs=[avatar, audio, steps, guidance, seed, deepcache],
        outputs=[out, status],
    )
    gr.Markdown(
        """
        ---
        ### 📝 Notes:
        - First run will download models (~7GB) - this may take a few minutes
        - Generation takes 30-90 seconds depending on settings
        - Works best with T4 GPU (16GB)
        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
        """
    )

if __name__ == "__main__":
    # Small queue keeps a T4 from oversubscribing; requests beyond 3 are rejected.
    demo.queue(max_size=3)
    demo.launch()