Spaces:

banao-tech
/

model-testing

Paused

File size: 6,816 Bytes

3c67f4c
 
2bcec7c
 
 
 
 
 
 
 
8b65f54
1a8b8ad
2bcec7c
1a8b8ad
2bcec7c
 
 
8b65f54
2bcec7c
 
4c48c35
 
 
 
eb63d21
4c48c35
 
eb63d21
4c48c35
eb63d21
4c48c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb63d21
8b65f54
2bcec7c
4c48c35
2bcec7c
d47e052
8b65f54
1a8b8ad
d47e052
4c48c35
 
eb63d21
4c48c35
 
2bcec7c
 
8b65f54
2bcec7c
 
4c48c35
2bcec7c
1a8b8ad
4c48c35
8b65f54
2bcec7c
 
1a8b8ad
2bcec7c
 
 
 
 
 
 
 
 
 
 
 
1a8b8ad
eb63d21
 
 
 
4c48c35
eb63d21
4c48c35
eb63d21
 
 
 
4c48c35
 
eb63d21
 
 
 
4c48c35
eb63d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c48c35
eb63d21
 
 
4c48c35
eb63d21
4c48c35
eb63d21
 
4c48c35
eb63d21
 
4c48c35
2bcec7c
4c48c35
 
eb63d21
 
 
 
 
 
 
 
 
 
 
 
d47e052
1a8b8ad
eb63d21
 
 
4c48c35
eb63d21
 
 
4c48c35
eb63d21
 
 
4c48c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d47e052
4c48c35
d47e052
eb63d21
 
d47e052
eb63d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bcec7c
eb63d21
 
4c48c35

import os
os.environ["OMP_NUM_THREADS"] = "1"
import subprocess
from pathlib import Path
from datetime import datetime
import gradio as gr
from huggingface_hub import snapshot_download

# Directory containing this script; every other path below is derived from it.
ROOT = Path(__file__).parent.resolve()
# Local checkout of the LatentSync repository (cloned on demand by setup()).
REPO_DIR = ROOT / "LatentSync"
# Destination directory for the checkpoints fetched with snapshot_download().
CKPT_DIR = REPO_DIR / "checkpoints"
# Scratch directory holding the intermediate still video and the final result.
TEMP_DIR = REPO_DIR / "temp"

# Hugging Face Hub repo the checkpoints are downloaded from.
# Use 1.5 on T4 16GB
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"

def run(cmd, cwd=None):
    """Echo a command line to stdout, then execute it.

    Args:
        cmd: Sequence of command arguments. Each item is converted with
            ``str`` for the echoed line; the sequence itself is handed to
            ``subprocess.check_call`` unchanged.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    rendered = " ".join(str(arg) for arg in cmd)
    print(rendered)
    subprocess.check_call(cmd, cwd=cwd)

def create_mask_file():
    """Create ``latentsync/utils/mask.png`` inside the repo checkout if missing.

    The mask is a 256x256 8-bit grayscale image: white (255) inside an
    ellipse centred at (128, 180) with radii 90 (x) and 64 (y), black (0)
    everywhere else. An existing file is never overwritten.

    This is best-effort: any failure while building or saving the image
    (including numpy/Pillow not being installed) is reported as a warning
    on stdout instead of being raised.
    """
    mask_dir = REPO_DIR / "latentsync" / "utils"
    mask_path = mask_dir / "mask.png"

    if mask_path.exists():
        return

    mask_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Imported lazily so a missing dependency only produces the warning
        # below rather than breaking module import.
        import numpy as np
        from PIL import Image

        size = 256
        center_x, center_y = 128, 180
        radius_x, radius_y = 90, 64

        # Open grids broadcast to (size, size), so the ellipse test
        # ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1 is evaluated for every pixel in
        # one vectorized expression instead of a 65,536-iteration Python loop.
        ys, xs = np.ogrid[:size, :size]
        inside = (
            ((xs - center_x) / radius_x) ** 2
            + ((ys - center_y) / radius_y) ** 2
        ) <= 1

        # White = inpaint mouth area, black = keep.
        mask = np.zeros((size, size), dtype=np.uint8)
        mask[inside] = 255

        # A 2-D uint8 array is interpreted as mode "L" automatically, so the
        # explicit ``mode`` argument is unnecessary.
        Image.fromarray(mask).save(str(mask_path))
        print(f"✓ Created mask at {mask_path}")
    except Exception as e:
        print(f"Warning: Could not create mask: {e}")

def setup():
    """Prepare everything inference needs on disk.

    Steps, in order:
      1. Shallow-clone the LatentSync repository into ``REPO_DIR`` unless
         that path already exists.
      2. Ensure the checkpoint and temp directories exist.
      3. Create the mask image if it is missing.
      4. Download the checkpoint snapshot from ``HF_CKPT_REPO`` into
         ``CKPT_DIR``.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` fails.
    """
    repo_missing = not REPO_DIR.exists()
    if repo_missing:
        print("Cloning LatentSync repository...")
        clone_cmd = [
            "git", "clone",
            "--depth", "1",
            "https://github.com/bytedance/LatentSync.git",
            str(REPO_DIR),
        ]
        run(clone_cmd)

    # Must come after the clone: creating these first would make REPO_DIR
    # exist and the clone above would be skipped.
    for directory in (CKPT_DIR, TEMP_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # The mask has to be in place before inference is launched.
    create_mask_file()

    print("Downloading model checkpoints...")
    download_options = {
        "repo_id": HF_CKPT_REPO,
        "local_dir": str(CKPT_DIR),
        "local_dir_use_symlinks": False,
    }
    snapshot_download(**download_options)
    print("✓ Setup complete")

def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
    """Render a still image plus an audio track into an MP4 via ffmpeg.

    The image is looped, scaled to cover 256x256 and centre-cropped to that
    size; the output ends with the shorter input (``-shortest``). Video is
    encoded with libx264 / yuv420p, audio with AAC.

    Args:
        image_path: Path of the image to loop.
        audio_path: Path of the audio file to mux in.
        fps: Output frame rate.

    Returns:
        Path of the written file inside ``TEMP_DIR``, named with a
        second-resolution timestamp.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    target = TEMP_DIR / f"still_{stamp}.mp4"

    input_args = ["-loop", "1", "-i", image_path, "-i", audio_path]
    encode_args = [
        "-shortest",
        "-r", str(fps),
        "-vf", "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
        "-pix_fmt", "yuv420p",
        "-c:v", "libx264",
        "-c:a", "aac",
    ]

    run(["ffmpeg", "-y", *input_args, *encode_args, str(target)])
    return str(target)

def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
    """Gradio callback: produce a lip-synced video from an image and audio.

    Inputs are validated first, so a missing upload is reported immediately
    without triggering the repository clone and checkpoint download that
    ``setup()`` performs.

    Args:
        avatar_img: File path of the uploaded avatar image, or ``None``.
        audio_wav: File path of the uploaded audio, or ``None``.
        steps: Number of inference steps (converted with ``int``).
        guidance: Guidance scale (converted with ``float``).
        seed: Random seed (converted with ``int``).
        use_deepcache: When truthy, ``--enable_deepcache`` is passed to the
            inference script.

    Returns:
        A ``(video_path, status_message)`` tuple. ``video_path`` is ``None``
        whenever validation, setup or generation fails; failures are
        reported through the status message rather than raised.
    """
    # Cheap checks first: never pay for setup() on a request that cannot run.
    if avatar_img is None:
        return None, "❌ Please upload an avatar image!"
    if audio_wav is None:
        return None, "❌ Please upload an audio file!"

    try:
        setup()

        # Absolute paths are required because inference runs with
        # cwd=REPO_DIR, which would break relative upload paths.
        img_path = str(Path(avatar_img).resolve())
        wav_path = str(Path(audio_wav).resolve())

        # Create video from image + audio
        print("Creating input video...")
        video_path = make_still_video(img_path, wav_path, fps=25)

        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

        # Config/checkpoint/temp paths are relative to REPO_DIR (the cwd of
        # the subprocess). Fixed config path for LatentSync 1.5.
        cmd = [
            "python", "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/stage2.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--video_path", video_path,
            "--audio_path", wav_path,
            "--video_out_path", str(out_path),
            "--inference_steps", str(int(steps)),
            "--guidance_scale", str(float(guidance)),
            "--seed", str(int(seed)),
            "--temp_dir", "temp",
        ]

        if use_deepcache:
            cmd.append("--enable_deepcache")

        print("Generating lip-synced video...")
        run(cmd, cwd=str(REPO_DIR))

        # The script can exit 0 without writing the file, so verify it.
        if out_path.exists():
            return str(out_path), "✅ Video generated successfully!"
        return None, "❌ Video generation failed - output file not created"

    except subprocess.CalledProcessError as e:
        return None, f"❌ Command failed with return code {e.returncode}"
    except Exception as e:
        # UI boundary: surface any other failure as a status message.
        return None, f"❌ Error: {str(e)}"

# Gradio UI definition (written against Gradio 4.44.1).
# Components are laid out in creation order, so the order below is significant.
with gr.Blocks(title="LatentSync Lip Sync") as demo:
    # Page header and usage tips.
    gr.Markdown(
        value="""
        # 🎬 LatentSync 1.5 - AI Lip Sync Generator
        
        Upload an avatar image and audio file to generate a lip-synced video!
        
        **Tips:**
        - Use clear frontal face images for best results
        - Keep audio under 30 seconds for faster processing
        - Higher inference steps = better quality but slower
        """
    )

    with gr.Row():
        # Left column: user uploads, delivered to the callback as file paths.
        with gr.Column():
            avatar = gr.Image(label="📷 Avatar Image (JPG/PNG)", type="filepath")
            audio = gr.Audio(label="🎵 Audio File (WAV)", type="filepath")

        # Right column: generation parameters.
        with gr.Column():
            gr.Markdown(value="### ⚙️ Generation Settings")
            steps = gr.Slider(
                minimum=10,
                maximum=40,
                value=20,
                step=1,
                label="Inference Steps (Higher = Better Quality)",
            )
            guidance = gr.Slider(
                minimum=0.8,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Guidance Scale (Higher = Stronger Lip Sync)",
            )
            seed = gr.Number(
                label="Seed (For Reproducibility)",
                value=1247,
                precision=0,
            )
            deepcache = gr.Checkbox(
                label="Enable DeepCache (Faster - Recommended for T4)",
                value=True,
            )

    btn = gr.Button(value="🚀 Generate Lip-Synced Video", variant="primary")

    status = gr.Textbox(label="Status", interactive=False)
    out = gr.Video(label="Generated Video")

    # generate() returns (video_path, status_message), hence this output order.
    btn.click(
        fn=generate,
        inputs=[avatar, audio, steps, guidance, seed, deepcache],
        outputs=[out, status],
    )

    # Footer notes.
    gr.Markdown(
        value="""
        ---
        ### 📝 Notes:
        - First run will download models (~7GB) - this may take a few minutes
        - Generation takes 30-90 seconds depending on settings
        - Works best with T4 GPU (16GB)
        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
        """
    )

if __name__ == "__main__":
    # Enable the request queue with max_size=3, then start the app.
    # queue() returns the Blocks instance, so the two calls can be chained.
    demo.queue(max_size=3).launch()