Spaces:

banao-tech
/

model-testing

Build error

App Files Files Community

banao-tech committed 25 days ago

Commit

2bcec7c

verified ·

1 Parent(s): 7f287c5

Create app.py

Browse files

Files changed (1) hide show

app.py +184 -0

app.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import os
+import sys
+import subprocess
+from pathlib import Path
+from datetime import datetime
+import gradio as gr
+from huggingface_hub import snapshot_download
+# -----------------------------
+# Paths and model configuration
+# -----------------------------
+ROOT = Path(__file__).parent.resolve()  # directory containing this script
+REPO_DIR = ROOT / "LatentSync"  # destination of the git clone done by ensure_repo()
+CHECKPOINTS_DIR = REPO_DIR / "checkpoints"  # local_dir passed to snapshot_download
+TEMP_DIR = REPO_DIR / "temp"  # scratch directory for the generated still_*/result_* mp4 files
+# Hugging Face repo with the LatentSync 1.5 checkpoints (chosen to fit a T4 with 16 GB VRAM).
+HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
+# For LatentSync 1.5 the config is typically stage2.yaml (256 resolution).
+# (LatentSync ships multiple configs; stage2_512.yaml is for 1.6 / 512 training.)
+CONFIG_REL_PATH = Path("configs/unet/stage2.yaml")  # joined onto REPO_DIR in latentsync_infer()
+CKPT_REL_PATH = Path("checkpoints/latentsync_unet.pt")  # joined onto REPO_DIR in latentsync_infer()
+def run(cmd, cwd=None):
+    print("Running:", " ".join(map(str, cmd)))
+    subprocess.check_call(cmd, cwd=cwd)
+def ensure_repo():
+    if not REPO_DIR.exists():
+        run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
+def ensure_checkpoints():
+    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
+    # Download checkpoint + whisper tiny into LatentSync/checkpoints
+    # HF repo tree includes `latentsync_unet.pt` and `whisper/...` :contentReference[oaicite:4]{index=4}
+    snapshot_download(
+        repo_id=HF_CKPT_REPO,
+        local_dir=str(CHECKPOINTS_DIR),
+        local_dir_use_symlinks=False,
+        allow_patterns=[
+            "latentsync_unet.pt",
+            "whisper/*",
+        ],
+    )
+    ckpt = CHECKPOINTS_DIR / "latentsync_unet.pt"
+    whisper_tiny = CHECKPOINTS_DIR / "whisper" / "tiny.pt"
+    if not ckpt.exists():
+        raise FileNotFoundError(f"Missing checkpoint: {ckpt}")
+    if not whisper_tiny.exists():
+        raise FileNotFoundError(f"Missing whisper tiny: {whisper_tiny}")
+def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
+    TEMP_DIR.mkdir(parents=True, exist_ok=True)
+    out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
+    # Create a video by looping the image and cutting to audio length.
+    # Also scale/crop to a square size (256) to match stage2.yaml typical setting.
+    # If you switch to 1.6 later, you'd scale/crop to 512 and use stage2_512.yaml.
+    cmd = [
+        "ffmpeg", "-y",
+        "-loop", "1",
+        "-i", image_path,
+        "-i", audio_path,
+        "-shortest",
+        "-r", str(fps),
+        "-vf", "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
+        "-pix_fmt", "yuv420p",
+        "-c:v", "libx264",
+        "-c:a", "aac",
+        str(out_path),
+    ]
+    run(cmd)
+    return str(out_path)
+def latentsync_infer(video_path: str, audio_path: str, inference_steps: int, guidance_scale: float, seed: int) -> str:
+    # Import LatentSync inference code
+    sys.path.insert(0, str(REPO_DIR))
+    os.chdir(str(REPO_DIR))
+    from omegaconf import OmegaConf
+    from scripts.inference import main
+    import argparse
+    config_path = (REPO_DIR / CONFIG_REL_PATH).resolve()
+    ckpt_path = (REPO_DIR / CKPT_REL_PATH).resolve()
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config not found: {config_path}")
+    if not ckpt_path.exists():
+        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
+    TEMP_DIR.mkdir(parents=True, exist_ok=True)
+    out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
+    config = OmegaConf.load(str(config_path))
+    config["run"].update(
+        {
+            "guidance_scale": float(guidance_scale),
+            "inference_steps": int(inference_steps),
+        }
+    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inference_ckpt_path", type=str, required=True)
+    parser.add_argument("--video_path", type=str, required=True)
+    parser.add_argument("--audio_path", type=str, required=True)
+    parser.add_argument("--video_out_path", type=str, required=True)
+    parser.add_argument("--inference_steps", type=int, default=20)
+    parser.add_argument("--guidance_scale", type=float, default=1.5)
+    parser.add_argument("--temp_dir", type=str, default="temp")
+    parser.add_argument("--seed", type=int, default=1247)
+    parser.add_argument("--enable_deepcache", action="store_true")
+    args = parser.parse_args(
+        [
+            "--inference_ckpt_path",
+            str(ckpt_path),
+            "--video_path",
+            str(Path(video_path).resolve()),
+            "--audio_path",
+            str(Path(audio_path).resolve()),
+            "--video_out_path",
+            str(out_path.resolve()),
+            "--inference_steps",
+            str(inference_steps),
+            "--guidance_scale",
+            str(guidance_scale),
+            "--seed",
+            str(seed),
+            "--temp_dir",
+            "temp",
+            "--enable_deepcache",
+        ]
+    )
+    main(config=config, args=args)
+    return str(out_path)
+def generate(avatar_img, audio_wav, inference_steps, guidance_scale, seed):
+    ensure_repo()
+    ensure_checkpoints()
+    # avatar_img is a filepath (type="filepath")
+    # audio_wav is a filepath (type="filepath")
+    still_video = make_still_video(avatar_img, audio_wav, fps=25)
+    result = latentsync_infer(still_video, audio_wav, inference_steps, guidance_scale, seed)
+    return result
+with gr.Blocks(title="LatentSync (avatar.jpg + audio.wav → lip-sync mp4)") as demo:
+    gr.Markdown(
+        """
+# LatentSync (HF Space)
+Upload **avatar.jpg** + **audio.wav** → get lip-sync **mp4**.
+(Uses **LatentSync 1.5** to fit **T4 16GB VRAM**.)
+"""
+    )
+    with gr.Row():
+        avatar = gr.Image(label="Avatar Image (jpg/png)", type="filepath")
+        audio = gr.Audio(label="Audio (wav)", type="filepath")
+    with gr.Row():
+        guidance = gr.Slider(1.0, 3.0, value=1.5, step=0.1, label="Guidance Scale")
+        steps = gr.Slider(10, 50, value=20, step=1, label="Inference Steps")
+        seed = gr.Number(value=1247, precision=0, label="Seed")
+    btn = gr.Button("Generate Lip-Sync Video")
+    out = gr.Video(label="Output MP4")
+    btn.click(fn=generate, inputs=[avatar, audio, steps, guidance, seed], outputs=out)
+demo.launch()