banao-tech committed on
Commit
8b65f54
·
verified ·
1 Parent(s): 85c5a8d

More robust App.py

Browse files
Files changed (1) hide show
  1. app.py +40 -132
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- import sys
3
  import subprocess
4
  from pathlib import Path
5
  from datetime import datetime
@@ -7,68 +6,39 @@ from datetime import datetime
7
  import gradio as gr
8
  from huggingface_hub import snapshot_download
9
 
10
-
11
- # -----------------------------
12
- # Paths
13
- # -----------------------------
14
  ROOT = Path(__file__).parent.resolve()
15
  REPO_DIR = ROOT / "LatentSync"
16
- CHECKPOINTS_DIR = REPO_DIR / "checkpoints"
17
  TEMP_DIR = REPO_DIR / "temp"
 
 
 
18
 
19
- # LatentSync 1.5 checkpoint repo (fits T4 16GB)
20
  HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
21
 
22
- # For LatentSync 1.5, config is typically stage2.yaml (256 resolution)
23
- # (LatentSync has multiple configs; stage2_512.yaml is for 1.6 / 512 training)
24
- CONFIG_REL_PATH = Path("configs/unet/stage2.yaml")
25
- CKPT_REL_PATH = Path("checkpoints/latentsync_unet.pt")
26
-
27
-
28
  def run(cmd, cwd=None):
29
- print("Running:", " ".join(map(str, cmd)))
30
  subprocess.check_call(cmd, cwd=cwd)
31
 
32
-
33
- def ensure_repo():
34
  if not REPO_DIR.exists():
35
  run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
36
 
37
-
38
- def ensure_checkpoints():
39
- CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
40
-
41
- # Download checkpoint + whisper tiny into LatentSync/checkpoints
42
- # HF repo tree includes `latentsync_unet.pt` and `whisper/...`
43
  snapshot_download(
44
  repo_id=HF_CKPT_REPO,
45
- local_dir=str(CHECKPOINTS_DIR),
46
  local_dir_use_symlinks=False,
47
- allow_patterns=[
48
- "latentsync_unet.pt",
49
- "whisper/*",
50
- ],
51
  )
52
 
53
- ckpt = CHECKPOINTS_DIR / "latentsync_unet.pt"
54
- whisper_tiny = CHECKPOINTS_DIR / "whisper" / "tiny.pt"
55
- if not ckpt.exists():
56
- raise FileNotFoundError(f"Missing checkpoint: {ckpt}")
57
- if not whisper_tiny.exists():
58
- raise FileNotFoundError(f"Missing whisper tiny: {whisper_tiny}")
59
-
60
-
61
- def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
62
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
63
- out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
64
 
65
- # Create a video by looping the image and cutting to audio length.
66
- # Also scale/crop to a square size (256) to match stage2.yaml typical setting.
67
- # If you switch to 1.6 later, you'd scale/crop to 512 and use stage2_512.yaml.
68
  cmd = [
69
  "ffmpeg", "-y",
70
- "-loop", "1",
71
- "-i", image_path,
72
  "-i", audio_path,
73
  "-shortest",
74
  "-r", str(fps),
@@ -81,104 +51,42 @@ def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
81
  run(cmd)
82
  return str(out_path)
83
 
 
 
84
 
85
- def latentsync_infer(video_path: str, audio_path: str, inference_steps: int, guidance_scale: float, seed: int) -> str:
86
- # Import LatentSync inference code
87
- sys.path.insert(0, str(REPO_DIR))
88
- os.chdir(str(REPO_DIR))
89
 
90
- from omegaconf import OmegaConf
91
- from scripts.inference import main
92
- import argparse
93
 
94
- config_path = (REPO_DIR / CONFIG_REL_PATH).resolve()
95
- ckpt_path = (REPO_DIR / CKPT_REL_PATH).resolve()
96
 
97
- if not config_path.exists():
98
- raise FileNotFoundError(f"Config not found: {config_path}")
99
- if not ckpt_path.exists():
100
- raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
101
-
102
- TEMP_DIR.mkdir(parents=True, exist_ok=True)
103
- out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
104
-
105
- config = OmegaConf.load(str(config_path))
106
- config["run"].update(
107
- {
108
- "guidance_scale": float(guidance_scale),
109
- "inference_steps": int(inference_steps),
110
- }
111
- )
112
 
113
- parser = argparse.ArgumentParser()
114
- parser.add_argument("--inference_ckpt_path", type=str, required=True)
115
- parser.add_argument("--video_path", type=str, required=True)
116
- parser.add_argument("--audio_path", type=str, required=True)
117
- parser.add_argument("--video_out_path", type=str, required=True)
118
- parser.add_argument("--inference_steps", type=int, default=20)
119
- parser.add_argument("--guidance_scale", type=float, default=1.5)
120
- parser.add_argument("--temp_dir", type=str, default="temp")
121
- parser.add_argument("--seed", type=int, default=1247)
122
- parser.add_argument("--enable_deepcache", action="store_true")
123
-
124
- args = parser.parse_args(
125
- [
126
- "--inference_ckpt_path",
127
- str(ckpt_path),
128
- "--video_path",
129
- str(Path(video_path).resolve()),
130
- "--audio_path",
131
- str(Path(audio_path).resolve()),
132
- "--video_out_path",
133
- str(out_path.resolve()),
134
- "--inference_steps",
135
- str(inference_steps),
136
- "--guidance_scale",
137
- str(guidance_scale),
138
- "--seed",
139
- str(seed),
140
- "--temp_dir",
141
- "temp",
142
- "--enable_deepcache",
143
- ]
144
- )
145
 
146
- main(config=config, args=args)
147
  return str(out_path)
148
 
149
-
150
- def generate(avatar_img, audio_wav, inference_steps, guidance_scale, seed):
151
- ensure_repo()
152
- ensure_checkpoints()
153
-
154
- # avatar_img is a filepath (type="filepath")
155
- # audio_wav is a filepath (type="filepath")
156
- still_video = make_still_video(avatar_img, audio_wav, fps=25)
157
- result = latentsync_infer(still_video, audio_wav, inference_steps, guidance_scale, seed)
158
- return result
159
-
160
-
161
- with gr.Blocks(title="LatentSync (avatar.jpg + audio.wav → lip-sync mp4)") as demo:
162
- gr.Markdown(
163
- """
164
- # LatentSync (HF Space)
165
- Upload **avatar.jpg** + **audio.wav** → get lip-sync **mp4**.
166
- (Uses **LatentSync 1.5** to fit **T4 16GB VRAM**.)
167
- """
168
- )
169
-
170
- with gr.Row():
171
- avatar = gr.Image(label="Avatar Image (jpg/png)", type="filepath")
172
- audio = gr.Audio(label="Audio (wav)", type="filepath")
173
-
174
  with gr.Row():
175
- guidance = gr.Slider(1.0, 3.0, value=1.5, step=0.1, label="Guidance Scale")
176
- steps = gr.Slider(10, 50, value=20, step=1, label="Inference Steps")
177
- seed = gr.Number(value=1247, precision=0, label="Seed")
178
-
179
- btn = gr.Button("Generate Lip-Sync Video")
180
- out = gr.Video(label="Output MP4")
181
-
182
- btn.click(fn=generate, inputs=[avatar, audio, steps, guidance, seed], outputs=out)
183
 
184
  demo.launch()
 
import os
import subprocess
from pathlib import Path
from datetime import datetime

import gradio as gr
from huggingface_hub import snapshot_download

# NOTE(review): `os` appears unreferenced after this commit (os.chdir was
# removed) — kept, since only part of the file may be visible here.

# Filesystem layout: everything lives inside the cloned LatentSync repo.
ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT / "LatentSync"       # git-clone target for the upstream repo
TEMP_DIR = REPO_DIR / "temp"         # intermediate still-image videos
INPUT_DIR = REPO_DIR / "inputs"      # created in setup(); currently unwritten
OUTPUT_DIR = REPO_DIR / "outputs"    # final lip-synced MP4s
CKPT_DIR = REPO_DIR / "checkpoints"  # snapshot_download destination

# LatentSync 1.5 checkpoint repo (fits a 16 GB T4, per the pre-commit comment).
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
 
def run(cmd, cwd=None):
    """Echo *cmd* to stdout and execute it.

    Args:
        cmd: Command and arguments as a sequence; elements are stringified
            for display only — the sequence itself is passed to subprocess.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: if the command exits nonzero.
    """
    printable = " ".join(str(part) for part in cmd)
    print(printable)
    subprocess.check_call(cmd, cwd=cwd)
21
 
22
def setup():
    """Clone the LatentSync repo and fetch model weights; safe to call repeatedly.

    Side effects: clones the GitHub repo into REPO_DIR on first call, downloads
    the checkpoint snapshot into CKPT_DIR, and creates the working directories.
    """
    # Clone only once; subsequent calls find the directory and skip this.
    if not REPO_DIR.exists():
        run([
            "git", "clone", "--depth", "1",
            "https://github.com/bytedance/LatentSync.git",
            str(REPO_DIR),
        ])

    CKPT_DIR.mkdir(parents=True, exist_ok=True)

    # Downloads the whole checkpoint repo (no allow_patterns). snapshot_download
    # reuses already-downloaded files, so repeat calls are cheap.
    # NOTE(review): local_dir_use_symlinks is deprecated in newer
    # huggingface_hub releases (ignored with a warning) — kept for
    # compatibility with whatever version the Space has pinned.
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        local_dir_use_symlinks=False,
    )

    # Working directories used by make_still_video() / generate().
    for directory in (INPUT_DIR, OUTPUT_DIR, TEMP_DIR):
        directory.mkdir(parents=True, exist_ok=True)
 
36
 
37
def make_still_video(img_path: str, audio_path: str, fps: int = 25) -> str:
    """Render an MP4 that loops a still image for the duration of the audio.

    Args:
        img_path: Path to the source image (jpg/png).
        audio_path: Path to the audio track (wav).
        fps: Output frame rate.

    Returns:
        Path of the generated MP4 as a string (timestamped, inside TEMP_DIR).

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails.
    """
    out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

    # -loop 1 repeats the single input image; -shortest trims the video to the
    # audio length.
    # NOTE(review): the middle of this argument list (filter/codec options)
    # was elided in the source dump; the pre-commit version documented scaling
    # to a 256 square for stage2.yaml, so that intent is reconstructed here —
    # confirm against the deployed file. yuv420p is required for broad player
    # compatibility with libx264.
    cmd = [
        "ffmpeg", "-y",
        "-loop", "1", "-i", img_path,
        "-i", audio_path,
        "-shortest",
        "-r", str(fps),
        "-vf", "scale=256:256",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        "-c:a", "aac",
        str(out_path),
    ]
    run(cmd)
    return str(out_path)
53
 
54
def generate(avatar_img, audio_wav):
    """Gradio callback: turn an avatar image + audio file into a lip-synced MP4.

    Args:
        avatar_img: Filepath of the uploaded image (gr.Image type="filepath").
        audio_wav: Filepath of the uploaded audio (gr.Audio type="filepath").

    Returns:
        Path of the result MP4 as a string (inside OUTPUT_DIR).

    Raises:
        FileNotFoundError: if the expected inference entry point is missing.
        subprocess.CalledProcessError: if ffmpeg or inference fails.
    """
    setup()

    img_path = str(Path(avatar_img).resolve())
    wav_path = str(Path(audio_wav).resolve())

    # Pre-render a still video from image + audio.
    # NOTE(review): this result is currently unused below — predict.py is fed
    # the raw image. Kept so a video-based inference script can be swapped in
    # without reworking this function.
    still_video = make_still_video(img_path, wav_path, fps=25)

    out_path = OUTPUT_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

    # Fail fast with a clear message instead of an opaque subprocess error:
    # the original in-code NOTE admits LatentSync builds ship different entry
    # points (gradio_app.py / predict.py / scripts/inference.py).
    script = REPO_DIR / "predict.py"
    if not script.exists():
        raise FileNotFoundError(
            f"Inference entry point not found: {script}. "
            "This LatentSync checkout may use a different script "
            "(e.g. scripts/inference.py); a still video was prepared at "
            f"{still_video} for a video-based fallback."
        )

    cmd = [
        "python", "predict.py",
        "--image_path", img_path,
        "--audio_path", wav_path,
        "--output_path", str(out_path),
    ]
    run(cmd, cwd=str(REPO_DIR))

    return str(out_path)
82
 
83
# Minimal UI: one row with the two inputs, then a button and a video output.
with gr.Blocks() as demo:
    # NOTE(review): this title string looks garbled in the source dump
    # ("audio.wav mp4" — an arrow glyph may have been lost in extraction);
    # left byte-identical here, confirm against the deployed file.
    gr.Markdown("# LatentSync (avatar.jpg + audio.wav mp4)")
    with gr.Row():
        avatar = gr.Image(type="filepath", label="avatar.jpg/png")
        audio = gr.Audio(type="filepath", label="audio.wav", format="wav")
    btn = gr.Button("Generate")
    out = gr.Video(label="Output")
    # generate() blocks until inference finishes; Gradio then serves the file.
    btn.click(generate, inputs=[avatar, audio], outputs=out)

demo.launch()