Spaces:

banao-tech
/

model-testing

Build error

App Files Files Community

banao-tech committed 28 days ago

Commit

eb63d21

verified ·

1 Parent(s): b606086

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -45

app.py CHANGED Viewed

@@ -5,11 +5,14 @@ from pathlib import Path
 from datetime import datetime
 import gradio as gr
 from huggingface_hub import snapshot_download
 ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "LatentSync"
 CKPT_DIR = REPO_DIR / "checkpoints"
 TEMP_DIR = REPO_DIR / "temp"
 # Use 1.5 on T4 16GB
 HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
@@ -18,15 +21,52 @@ def run(cmd, cwd=None):
     print(" ".join(map(str, cmd)))
     subprocess.check_call(cmd, cwd=cwd)
 def setup():
-    # Clone LatentSync repo at runtime (won't appear in HF Files tab)
     if not REPO_DIR.exists():
         run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
     CKPT_DIR.mkdir(parents=True, exist_ok=True)
     TEMP_DIR.mkdir(parents=True, exist_ok=True)
-    # Download all checkpoint files (includes latentsync_unet + whisper tiny/small etc)
     snapshot_download(
         repo_id=HF_CKPT_REPO,
         local_dir=str(CKPT_DIR),
@@ -55,53 +95,127 @@ def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
     return str(out_path)
 def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
-    setup()
-    img_path = str(Path(avatar_img).resolve())
-    wav_path = str(Path(audio_wav).resolve())
-    # Make a temp mp4 from the single image + audio
-    video_path = make_still_video(img_path, wav_path, fps=25)
-    out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
-    # FIXED: Use correct config path - configs/unet/stage2.yaml instead of configs/unet.yaml
-    cmd = [
-        "python", "-m", "scripts.inference",
-        "--unet_config_path", "configs/unet/stage2.yaml",  # ← FIXED PATH
-        "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
-        "--video_path", video_path,
-        "--audio_path", wav_path,
-        "--video_out_path", str(out_path),
-        "--inference_steps", str(int(steps)),
-        "--guidance_scale", str(float(guidance)),
-        "--seed", str(int(seed)),
-        "--temp_dir", "temp",
-    ]
-    if use_deepcache:
-        cmd.append("--enable_deepcache")
-    run(cmd, cwd=str(REPO_DIR))
-    return str(out_path)
-with gr.Blocks(title="LatentSync (avatar.jpg + audio.wav → lip-sync mp4)") as demo:
-    gr.Markdown("## LatentSync 1.5 on Hugging Face (T4) — Upload avatar + audio → mp4")
     with gr.Row():
-        avatar = gr.Image(type="filepath", label="Avatar image (jpg/png)")
-        audio = gr.Audio(type="filepath", label="Audio (wav)", format="wav")
-    with gr.Row():
-        steps = gr.Slider(10, 40, value=20, step=1, label="Inference Steps")
-        guidance = gr.Slider(0.8, 2.0, value=1.0, step=0.1, label="Guidance Scale")
-        seed = gr.Number(value=1247, precision=0, label="Seed")
-        deepcache = gr.Checkbox(value=True, label="Enable DeepCache (faster)")
-    btn = gr.Button("Generate")
-    out = gr.Video(label="Output video")
-    btn.click(generate, inputs=[avatar, audio, steps, guidance, seed, deepcache], outputs=out)
-demo.launch()

 from datetime import datetime
 import gradio as gr
 from huggingface_hub import snapshot_download
+import numpy as np
+from PIL import Image
+# Directory containing this file; every other path below is derived from it
 ROOT = Path(__file__).parent.resolve()
+# Location into which setup() clones the LatentSync repository
 REPO_DIR = ROOT / "LatentSync"
+# Local target directory for the checkpoint snapshot download
 CKPT_DIR = REPO_DIR / "checkpoints"
+# Scratch directory created by setup(); generate() writes its result mp4 here
 TEMP_DIR = REPO_DIR / "temp"
+# Directory in which create_mask_image() writes mask.png
+MASK_DIR = REPO_DIR / "latentsync" / "utils"
 # Use 1.5 on T4 16GB
+# Hugging Face repo id passed to snapshot_download()
 HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
     print(" ".join(map(str, cmd)))
     subprocess.check_call(cmd, cwd=cwd)
+def create_mask_image():
+    """Write ``mask.png`` into ``MASK_DIR`` unless the file already exists.
+
+    The image is a 256x256 8-bit grayscale mask. Pixels inside an ellipse
+    centred at (width // 2, 70% of the height), with radii of 35% of the
+    width and 25% of the height, are white (255); all others are black (0).
+    An existing ``mask.png`` is never overwritten.
+    """
+    mask_path = MASK_DIR / "mask.png"
+    if mask_path.exists():
+        return  # Mask already exists
+    # Create the utils directory if it doesn't exist
+    MASK_DIR.mkdir(parents=True, exist_ok=True)
+    # NOTE(review): the original author intended white = region to inpaint
+    # (mouth) and black = region to keep unchanged; confirm this polarity
+    # against LatentSync's own mask loader before relying on it.
+    height, width = 256, 256
+    center_x, center_y = width // 2, int(height * 0.7)
+    radius_x, radius_y = int(width * 0.35), int(height * 0.25)
+    # Evaluate the ellipse test ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1 for the
+    # whole grid in one vectorised expression instead of a per-pixel loop.
+    rows, cols = np.ogrid[:height, :width]
+    inside = ((cols - center_x) / radius_x) ** 2 + ((rows - center_y) / radius_y) ** 2 <= 1
+    mask = np.zeros((height, width), dtype=np.uint8)
+    mask[inside] = 255
+    # A 2-D uint8 array is stored as 8-bit grayscale without an explicit mode
+    Image.fromarray(mask).save(str(mask_path))
+    print(f"Created mask image at {mask_path}")
 def setup():
+    # Clone LatentSync repo at runtime
     if not REPO_DIR.exists():
         run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
     CKPT_DIR.mkdir(parents=True, exist_ok=True)
     TEMP_DIR.mkdir(parents=True, exist_ok=True)
+    # Create the missing mask.png file
+    create_mask_image()
+    # Download all checkpoint files
     snapshot_download(
         repo_id=HF_CKPT_REPO,
         local_dir=str(CKPT_DIR),
     return str(out_path)
 def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
+    """Produce a lip-synced mp4 from a still avatar image and an audio clip.
+
+    Args:
+        avatar_img: Path of the uploaded image, or None when nothing was uploaded.
+        audio_wav: Path of the uploaded audio, or None when nothing was uploaded.
+        steps: Number of inference steps (converted with ``int``).
+        guidance: Guidance scale (converted with ``float``).
+        seed: Random seed (converted with ``int``).
+        use_deepcache: When truthy, ``--enable_deepcache`` is added to the command.
+
+    Returns:
+        A ``(video_path, status)`` tuple. ``video_path`` is None on any
+        failure; ``status`` is always a message meant for the user.
+    """
+    # Validate the uploads first so a missing file is reported immediately,
+    # without cloning the repository or downloading checkpoints.
+    if avatar_img is None:
+        return None, "Please upload an avatar image!"
+    if audio_wav is None:
+        return None, "Please upload an audio file!"
+    try:
+        setup()
+        img_path = str(Path(avatar_img).resolve())
+        wav_path = str(Path(audio_wav).resolve())
+        # Make a temp mp4 from the single image + audio
+        video_path = make_still_video(img_path, wav_path, fps=25)
+        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
+        # Config and checkpoint paths are relative to REPO_DIR, the
+        # working directory the command is run from.
+        cmd = [
+            "python", "-m", "scripts.inference",
+            "--unet_config_path", "configs/unet/stage2.yaml",
+            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
+            "--video_path", video_path,
+            "--audio_path", wav_path,
+            "--video_out_path", str(out_path),
+            "--inference_steps", str(int(steps)),
+            "--guidance_scale", str(float(guidance)),
+            "--seed", str(int(seed)),
+            "--temp_dir", "temp",
+        ]
+        if use_deepcache:
+            cmd.append("--enable_deepcache")
+        run(cmd, cwd=str(REPO_DIR))
+        # A zero exit code alone does not prove the output file was written
+        if out_path.exists():
+            return str(out_path), "Video generated successfully!"
+        return None, "Video generation failed - output file not created"
+    except subprocess.CalledProcessError as e:
+        return None, f"Command failed with return code {e.returncode}"
+    except Exception as e:
+        # UI boundary: report the failure to the user instead of crashing,
+        # but keep the full traceback in the logs for debugging.
+        import traceback
+        traceback.print_exc()
+        return None, f"Error: {str(e)}"
+# Gradio Interface
+# NOTE: gr.Image and gr.Audio do not accept an `info` keyword (it exists only
+# on form components such as Slider, Number and Checkbox); passing it raises
+# TypeError while the UI is being built. Their hints are therefore rendered
+# as Markdown captions next to the components.
+with gr.Blocks(title="LatentSync - Lip Sync Generator", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🎬 LatentSync 1.5 - AI Lip Sync Generator
+        Upload an avatar image and audio file to generate a lip-synced video!
+        **Tips:**
+        - Use clear frontal face images for best results
+        - Keep audio under 30 seconds for faster processing
+        - Higher inference steps = better quality but slower
+        """
+    )
     with gr.Row():
+        # Left column: the two uploads, each passed to generate() as a file path
+        with gr.Column():
+            avatar = gr.Image(
+                type="filepath",
+                label="📷 Avatar Image",
+            )
+            gr.Markdown("Upload a clear frontal face photo (JPG/PNG)")
+            audio = gr.Audio(
+                type="filepath",
+                label="🎵 Audio File",
+                format="wav",
+            )
+            gr.Markdown("Upload your audio (WAV format recommended)")
+        # Right column: generation parameters forwarded to the inference command
+        with gr.Column():
+            with gr.Group():
+                gr.Markdown("### ⚙️ Generation Settings")
+                steps = gr.Slider(
+                    10, 40, value=20, step=1,
+                    label="Inference Steps",
+                    info="Higher = better quality, slower"
+                )
+                guidance = gr.Slider(
+                    0.8, 2.0, value=1.0, step=0.1,
+                    label="Guidance Scale",
+                    info="Higher = better lip sync, may distort"
+                )
+                seed = gr.Number(
+                    value=1247, precision=0,
+                    label="Seed",
+                    info="For reproducible results"
+                )
+                deepcache = gr.Checkbox(
+                    value=True,
+                    label="Enable DeepCache (Faster)",
+                    info="Recommended for T4 GPU"
+                )
+    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary", size="lg")
+    status = gr.Textbox(label="Status", interactive=False)
+    out = gr.Video(label="Generated Video")
+    # generate() returns (video_path, status_message), matching the outputs order
+    btn.click(
+        generate,
+        inputs=[avatar, audio, steps, guidance, seed, deepcache],
+        outputs=[out, status]
+    )
+    gr.Markdown(
+        """
+        ---
+        ### 📝 Notes:
+        - First run will download models (~7GB) - this may take a few minutes
+        - Generation takes 30-90 seconds depending on settings
+        - Works best with T4 GPU (16GB)
+        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
+        """
+    )
+if __name__ == "__main__":
+    # Limit the event queue to three entries, then serve on all interfaces, port 7860
+    demo.queue(max_size=3).launch(server_name="0.0.0.0", server_port=7860)