Latent audio lipsync

#1
by Monarch-1 - opened
README.md CHANGED
@@ -4,10 +4,9 @@ emoji: 👄
 colorFrom: blue
 colorTo: blue
 sdk: gradio
-sdk_version: 6.13.0
+sdk_version: 5.12.0
 app_file: app.py
 pinned: false
-disable_embedding: true
 short_description: Audio Conditioned LipSync with Latent Diffusion Models
 ---

app.py CHANGED
@@ -1,95 +1,73 @@
-def _patch_asyncio_event_loop_del():
-    """
-    Patch a noisy asyncio teardown issue sometimes seen in Spaces environments.
-    In some runtime/container combinations, Python may try to close an already
-    invalid file descriptor when the event loop is garbage-collected. We silence
-    only that specific harmless case.
-    """
-    try:
-        import asyncio.base_events as base_events
-
-        original_del = getattr(base_events.BaseEventLoop, "__del__", None)
-        if original_del is None:
-            return
-
-        def patched_del(self):
-            try:
-                original_del(self)
-            except ValueError as e:
-                if "Invalid file descriptor" not in str(e):
-                    raise
-
-        base_events.BaseEventLoop.__del__ = patched_del
-    except Exception:
-        pass
-
-
-_patch_asyncio_event_loop_del()
-
-
 import gradio as gr
-import spaces
 import os
 import sys
 import shutil
 import uuid
 import subprocess
 from glob import glob
-
 from huggingface_hub import snapshot_download
 
+# Download models
 os.makedirs("checkpoints", exist_ok=True)
+
 snapshot_download(
-    repo_id="ByteDance/LatentSync",
-    local_dir="./checkpoints",
+    repo_id = "chunyu-li/LatentSync",
+    local_dir = "./checkpoints"
 )
 
 import tempfile
 from moviepy.editor import VideoFileClip
 from pydub import AudioSegment
 
-
 def process_video(input_video_path, temp_dir="temp_dir"):
     """
     Crop a given MP4 video to a maximum duration of 10 seconds if it is longer than 10 seconds.
-
+    Save the new video in the specified folder (default is temp_dir).
+
     Args:
         input_video_path (str): Path to the input video file.
        temp_dir (str): Directory where the processed video will be saved.
-
+
     Returns:
        str: Path to the cropped video file.
     """
+    # Ensure the temp_dir exists
     os.makedirs(temp_dir, exist_ok=True)
-
+
+    # Load the video
     video = VideoFileClip(input_video_path)
-
+
+    # Determine the output path
     input_file_name = os.path.basename(input_video_path)
     output_video_path = os.path.join(temp_dir, f"cropped_{input_file_name}")
-
+
+    # Crop the video to 10 seconds if necessary
     if video.duration > 10:
         video = video.subclip(0, 10)
-
+
+    # Write the cropped video to the output path
     video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
-
+
+    # Return the path to the cropped video
     return output_video_path
 
-
 def process_audio(file_path, temp_dir):
+    # Load the audio file
     audio = AudioSegment.from_file(file_path)
-
-    max_duration = 8 * 1000
-
+
+    # Check and cut the audio if longer than 4 seconds
+    max_duration = 8 * 1000  # 4 seconds in milliseconds
     if len(audio) > max_duration:
        audio = audio[:max_duration]
-
+
+    # Save the processed audio in the temporary directory
     output_path = os.path.join(temp_dir, "trimmed_audio.wav")
     audio.export(output_path, format="wav")
-
+
+    # Return the path to the trimmed file
     print(f"Processed audio saved at: {output_path}")
     return output_path
 
-
 import argparse
 from omegaconf import OmegaConf
 import torch
@@ -101,54 +79,26 @@ from accelerate.utils import set_seed
 from latentsync.whisper.audio2feature import Audio2Feature
 
 
-@spaces.GPU(duration=180)
-def generate_lip_sync_video(
-    input_video_path: str,
-    input_audio_path: str,
-    progress=gr.Progress(track_tqdm=True),
-) -> str:
-    """
-    Generate a lip-synced video from an input video and a separate audio track.
-
-    Use this tool when you need to synchronize a visible speaker's mouth movement to match a provided audio file.
-
-    Args:
-        input_video_path (str): File path to the input MP4 video containing the visible speaker.
-        input_audio_path (str): File path to the input audio file used to drive lip synchronization.
-
-    Returns:
-        str: File path to the generated lip-synced MP4 video.
-
-    Raises:
-        NotImplementedError: Raised when the model cross-attention dimension is unsupported.
-
-    Important:
-        Input video is cropped to 10 seconds and input audio is trimmed to 8 seconds before generation.
-    """
-    gr.Info("180 seconds will be used from your daily ZeroGPU time credits.")
-
+def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     inference_ckpt_path = "checkpoints/latentsync_unet.pt"
     unet_config_path = "configs/unet/second_stage.yaml"
-
     config = OmegaConf.load(unet_config_path)
-
-    print(f"Input video path: {input_video_path}")
-    print(f"Input audio path: {input_audio_path}")
+
+    print(f"Input video path: {video_path}")
+    print(f"Input audio path: {audio_path}")
     print(f"Loaded checkpoint path: {inference_ckpt_path}")
 
-    is_shared_ui = True if "fffiloni/LatentSync" in os.environ["SPACE_ID"] else False
-
+    is_shared_ui = True if "fffiloni/LatentSync" in os.environ['SPACE_ID'] else False
     temp_dir = None
     if is_shared_ui:
         temp_dir = tempfile.mkdtemp()
+        cropped_video_path = process_video(video_path)
+        print(f"Cropped video saved to: {cropped_video_path}")
+        video_path=cropped_video_path
 
-        cropped_video_path = process_video(input_video_path)
-        print(f"Cropped video saved to: {cropped_video_path}")
-        input_video_path = cropped_video_path
-
-        trimmed_audio_path = process_audio(input_audio_path, temp_dir)
-        print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
-        input_audio_path = trimmed_audio_path
+        trimmed_audio_path = process_audio(audio_path, temp_dir)
+        print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
+        audio_path=trimmed_audio_path
 
     scheduler = DDIMScheduler.from_pretrained("configs")
 
@@ -159,31 +109,23 @@ def generate_lip_sync_video(
     else:
         raise NotImplementedError("cross_attention_dim must be 768 or 384")
 
-    audio_encoder = Audio2Feature(
-        model_path=whisper_model_path,
-        device="cuda",
-        num_frames=config.data.num_frames,
-    )
+    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
 
-    vae = AutoencoderKL.from_pretrained(
-        "stabilityai/sd-vae-ft-mse",
-        torch_dtype=torch.float16,
-    )
+    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
     vae.config.scaling_factor = 0.18215
     vae.config.shift_factor = 0
 
     unet, _ = UNet3DConditionModel.from_pretrained(
         OmegaConf.to_container(config.model),
-        inference_ckpt_path,
+        inference_ckpt_path,  # load checkpoint
         device="cpu",
     )
+
     unet = unet.to(dtype=torch.float16)
 
-    """
     # set xformers
     if is_xformers_available():
         unet.enable_xformers_memory_efficient_attention()
-    """
 
     pipeline = LipsyncPipeline(
         vae=vae,
@@ -204,8 +146,8 @@ def generate_lip_sync_video(
     video_out_path = f"video_out{unique_id}.mp4"
 
     pipeline(
-        video_path=input_video_path,
-        audio_path=input_audio_path,
+        video_path=video_path,
+        audio_path=audio_path,
         video_out_path=video_out_path,
         video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
         num_frames=config.data.num_frames,
@@ -217,6 +159,7 @@ def generate_lip_sync_video(
     )
 
     if is_shared_ui:
+        # Clean up the temporary directory
        if os.path.exists(temp_dir):
             shutil.rmtree(temp_dir)
             print(f"Temporary directory {temp_dir} deleted.")
@@ -224,21 +167,16 @@ with gr.Blocks() as demo:
     return video_out_path
 
 
-css = """
+css="""
 div#col-container{
     margin: 0 auto;
     max-width: 982px;
 }
 """
-
-with gr.Blocks() as demo:
+with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("# LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync")
-        gr.Markdown(
-            "LatentSync, an end-to-end lip sync framework based on audio conditioned latent diffusion models "
-            "without any intermediate motion representation, diverging from previous diffusion-based lip sync "
-            "methods based on pixel space diffusion or two-stage generation."
-        )
+        gr.Markdown("LatentSync, an end-to-end lip sync framework based on audio conditioned latent diffusion models without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel space diffusion or two-stage generation.")
         gr.HTML("""
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/bytedance/LatentSync">
@@ -247,43 +185,35 @@ with gr.Blocks() as demo:
             <a href="https://arxiv.org/abs/2412.09262">
                 <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
             </a>
-            <a href="https://huggingface.co/ByteDance/LatentSync">
-                <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-yellow'>
+            <a href="https://huggingface.co/spaces/fffiloni/LatentSync?duplicate=true">
+                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
             </a>
-            <a href="https://github.com/bytedance/LatentSync/blob/main/LICENSE">
-                <img src='https://img.shields.io/badge/License-Apache%202.0-green'>
+            <a href="https://huggingface.co/fffiloni">
+                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
             </a>
         </div>
         """)
-
         with gr.Row():
             with gr.Column():
                 video_input = gr.Video(label="Video Control", format="mp4")
                 audio_input = gr.Audio(label="Audio Input", type="filepath")
                 submit_btn = gr.Button("Submit")
-
             with gr.Column():
                 video_result = gr.Video(label="Result")
 
-        gr.Examples(
-            examples=[
-                ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
-                ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
-                ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
-            ],
-            inputs=[video_input, audio_input],
-        )
-
-        submit_btn.click(
-            fn=generate_lip_sync_video,
-            inputs=[video_input, audio_input],
-            outputs=[video_result],
-            api_visibility="public",
-        )
+        gr.Examples(
+            examples = [
+                ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
+                ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
+                ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
+            ],
+            inputs = [video_input, audio_input]
+        )
+
+        submit_btn.click(
+            fn = main,
+            inputs = [video_input, audio_input],
+            outputs = [video_result]
+        )
 
-demo.queue().launch(
-    css=css,
-    show_error=True,
-    ssr_mode=False,
-    mcp_server=True,
-)
+demo.queue().launch(show_api=False, show_error=True)
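A note on the trimming helper added above: in `process_audio`, `max_duration = 8 * 1000` is 8 seconds, since pydub measures `AudioSegment` length in milliseconds, so the "4 seconds" comments understate the actual cap; `process_video` separately limits clips to 10 seconds. A minimal standalone sketch of the same pydub trimming step with the duration spelled out (the helper name and paths here are illustrative, not part of app.py):

```python
from pydub import AudioSegment

MAX_AUDIO_MS = 8 * 1000  # 8 seconds; pydub slices AudioSegments by milliseconds

def trim_audio(src_path: str, dst_path: str) -> str:
    """Trim an audio file to at most 8 seconds and export it as WAV."""
    audio = AudioSegment.from_file(src_path)
    if len(audio) > MAX_AUDIO_MS:      # len() of an AudioSegment is its duration in ms
        audio = audio[:MAX_AUDIO_MS]   # keep the first 8 seconds
    audio.export(dst_path, format="wav")
    return dst_path

# Example with hypothetical paths:
# trim_audio("assets/demo1_audio.wav", "temp_dir/trimmed_audio.wav")
```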
latentsync/models/attention.py CHANGED
@@ -9,10 +9,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import Attention as CrossAttention, FeedForward, AdaLayerNorm
+from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
 
 from einops import rearrange, repeat
 from .utils import zero_module
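Both import styles in this hunk belong to different diffusers releases: the `-` lines match newer versions (where `ModelMixin` lives under `diffusers.models.modeling_utils` and `CrossAttention` was renamed to `Attention`), while the `+` lines match the older `diffusers==0.11.1` pinned in this same change. If one wanted to tolerate either layout instead of hard-pinning, a try/except fallback along these lines would work — a sketch, not part of this change:

```python
# Version-tolerant import, assuming only the two layouts shown in the diff above.
try:
    # Newer diffusers: nested module path, CrossAttention renamed to Attention.
    from diffusers.models.modeling_utils import ModelMixin
    from diffusers.models.attention import Attention as CrossAttention, FeedForward, AdaLayerNorm
except ImportError:
    # Older diffusers (e.g. 0.11.1): flat module path, original CrossAttention name.
    from diffusers.modeling_utils import ModelMixin
    from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
```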
latentsync/models/motion_module.py CHANGED
@@ -11,10 +11,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import Attention as CrossAttention, FeedForward
+from diffusers.models.attention import CrossAttention, FeedForward
 
 from einops import rearrange, repeat
 import math
latentsync/models/unet.py CHANGED
@@ -9,7 +9,7 @@ import torch.nn as nn
 import torch.utils.checkpoint
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers import UNet2DConditionModel
 from diffusers.utils import BaseOutput, logging
 from diffusers.models.embeddings import TimestepEmbedding, Timesteps
latentsync/pipelines/lipsync_pipeline.py CHANGED
@@ -15,7 +15,7 @@ from packaging import version
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
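The same version split applies here: newer diffusers exposes `DiffusionPipeline` from `diffusers.pipelines.pipeline_utils`, while the pinned 0.11.1 still uses the flat `diffusers.pipeline_utils` module. A hedged fallback import, again a sketch rather than part of this change:

```python
try:
    from diffusers.pipelines.pipeline_utils import DiffusionPipeline  # newer diffusers
except ImportError:
    from diffusers.pipeline_utils import DiffusionPipeline  # older diffusers, e.g. 0.11.1
```

In most releases the top-level `from diffusers import DiffusionPipeline` also resolves, which sidesteps the module move entirely.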
requirements.txt CHANGED
@@ -1,21 +1,21 @@
-torch==2.5.1
-torchvision==0.20.1
+torch==2.2.2
+torchvision==0.17.2
 --extra-index-url https://download.pytorch.org/whl/cu121
-xformers==0.0.29.post1
-triton==3.1.0
+xformers==0.0.26
+triton==2.2.0
 
-diffusers==0.33.1
-transformers==4.52.3
-huggingface-hub<1.0
+diffusers==0.11.1
+transformers==4.38.0
+huggingface-hub==0.25.2
 imageio==2.27.0
 decord==0.6.0
 accelerate==0.26.1
 einops==0.7.0
 omegaconf==2.3.0
-safetensors>=0.4.3
+safetensors==0.4.2
 opencv-python==4.9.0.80
 mediapipe==0.10.11
-av
+av==11.0.0
 torch-fidelity==0.3.0
 torchmetrics==1.3.1
 python_speech_features==0.6
@@ -27,8 +27,5 @@ face-alignment==1.4.1
 ninja==1.11.1.1
 pandas==2.0.3
 numpy==1.24.4
-pydub==0.25.1
-moviepy==1.0.3
-hf-xet==1.1.8
-spaces
-gradio[mcp]
+pydub
+moviepy==1.0.3
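Since the two sides of this diff pin very different torch/diffusers stacks, a quick runtime check of what is actually installed in the Space can save a debugging round-trip. A small hedged snippet (not part of the change) using the standard-library metadata API:

```python
from importlib.metadata import version, PackageNotFoundError

# Packages whose pins differ between the two sides of this diff.
for pkg in ["torch", "torchvision", "xformers", "diffusers", "transformers", "huggingface-hub"]:
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")
```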