Latent audio lipsync

#1
by Monarch-1 - opened
README.md CHANGED
@@ -4,10 +4,9 @@ emoji: 👄
4
  colorFrom: blue
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.13.0
8
  app_file: app.py
9
  pinned: false
10
- disable_embedding: true
11
  short_description: Audio Conditioned LipSync with Latent Diffusion Models
12
  ---
13
 
 
4
  colorFrom: blue
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.12.0
8
  app_file: app.py
9
  pinned: false
 
10
  short_description: Audio Conditioned LipSync with Latent Diffusion Models
11
  ---
12
 
app.py CHANGED
@@ -1,33 +1,4 @@
1
- def _patch_asyncio_event_loop_del():
2
- """
3
- Patch a noisy asyncio teardown issue sometimes seen in Spaces environments.
4
- In some runtime/container combinations, Python may try to close an already
5
- invalid file descriptor when the event loop is garbage-collected. We silence
6
- only that specific harmless case.
7
- """
8
- try:
9
- import asyncio.base_events as base_events
10
-
11
- original_del = getattr(base_events.BaseEventLoop, "__del__", None)
12
- if original_del is None:
13
- return
14
-
15
- def patched_del(self):
16
- try:
17
- original_del(self)
18
- except ValueError as e:
19
- if "Invalid file descriptor" not in str(e):
20
- raise
21
-
22
- base_events.BaseEventLoop.__del__ = patched_del
23
- except Exception:
24
- pass
25
-
26
-
27
- _patch_asyncio_event_loop_del()
28
-
29
  import gradio as gr
30
- import spaces
31
  import os
32
  import sys
33
  import shutil
@@ -40,7 +11,7 @@ from huggingface_hub import snapshot_download
40
  os.makedirs("checkpoints", exist_ok=True)
41
 
42
  snapshot_download(
43
- repo_id = "ByteDance/LatentSync",
44
  local_dir = "./checkpoints"
45
  )
46
 
@@ -108,26 +79,7 @@ from accelerate.utils import set_seed
108
  from latentsync.whisper.audio2feature import Audio2Feature
109
 
110
 
111
- @spaces.GPU(duration=180)
112
  def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
113
- """
114
- Perform lip-sync video generation using an input video and a separate audio track.
115
-
116
- This function takes an input video (usually a person speaking) and an audio file,
117
- and synchronizes the video frames so that the lips of the speaker match the audio content.
118
- It uses a latent diffusion model-based pipeline (LatentSync) for audio-conditioned lip synchronization.
119
-
120
- Args:
121
- video_path (str): File path to the input video in MP4 format.
122
- audio_path (str): File path to the input audio file (e.g., WAV or MP3).
123
- progress (gr.Progress, optional): Gradio progress tracker for UI feedback (auto-injected).
124
-
125
- Returns:
126
- str: File path to the generated output video with lip synchronization applied.
127
- """
128
-
129
- gr.Info("180 seconds will be used from your daily ZeroGPU time credits.")
130
-
131
  inference_ckpt_path = "checkpoints/latentsync_unet.pt"
132
  unet_config_path = "configs/unet/second_stage.yaml"
133
  config = OmegaConf.load(unet_config_path)
@@ -171,12 +123,9 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
171
 
172
  unet = unet.to(dtype=torch.float16)
173
 
174
- """
175
  # set xformers
176
-
177
  if is_xformers_available():
178
  unet.enable_xformers_memory_efficient_attention()
179
- """
180
 
181
  pipeline = LipsyncPipeline(
182
  vae=vae,
@@ -267,4 +216,4 @@ with gr.Blocks(css=css) as demo:
267
  outputs = [video_result]
268
  )
269
 
270
- demo.queue().launch(show_error=True, ssr_mode=False, mcp_server=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  import os
3
  import sys
4
  import shutil
 
11
  os.makedirs("checkpoints", exist_ok=True)
12
 
13
  snapshot_download(
14
+ repo_id = "chunyu-li/LatentSync",
15
  local_dir = "./checkpoints"
16
  )
17
 
 
79
  from latentsync.whisper.audio2feature import Audio2Feature
80
 
81
 
 
82
  def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  inference_ckpt_path = "checkpoints/latentsync_unet.pt"
84
  unet_config_path = "configs/unet/second_stage.yaml"
85
  config = OmegaConf.load(unet_config_path)
 
123
 
124
  unet = unet.to(dtype=torch.float16)
125
 
 
126
  # set xformers
 
127
  if is_xformers_available():
128
  unet.enable_xformers_memory_efficient_attention()
 
129
 
130
  pipeline = LipsyncPipeline(
131
  vae=vae,
 
216
  outputs = [video_result]
217
  )
218
 
219
+ demo.queue().launch(show_api=False, show_error=True)
latentsync/models/attention.py CHANGED
@@ -9,10 +9,10 @@ import torch.nn.functional as F
9
  from torch import nn
10
 
11
  from diffusers.configuration_utils import ConfigMixin, register_to_config
12
- from diffusers.models.modeling_utils import ModelMixin
13
  from diffusers.utils import BaseOutput
14
  from diffusers.utils.import_utils import is_xformers_available
15
- from diffusers.models.attention import Attention as CrossAttention, FeedForward, AdaLayerNorm
16
 
17
  from einops import rearrange, repeat
18
  from .utils import zero_module
 
9
  from torch import nn
10
 
11
  from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.modeling_utils import ModelMixin
13
  from diffusers.utils import BaseOutput
14
  from diffusers.utils.import_utils import is_xformers_available
15
+ from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
16
 
17
  from einops import rearrange, repeat
18
  from .utils import zero_module
latentsync/models/motion_module.py CHANGED
@@ -11,10 +11,10 @@ import torch.nn.functional as F
11
  from torch import nn
12
 
13
  from diffusers.configuration_utils import ConfigMixin, register_to_config
14
- from diffusers.models.modeling_utils import ModelMixin
15
  from diffusers.utils import BaseOutput
16
  from diffusers.utils.import_utils import is_xformers_available
17
- from diffusers.models.attention import Attention as CrossAttention, FeedForward
18
 
19
  from einops import rearrange, repeat
20
  import math
 
11
  from torch import nn
12
 
13
  from diffusers.configuration_utils import ConfigMixin, register_to_config
14
+ from diffusers.modeling_utils import ModelMixin
15
  from diffusers.utils import BaseOutput
16
  from diffusers.utils.import_utils import is_xformers_available
17
+ from diffusers.models.attention import CrossAttention, FeedForward
18
 
19
  from einops import rearrange, repeat
20
  import math
latentsync/models/unet.py CHANGED
@@ -9,7 +9,7 @@ import torch.nn as nn
9
  import torch.utils.checkpoint
10
 
11
  from diffusers.configuration_utils import ConfigMixin, register_to_config
12
- from diffusers.models.modeling_utils import ModelMixin
13
  from diffusers import UNet2DConditionModel
14
  from diffusers.utils import BaseOutput, logging
15
  from diffusers.models.embeddings import TimestepEmbedding, Timesteps
 
9
  import torch.utils.checkpoint
10
 
11
  from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.modeling_utils import ModelMixin
13
  from diffusers import UNet2DConditionModel
14
  from diffusers.utils import BaseOutput, logging
15
  from diffusers.models.embeddings import TimestepEmbedding, Timesteps
latentsync/pipelines/lipsync_pipeline.py CHANGED
@@ -15,7 +15,7 @@ from packaging import version
15
 
16
  from diffusers.configuration_utils import FrozenDict
17
  from diffusers.models import AutoencoderKL
18
- from diffusers.pipelines.pipeline_utils import DiffusionPipeline
19
  from diffusers.schedulers import (
20
  DDIMScheduler,
21
  DPMSolverMultistepScheduler,
 
15
 
16
  from diffusers.configuration_utils import FrozenDict
17
  from diffusers.models import AutoencoderKL
18
+ from diffusers.pipeline_utils import DiffusionPipeline
19
  from diffusers.schedulers import (
20
  DDIMScheduler,
21
  DPMSolverMultistepScheduler,
requirements.txt CHANGED
@@ -1,21 +1,21 @@
1
- torch==2.5.1
2
- torchvision==0.20.1
3
  --extra-index-url https://download.pytorch.org/whl/cu121
4
- xformers==0.0.29.post1
5
- triton==3.1.0
6
 
7
- diffusers==0.33.1
8
- transformers==4.52.3
9
- huggingface-hub<1.0
10
  imageio==2.27.0
11
  decord==0.6.0
12
  accelerate==0.26.1
13
  einops==0.7.0
14
  omegaconf==2.3.0
15
- safetensors>=0.4.3
16
  opencv-python==4.9.0.80
17
  mediapipe==0.10.11
18
- av
19
  torch-fidelity==0.3.0
20
  torchmetrics==1.3.1
21
  python_speech_features==0.6
@@ -27,8 +27,5 @@ face-alignment==1.4.1
27
  ninja==1.11.1.1
28
  pandas==2.0.3
29
  numpy==1.24.4
30
- pydub==0.25.1
31
- moviepy==1.0.3
32
- hf-xet==1.1.8
33
- spaces
34
- gradio[mcp]
 
1
+ torch==2.2.2
2
+ torchvision==0.17.2
3
  --extra-index-url https://download.pytorch.org/whl/cu121
4
+ xformers==0.0.26
5
+ triton==2.2.0
6
 
7
+ diffusers==0.11.1
8
+ transformers==4.38.0
9
+ huggingface-hub==0.25.2
10
  imageio==2.27.0
11
  decord==0.6.0
12
  accelerate==0.26.1
13
  einops==0.7.0
14
  omegaconf==2.3.0
15
+ safetensors==0.4.2
16
  opencv-python==4.9.0.80
17
  mediapipe==0.10.11
18
+ av==11.0.0
19
  torch-fidelity==0.3.0
20
  torchmetrics==1.3.1
21
  python_speech_features==0.6
 
27
  ninja==1.11.1.1
28
  pandas==2.0.3
29
  numpy==1.24.4
30
+ pydub
31
+ moviepy==1.0.3