Latent audio lipsync

#1
by Monarch-1 - opened
README.md CHANGED
@@ -4,10 +4,9 @@ emoji: 👄
 colorFrom: blue
 colorTo: blue
 sdk: gradio
-sdk_version: 6.13.0
+sdk_version: 5.12.0
 app_file: app.py
 pinned: false
-disable_embedding: true
 short_description: Audio Conditioned LipSync with Latent Diffusion Models
 ---

app.py CHANGED
@@ -1,95 +1,73 @@
-def _patch_asyncio_event_loop_del():
-    """
-    Patch a noisy asyncio teardown issue sometimes seen in Spaces environments.
-    In some runtime/container combinations, Python may try to close an already
-    invalid file descriptor when the event loop is garbage-collected. We silence
-    only that specific harmless case.
-    """
-    try:
-        import asyncio.base_events as base_events
-
-        original_del = getattr(base_events.BaseEventLoop, "__del__", None)
-        if original_del is None:
-            return
-
-        def patched_del(self):
-            try:
-                original_del(self)
-            except ValueError as e:
-                if "Invalid file descriptor" not in str(e):
-                    raise
-
-        base_events.BaseEventLoop.__del__ = patched_del
-    except Exception:
-        pass
-
-
-_patch_asyncio_event_loop_del()
-
-
 import gradio as gr
-import spaces
 import os
 import sys
 import shutil
 import uuid
 import subprocess
 from glob import glob
-
 from huggingface_hub import snapshot_download
 
+# Download models
 os.makedirs("checkpoints", exist_ok=True)
+
 snapshot_download(
-    repo_id="ByteDance/LatentSync",
-    local_dir="./checkpoints",
+    repo_id = "chunyu-li/LatentSync",
+    local_dir = "./checkpoints"
 )
 
 import tempfile
 from moviepy.editor import VideoFileClip
 from pydub import AudioSegment
 
-
 def process_video(input_video_path, temp_dir="temp_dir"):
     """
     Crop a given MP4 video to a maximum duration of 10 seconds if it is longer than 10 seconds.
-
+    Save the new video in the specified folder (default is temp_dir).
+
     Args:
         input_video_path (str): Path to the input video file.
        temp_dir (str): Directory where the processed video will be saved.
-
+
     Returns:
        str: Path to the cropped video file.
     """
+    # Ensure the temp_dir exists
     os.makedirs(temp_dir, exist_ok=True)
-
+
+    # Load the video
     video = VideoFileClip(input_video_path)
-
+
+    # Determine the output path
     input_file_name = os.path.basename(input_video_path)
     output_video_path = os.path.join(temp_dir, f"cropped_{input_file_name}")
-
+
+    # Crop the video to 10 seconds if necessary
     if video.duration > 10:
         video = video.subclip(0, 10)
-
+
+    # Write the cropped video to the output path
     video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
-
+
+    # Return the path to the cropped video
     return output_video_path
 
-
 def process_audio(file_path, temp_dir):
+    # Load the audio file
     audio = AudioSegment.from_file(file_path)
-
-    max_duration = 8 * 1000
-
+
+    # Check and cut the audio if longer than 4 seconds
+    max_duration = 8 * 1000  # 4 seconds in milliseconds
     if len(audio) > max_duration:
        audio = audio[:max_duration]
-
+
+    # Save the processed audio in the temporary directory
     output_path = os.path.join(temp_dir, "trimmed_audio.wav")
     audio.export(output_path, format="wav")
-
+
+    # Return the path to the trimmed file
     print(f"Processed audio saved at: {output_path}")
     return output_path
 
-
 import argparse
 from omegaconf import OmegaConf
 import torch
@@ -101,54 +79,26 @@ from accelerate.utils import set_seed
 from latentsync.whisper.audio2feature import Audio2Feature
 
 
-@spaces.GPU(duration=180)
-def generate_lip_sync_video(
-    input_video_path: str,
-    input_audio_path: str,
-    progress=gr.Progress(track_tqdm=True),
-) -> str:
-    """
-    Generate a lip-synced video from an input video and a separate audio track.
-
-    Use this tool when you need to synchronize a visible speaker's mouth movement to match a provided audio file.
-
-    Args:
-        input_video_path (str): File path to the input MP4 video containing the visible speaker.
-        input_audio_path (str): File path to the input audio file used to drive lip synchronization.
-
-    Returns:
-        str: File path to the generated lip-synced MP4 video.
-
-    Raises:
-        NotImplementedError: Raised when the model cross-attention dimension is unsupported.
-
-    Important:
-        Input video is cropped to 10 seconds and input audio is trimmed to 8 seconds before generation.
-    """
-    gr.Info("180 seconds will be used from your daily ZeroGPU time credits.")
-
+def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     inference_ckpt_path = "checkpoints/latentsync_unet.pt"
     unet_config_path = "configs/unet/second_stage.yaml"
-
     config = OmegaConf.load(unet_config_path)
-
-    print(f"Input video path: {input_video_path}")
-    print(f"Input audio path: {input_audio_path}")
+
+    print(f"Input video path: {video_path}")
+    print(f"Input audio path: {audio_path}")
     print(f"Loaded checkpoint path: {inference_ckpt_path}")
 
-    is_shared_ui = True if "fffiloni/LatentSync" in os.environ["SPACE_ID"] else False
-
+    is_shared_ui = True if "fffiloni/LatentSync" in os.environ['SPACE_ID'] else False
     temp_dir = None
     if is_shared_ui:
         temp_dir = tempfile.mkdtemp()
+        cropped_video_path = process_video(video_path)
+        print(f"Cropped video saved to: {cropped_video_path}")
+        video_path=cropped_video_path
 
-        cropped_video_path = process_video(input_video_path)
-        print(f"Cropped video saved to: {cropped_video_path}")
-        input_video_path = cropped_video_path
-
-        trimmed_audio_path = process_audio(input_audio_path, temp_dir)
-        print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
-        input_audio_path = trimmed_audio_path
+        trimmed_audio_path = process_audio(audio_path, temp_dir)
+        print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
+        audio_path=trimmed_audio_path
 
     scheduler = DDIMScheduler.from_pretrained("configs")
 
@@ -159,31 +109,23 @@ def generate_lip_sync_video(
     else:
         raise NotImplementedError("cross_attention_dim must be 768 or 384")
 
-    audio_encoder = Audio2Feature(
-        model_path=whisper_model_path,
-        device="cuda",
-        num_frames=config.data.num_frames,
-    )
+    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
 
-    vae = AutoencoderKL.from_pretrained(
-        "stabilityai/sd-vae-ft-mse",
-        torch_dtype=torch.float16,
-    )
+    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
     vae.config.scaling_factor = 0.18215
     vae.config.shift_factor = 0
 
     unet, _ = UNet3DConditionModel.from_pretrained(
         OmegaConf.to_container(config.model),
-        inference_ckpt_path,
+        inference_ckpt_path,  # load checkpoint
         device="cpu",
     )
+
     unet = unet.to(dtype=torch.float16)
 
-    """
     # set xformers
     if is_xformers_available():
         unet.enable_xformers_memory_efficient_attention()
-    """
 
     pipeline = LipsyncPipeline(
         vae=vae,
@@ -204,8 +146,8 @@ def generate_lip_sync_video(
     video_out_path = f"video_out{unique_id}.mp4"
 
     pipeline(
-        video_path=input_video_path,
-        audio_path=input_audio_path,
+        video_path=video_path,
+        audio_path=audio_path,
         video_out_path=video_out_path,
         video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
         num_frames=config.data.num_frames,
@@ -217,6 +159,7 @@ def generate_lip_sync_video(
     )
 
     if is_shared_ui:
+        # Clean up the temporary directory
        if os.path.exists(temp_dir):
             shutil.rmtree(temp_dir)
             print(f"Temporary directory {temp_dir} deleted.")
@@ -224,21 +167,16 @@ with gr.Blocks() as demo:
     return video_out_path
 
 
-css = """
+css="""
 div#col-container{
     margin: 0 auto;
     max-width: 982px;
 }
 """
-
-with gr.Blocks() as demo:
+with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("# LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync")
-        gr.Markdown(
-            "LatentSync, an end-to-end lip sync framework based on audio conditioned latent diffusion models "
-            "without any intermediate motion representation, diverging from previous diffusion-based lip sync "
-            "methods based on pixel space diffusion or two-stage generation."
-        )
+        gr.Markdown("LatentSync, an end-to-end lip sync framework based on audio conditioned latent diffusion models without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel space diffusion or two-stage generation.")
         gr.HTML("""
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/bytedance/LatentSync">
@@ -247,43 +185,35 @@ with gr.Blocks() as demo:
             <a href="https://arxiv.org/abs/2412.09262">
                 <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
             </a>
-            <a href="https://huggingface.co/ByteDance/LatentSync">
-                <img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-yellow'>
+            <a href="https://huggingface.co/spaces/fffiloni/LatentSync?duplicate=true">
+                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
             </a>
-            <a href="https://github.com/bytedance/LatentSync/blob/main/LICENSE">
-                <img src='https://img.shields.io/badge/License-Apache%202.0-green'>
+            <a href="https://huggingface.co/fffiloni">
+                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
             </a>
         </div>
         """)
-
         with gr.Row():
             with gr.Column():
                 video_input = gr.Video(label="Video Control", format="mp4")
                 audio_input = gr.Audio(label="Audio Input", type="filepath")
                 submit_btn = gr.Button("Submit")
-
             with gr.Column():
                 video_result = gr.Video(label="Result")
 
-        gr.Examples(
-            examples=[
-                ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
-                ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
-                ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
-            ],
-            inputs=[video_input, audio_input],
-        )
-
-        submit_btn.click(
-            fn=generate_lip_sync_video,
-            inputs=[video_input, audio_input],
-            outputs=[video_result],
-            api_visibility="public",
-        )
+        gr.Examples(
+            examples = [
+                ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
+                ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
+                ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
+            ],
+            inputs = [video_input, audio_input]
+        )
+
+        submit_btn.click(
+            fn = main,
+            inputs = [video_input, audio_input],
+            outputs = [video_result]
+        )
 
-demo.queue().launch(
-    css=css,
-    show_error=True,
-    ssr_mode=False,
-    mcp_server=True,
-)
+demo.queue().launch(show_api=False, show_error=True)
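A note on the trimming helper added above: in `process_audio`, `max_duration = 8 * 1000` is 8 seconds, since pydub measures `AudioSegment` length in milliseconds, so the "4 seconds" comments understate the actual cap; `process_video` separately limits clips to 10 seconds. A minimal standalone sketch of the same pydub trimming step with the duration spelled out (the helper name and paths here are illustrative, not part of app.py):

```python
from pydub import AudioSegment

MAX_AUDIO_MS = 8 * 1000  # 8 seconds; pydub slices AudioSegments by milliseconds

def trim_audio(src_path: str, dst_path: str) -> str:
    """Trim an audio file to at most 8 seconds and export it as WAV."""
    audio = AudioSegment.from_file(src_path)
    if len(audio) > MAX_AUDIO_MS:      # len() of an AudioSegment is its duration in ms
        audio = audio[:MAX_AUDIO_MS]   # keep the first 8 seconds
    audio.export(dst_path, format="wav")
    return dst_path

# Example with hypothetical paths:
# trim_audio("assets/demo1_audio.wav", "temp_dir/trimmed_audio.wav")
```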
latentsync/models/attention.py CHANGED
@@ -9,10 +9,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import Attention as CrossAttention, FeedForward, AdaLayerNorm
+from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
 
 from einops import rearrange, repeat
 from .utils import zero_module
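Both import styles in this hunk belong to different diffusers releases: the `-` lines match newer versions (where `ModelMixin` lives under `diffusers.models.modeling_utils` and `CrossAttention` was renamed to `Attention`), while the `+` lines match the older `diffusers==0.11.1` pinned in this same change. If one wanted to tolerate either layout instead of hard-pinning, a try/except fallback along these lines would work — a sketch, not part of this change:

```python
# Version-tolerant import, assuming only the two layouts shown in the diff above.
try:
    # Newer diffusers: nested module path, CrossAttention renamed to Attention.
    from diffusers.models.modeling_utils import ModelMixin
    from diffusers.models.attention import Attention as CrossAttention, FeedForward, AdaLayerNorm
except ImportError:
    # Older diffusers (e.g. 0.11.1): flat module path, original CrossAttention name.
    from diffusers.modeling_utils import ModelMixin
    from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
```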
latentsync/models/motion_module.py CHANGED
@@ -11,10 +11,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import Attention as CrossAttention, FeedForward
+from diffusers.models.attention import CrossAttention, FeedForward
 
 from einops import rearrange, repeat
 import math
latentsync/models/unet.py CHANGED
@@ -9,7 +9,7 @@ import torch.nn as nn
 import torch.utils.checkpoint
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers import UNet2DConditionModel
 from diffusers.utils import BaseOutput, logging
 from diffusers.models.embeddings import TimestepEmbedding, Timesteps
latentsync/pipelines/lipsync_pipeline.py CHANGED
@@ -15,7 +15,7 @@ from packaging import version
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
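The same version split applies here: newer diffusers exposes `DiffusionPipeline` from `diffusers.pipelines.pipeline_utils`, while the pinned 0.11.1 still uses the flat `diffusers.pipeline_utils` module. A hedged fallback import, again a sketch rather than part of this change:

```python
try:
    from diffusers.pipelines.pipeline_utils import DiffusionPipeline  # newer diffusers
except ImportError:
    from diffusers.pipeline_utils import DiffusionPipeline  # older diffusers, e.g. 0.11.1
```

In most releases the top-level `from diffusers import DiffusionPipeline` also resolves, which sidesteps the module move entirely.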
requirements.txt CHANGED
@@ -1,21 +1,21 @@
-torch==2.5.1
-torchvision==0.20.1
+torch==2.2.2
+torchvision==0.17.2
 --extra-index-url https://download.pytorch.org/whl/cu121
-xformers==0.0.29.post1
-triton==3.1.0
+xformers==0.0.26
+triton==2.2.0
 
-diffusers==0.33.1
-transformers==4.52.3
-huggingface-hub<1.0
+diffusers==0.11.1
+transformers==4.38.0
+huggingface-hub==0.25.2
 imageio==2.27.0
 decord==0.6.0
 accelerate==0.26.1
 einops==0.7.0
 omegaconf==2.3.0
-safetensors>=0.4.3
+safetensors==0.4.2
 opencv-python==4.9.0.80
 mediapipe==0.10.11
-av
+av==11.0.0
 torch-fidelity==0.3.0
 torchmetrics==1.3.1
 python_speech_features==0.6
@@ -27,8 +27,5 @@ face-alignment==1.4.1
 ninja==1.11.1.1
 pandas==2.0.3
 numpy==1.24.4
-pydub==0.25.1
-moviepy==1.0.3
-hf-xet==1.1.8
-spaces
-gradio[mcp]
+pydub
+moviepy==1.0.3
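Since the two sides of this diff pin very different torch/diffusers stacks, a quick runtime check of what is actually installed in the Space can save a debugging round-trip. A small hedged snippet (not part of the change) using the standard-library metadata API:

```python
from importlib.metadata import version, PackageNotFoundError

# Packages whose pins differ between the two sides of this diff.
for pkg in ["torch", "torchvision", "xformers", "diffusers", "transformers", "huggingface-hub"]:
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")
```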