LTTEAM committed · Commit a9e5a39 · verified · 1 Parent(s): 185f811

Update app.py

Files changed (1):
  1. app.py +24 -42
app.py CHANGED
@@ -1,20 +1,19 @@
-
 import os
 import sys
 import uuid
 import tempfile
-
 import torch
 import gradio as gr
 from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
 
-# ——————————————————————————
-# Let Python "see" the two subdirectories
+# -------------------------------------------------------------------
+# Add paths so Python can find the packages in Long_Tieng and LatentSync
 BASE_DIR = os.path.dirname(__file__)
 sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
 sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
+# -------------------------------------------------------------------
 
 # === MMAUDIO (Long_Tieng) setup ===
 from mmaudio.eval_utils import (
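Note: `sys.path.insert(0, ...)` prepends, so the vendored `Long_Tieng` and `LatentSync` trees shadow any installed packages with the same import names. The pattern in isolation:

```python
import os
import sys

BASE_DIR = os.path.dirname(__file__)
# Prepend so the vendored copies win over site-packages at import time.
for sub in ("Long_Tieng", "LatentSync"):
    sys.path.insert(0, os.path.join(BASE_DIR, sub))
```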
@@ -28,16 +27,14 @@ from mmaudio.model.utils.features_utils import FeaturesUtils
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-dtype = torch.bfloat16 if device.type=="cuda" else torch.float32
+dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
 
-# Load MMAudio model
-model: ModelConfig = all_model_cfg["large_44k_v2"]
+# Load mmaudio model
+model: ModelConfig = all_model_cfg['large_44k_v2']
 model.download_if_needed()
 setup_eval_logging()
 net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
-net.load_weights(
-    torch.load(model.model_path, map_location=device, weights_only=True)
-)
+net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
 feature_utils = FeaturesUtils(
     tod_vae_ckpt=model.vae_path,
     synchformer_ckpt=model.synchformer_ckpt,
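Note: the loading block pairs device-dependent dtype selection with `weights_only=True`, which restricts `torch.load` to plain tensors and containers rather than arbitrary pickled objects. The same pattern in isolation (the checkpoint path is a placeholder, not from this repo):

```python
import torch

# bfloat16 is only worthwhile on CUDA; CPU inference stays in float32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if device.type == "cuda" else torch.float32

# weights_only=True refuses pickled code objects in the checkpoint;
# "model.ckpt" is a placeholder path.
state_dict = torch.load("model.ckpt", map_location=device, weights_only=True)
```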
@@ -51,11 +48,8 @@ seq_cfg: SequenceConfig = model.seq_cfg
 @torch.inference_mode()
 def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
     rng = torch.Generator(device=device)
-    if seed >= 0:
-        rng.manual_seed(seed)
-    else:
-        rng.seed()
-    fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
+    if seed >= 0: rng.manual_seed(seed)
+    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
     seq_cfg.duration = duration
     net.update_seq_lengths(
         seq_cfg.latent_seq_len,
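Note: the rewrite drops the `else: rng.seed()` branch. A fresh `torch.Generator` starts from a fixed default seed rather than system entropy, so with a negative seed every run may now sample identically instead of varying. A helper that preserves the old explicit behavior (the `make_rng` name is hypothetical):

```python
import torch

def make_rng(seed: int, device: torch.device) -> torch.Generator:
    # Fixed seed for reproducible runs; otherwise reseed from system
    # entropy, matching the removed else-branch.
    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    return rng
```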
@@ -76,7 +70,6 @@ def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
 
 @torch.inference_mode()
 def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, duration):
-    # From Long_Tieng eval_utils
     from mmaudio.eval_utils import load_video, make_video
     from mmaudio.model.flow_matching import FlowMatching
 
@@ -85,11 +78,8 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, duration):
     sync = video_info.sync_frames.unsqueeze(0)
 
     rng = torch.Generator(device=device)
-    if seed >= 0:
-        rng.manual_seed(seed)
-    else:
-        rng.seed()
-    fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
+    if seed >= 0: rng.manual_seed(seed)
+    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
 
     seq_cfg.duration = video_info.duration_sec
     net.update_seq_lengths(
@@ -104,13 +94,13 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, duration):
         net=net, fm=fm, rng=rng, cfg_strength=guidance
     )
     audio = audios.float().cpu()[0]
-    out_vid = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-    make_video(video_info, out_vid, audio, sampling_rate=seq_cfg.sampling_rate)
-    return out_vid
+    video_out = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+    make_video(video_info, video_out, audio, sampling_rate=seq_cfg.sampling_rate)
+    return video_out
 
 # === LATENTSYNC setup ===
 REPO_ID = "LTTEAM/Nhep_Mieng"
-snapshot_download(repo_id=REPO_ID, local_dir="checkpoints", allow_patterns=["*.pt"])
+snapshot_download(repo_id=REPO_ID, local_dir="checkpoints")
 
 conf = OmegaConf.load("configs/unet/second_stage.yaml")
 vae = AutoencoderKL.from_pretrained(
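Note: dropping `allow_patterns=["*.pt"]` makes `snapshot_download` pull the entire repository snapshot rather than only the `.pt` checkpoints, presumably so that non-checkpoint assets in the repo are fetched as well. The filtered form, for reference:

```python
from huggingface_hub import snapshot_download

# Fetch only the .pt checkpoint files into ./checkpoints; without
# allow_patterns the whole snapshot is downloaded.
snapshot_download(
    repo_id="LTTEAM/Nhep_Mieng",
    local_dir="checkpoints",
    allow_patterns=["*.pt"],
)
```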
@@ -138,11 +128,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
     "checkpoints/latentsync_unet.pt",
     device=device
 )
-unet = (
-    unet.to(dtype=torch.float16)
-    if device=="cuda" else
-    unet.to(dtype=torch.float32)
-)
+unet = unet.to(dtype=torch.float16) if device=="cuda" else unet.to(dtype=torch.float32)
 
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
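Note: both the old and new expressions compare a `torch.device` against the string "cuda". Depending on the PyTorch version, that comparison can evaluate to False even on a GPU, silently keeping the UNet in float32; `device.type == "cuda"`, as the mmaudio block above already uses, is unambiguous:

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# device == "cuda" compares a torch.device with a str and may fail on
# some PyTorch versions; .type is always the plain string "cuda"/"cpu".
unet_dtype = torch.float16 if device.type == "cuda" else torch.float32
```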
@@ -153,13 +139,9 @@ pipe_sync = LipsyncPipeline(
 ).to(device)
 
 def lipsync_fn(video_path, audio_path, seed, num_frames, steps):
-    # From LatentSync pipelines
     from accelerate.utils import set_seed
     if seed >= 0:
         set_seed(seed)
-    else:
-        torch.seed()
-
     out_id = uuid.uuid4().hex
     out_path = f"out_{out_id}.mp4"
     pipe_sync(
@@ -176,7 +158,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, steps):
     )
     return out_path
 
-# === Gradio UI ===
+# === BUILD GRADIO UI ===
 text2audio = gr.Interface(
     fn=text_to_audio_fn,
     inputs=[
@@ -185,7 +167,7 @@ text2audio = gr.Interface(
         gr.Number(label="Seed", value=-1, precision=0),
         gr.Number(label="Num Steps", value=25, precision=0),
         gr.Number(label="Guidance Strength", value=4.5),
-        gr.Number(label="Duration (s)", value=8)
+        gr.Number(label="Duration (s)", value=8),
     ],
     outputs=gr.Audio(label="Generated Audio"),
     title="Text → Audio"
@@ -200,9 +182,9 @@ video2audio = gr.Interface(
         gr.Number(label="Seed", value=-1, precision=0),
         gr.Number(label="Num Steps", value=25, precision=0),
         gr.Number(label="Guidance Strength", value=4.5),
-        gr.Number(label="Duration (s)", value=8)
+        gr.Number(label="Duration (s)", value=8),
     ],
-    outputs=gr.Video(label="Video + Audio"),
+    outputs=gr.Video(label="Video with Audio"),
     title="Video → Audio"
 )
 
@@ -213,7 +195,7 @@ audio2video = gr.Interface(
         gr.Audio(label="Input Audio", type="filepath"),
         gr.Number(label="Seed", value=-1, precision=0),
         gr.Number(label="Num Frames", value=16, precision=0),
-        gr.Number(label="Inference Steps", value=50, precision=0)
+        gr.Number(label="Inference Steps", value=50, precision=0),
     ],
     outputs=gr.Video(label="Lip-Synced Video"),
     title="Audio → Lip-Sync"
@@ -221,8 +203,8 @@ audio2video = gr.Interface(
 
 text_video2video = gr.Interface(
     fn=lambda p,np,sd,ns,gs,du,vid,nf,st: (
-        text_to_audio_fn(p, np, sd, ns, gs, du),
-        lipsync_fn(vid, text_to_audio_fn(p, np, sd, ns, gs, du), sd, nf, st)
+        text_to_audio_fn(p,np,sd,ns,gs,du),
+        lipsync_fn(vid, text_to_audio_fn(p,np,sd,ns,gs,du), sd, nf, st)
     ),
     inputs=[
         gr.Textbox(label="Prompt"),
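Note: the combined endpoint still calls `text_to_audio_fn` twice, once for the preview output and once to feed `lipsync_fn`, doubling generation time; with a negative seed the two results may even differ. A sketch that generates once (assuming `text_to_audio_fn` returns a file path that `lipsync_fn` accepts):

```python
def text_video_to_video(p, np, sd, ns, gs, du, vid, nf, st):
    # Generate the audio once and reuse it for both outputs.
    audio = text_to_audio_fn(p, np, sd, ns, gs, du)
    return audio, lipsync_fn(vid, audio, sd, nf, st)
```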
@@ -233,7 +215,7 @@ text_video2video = gr.Interface(
         gr.Number(label="Duration (s)", value=8),
         gr.Video(label="Input Video"),
         gr.Number(label="Num Frames", value=16, precision=0),
-        gr.Number(label="Inference Steps", value=50, precision=0)
+        gr.Number(label="Inference Steps", value=50, precision=0),
     ],
     outputs=[gr.Audio(label="Synth Audio"), gr.Video(label="Lip-Synced Video")],
     title="Text + Video → Lip-Sync"
 
 