LTTEAM committed
Commit 21626b4 · verified · 1 Parent(s): e17ad9a

Update app.py

Files changed (1)
  1. app.py +26 -21
app.py CHANGED
@@ -4,6 +4,7 @@ import uuid
 import tempfile
 import json
 import inspect
+import shutil
 
 import torch
 import gradio as gr
@@ -11,12 +12,19 @@ from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
 
-# ─── 0. Set up working directory & PYTHONPATH ───────────────────────
+# ─── 0. Change CWD & set up PYTHONPATH ──────────────────────────────
 BASE_DIR = os.path.dirname(__file__)
-# 0.1 change CWD into LatentSync so all internal relative paths (mask, configs…) resolve correctly
+
+# Change the working dir into LatentSync so its internal relative paths resolve correctly
 os.chdir(os.path.join(BASE_DIR, "LatentSync"))
 
-# 0.2 add Long_Tieng and LatentSync to sys.path so their modules can be imported
+# Copy mask.png from assets to latentsync/utils if needed
+assets_mask = os.path.join("assets", "mask.png")
+utils_mask = os.path.join("latentsync", "utils", "mask.png")
+if os.path.exists(assets_mask) and not os.path.exists(utils_mask):
+    shutil.copy(assets_mask, utils_mask)
+
+# Add Long_Tieng and LatentSync to sys.path so their modules can be imported
 sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
 sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
 
@@ -39,9 +47,7 @@ mma_cfg.download_if_needed()
 setup_eval_logging()
 net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
 net.load_weights(torch.load(
-    mma_cfg.model_path,
-    map_location=device,
-    weights_only=True
+    mma_cfg.model_path, map_location=device, weights_only=True
 ))
 feature_utils = FeaturesUtils(
     tod_vae_ckpt=mma_cfg.vae_path,
@@ -109,17 +115,17 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
     return out_video
 
 # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
-# 2.1 download checkpoints locally
+# 2.1 Download checkpoints
 REPO_ID = "LTTEAM/Nhep_Mieng"
 ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
 os.makedirs(ckpt_dir, exist_ok=True)
 snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
 
-# 2.2 load U-Net config
+# 2.2 Load U-Net config
 cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
 conf = OmegaConf.load(cfg_path)
 
-# 2.3 load scheduler from the local config, filtering out invalid keys
+# 2.3 Load scheduler config locally + filter invalid args
 sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
 with open(sched_path, "r") as f:
     sched_cfg = json.load(f)
@@ -127,7 +133,7 @@ valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
 init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
 scheduler = DDIMScheduler(**init_cfg)
 
-# 2.4 load VAE, ensuring shift_factor is set
+# 2.4 Load VAE and fix missing shift_factor
 vae = AutoencoderKL.from_pretrained(
     "stabilityai/sd-vae-ft-mse",
     torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
@@ -135,17 +141,17 @@ vae = AutoencoderKL.from_pretrained(
 if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
     vae.config.shift_factor = 0.0
 
-# 2.5 load Whisper encoder
+# 2.5 Whisper audio encoder
 from latentsync.whisper.audio2feature import Audio2Feature
 dim = conf.model.cross_attention_dim
-wp = "small.pt" if dim == 768 else "tiny.pt"
+wh = "small.pt" if dim == 768 else "tiny.pt"
 audio_encoder = Audio2Feature(
-    model_path=os.path.join(ckpt_dir, "whisper", wp),
+    model_path=os.path.join(ckpt_dir, "whisper", wh),
     device=device,
     num_frames=conf.data.num_frames
 )
 
-# 2.6 load UNet3DConditionModel
+# 2.6 Load UNet3DConditionModel
 from latentsync.models.unet import UNet3DConditionModel
 unet, _ = UNet3DConditionModel.from_pretrained(
     OmegaConf.to_container(conf.model),
@@ -154,7 +160,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
 )
 unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
 
-# 2.7 build lipsync pipeline
+# 2.7 Build LipsyncPipeline
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
     vae=vae,
@@ -170,7 +176,6 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
 
     out_id = uuid.uuid4().hex
     result = f"lipsync_{out_id}.mp4"
-    # catch "Face not detected" errors
    try:
         pipe_sync(
             video_path=video_path,
@@ -186,7 +191,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
         )
     except RuntimeError as e:
         if "Face not detected" in str(e):
-            raise ValueError("No face detected in the video. Please choose a clearer video.")
+            raise ValueError("No face detected in the video. Please choose a video with a clearly visible face.")
         else:
             raise
     return result
@@ -257,11 +262,11 @@ text_video2video = gr.Interface(
     title="Text + Video → Lip-Sync"
 )
 
-# Tabbed interface with a queue to allow long runs (timeout=3600s)
+# Build the tabbed interface with the queue enabled (defaults)
 demo = gr.TabbedInterface(
     [text2audio, video2audio, audio2video, text_video2video],
     ["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
-).queue(request_timeout=3600)
+).queue()
 
-if __name__ == "__main__":
-    demo.launch(share=True)
+# Launch with share=True
+demo.launch(share=True)
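
One pattern this commit keeps relying on (step 2.3 of the script) is building the DDIMScheduler from a local JSON config while dropping any keys its constructor does not accept. A minimal standalone sketch of that pattern follows; the config path here is illustrative, not the one used by the app:

import inspect
import json

from diffusers import DDIMScheduler

# Load a scheduler config that may carry keys DDIMScheduler no longer accepts
# (illustrative path).
with open("configs/scheduler_config.json", "r") as f:
    sched_cfg = json.load(f)

# Keep only the kwargs that DDIMScheduler.__init__ actually declares.
valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}

scheduler = DDIMScheduler(**init_cfg)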