LTTEAM committed on
Commit
e17ad9a
·
verified ·
1 Parent(s): f236d0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -17
app.py CHANGED
@@ -11,12 +11,12 @@ from huggingface_hub import snapshot_download
11
  from omegaconf import OmegaConf
12
  from diffusers import AutoencoderKL, DDIMScheduler
13
 
14
- # ─── 0. Chuyển CWD & thiết lập PYTHONPATH ───────────────────────────
15
  BASE_DIR = os.path.dirname(__file__)
16
- # Chuyển working directory vào LatentSync để các đường dẫn relative nội bộ đúng
17
  os.chdir(os.path.join(BASE_DIR, "LatentSync"))
18
 
19
- # Sau khi chdir, thêm Long_Tieng và LatentSync vào sys.path để import modules
20
  sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
21
  sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
22
 
@@ -109,17 +109,17 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
109
  return out_video
110
 
111
  # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
112
- # 2.1 Tải checkpoints
113
  REPO_ID = "LTTEAM/Nhep_Mieng"
114
  ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
115
  os.makedirs(ckpt_dir, exist_ok=True)
116
  snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
117
 
118
- # 2.2 Load cấu hình U-Net
119
  cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
120
  conf = OmegaConf.load(cfg_path)
121
 
122
- # 2.3 Load scheduler config local, loại bỏ các khóa không hợp lệ
123
  sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
124
  with open(sched_path, "r") as f:
125
  sched_cfg = json.load(f)
@@ -127,26 +127,25 @@ valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
127
  init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
128
  scheduler = DDIMScheduler(**init_cfg)
129
 
130
- # 2.4 Load VAE và fix missing shift_factor
131
  vae = AutoencoderKL.from_pretrained(
132
  "stabilityai/sd-vae-ft-mse",
133
  torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
134
  )
135
- # Một số VAE config thiếu shift_factor => default về 0.0
136
  if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
137
  vae.config.shift_factor = 0.0
138
 
139
- # 2.5 Whisper audio encoder
140
  from latentsync.whisper.audio2feature import Audio2Feature
141
  dim = conf.model.cross_attention_dim
142
- whisper_file = "small.pt" if dim == 768 else "tiny.pt"
143
  audio_encoder = Audio2Feature(
144
- model_path=os.path.join(ckpt_dir, "whisper", whisper_file),
145
  device=device,
146
  num_frames=conf.data.num_frames
147
  )
148
 
149
- # 2.6 Load UNet3DConditionModel
150
  from latentsync.models.unet import UNet3DConditionModel
151
  unet, _ = UNet3DConditionModel.from_pretrained(
152
  OmegaConf.to_container(conf.model),
@@ -155,7 +154,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
155
  )
156
  unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
157
 
158
- # 2.7 Build LipsyncPipeline
159
  from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
160
  pipe_sync = LipsyncPipeline(
161
  vae=vae,
@@ -171,7 +170,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
171
 
172
  out_id = uuid.uuid4().hex
173
  result = f"lipsync_{out_id}.mp4"
174
-
175
  try:
176
  pipe_sync(
177
  video_path=video_path,
@@ -187,7 +186,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
187
  )
188
  except RuntimeError as e:
189
  if "Face not detected" in str(e):
190
- raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video có khuôn mặt rõ ràng.")
191
  else:
192
  raise
193
  return result
@@ -258,7 +257,11 @@ text_video2video = gr.Interface(
258
  title="Text + Video → Lip-Sync"
259
  )
260
 
261
- gr.TabbedInterface(
 
262
  [text2audio, video2audio, audio2video, text_video2video],
263
  ["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
264
- ).launch(share=True)
 
 
 
 
11
  from omegaconf import OmegaConf
12
  from diffusers import AutoencoderKL, DDIMScheduler
13
 
14
+ # ─── 0. Thiết lập Working Directory & PYTHONPATH ────────────────────
15
  BASE_DIR = os.path.dirname(__file__)
16
+ # 0.1 chuyển CWD vào LatentSync để tất cả đường dẫn relative nội bộ (mask, configs…) đúng
17
  os.chdir(os.path.join(BASE_DIR, "LatentSync"))
18
 
19
+ # 0.2 thêm Long_Tieng và LatentSync vào sys.path để import modules
20
  sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
21
  sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
22
 
 
109
  return out_video
110
 
111
  # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
112
+ # 2.1 tải checkpoints về local
113
  REPO_ID = "LTTEAM/Nhep_Mieng"
114
  ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
115
  os.makedirs(ckpt_dir, exist_ok=True)
116
  snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
117
 
118
+ # 2.2 load U-Net config
119
  cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
120
  conf = OmegaConf.load(cfg_path)
121
 
122
+ # 2.3 load scheduler từ config local, lọc bỏ các khóa không hợp lệ
123
  sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
124
  with open(sched_path, "r") as f:
125
  sched_cfg = json.load(f)
 
127
  init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
128
  scheduler = DDIMScheduler(**init_cfg)
129
 
130
+ # 2.4 load VAE và đảm bảo shift_factor
131
  vae = AutoencoderKL.from_pretrained(
132
  "stabilityai/sd-vae-ft-mse",
133
  torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
134
  )
 
135
  if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
136
  vae.config.shift_factor = 0.0
137
 
138
+ # 2.5 load Whisper encoder
139
  from latentsync.whisper.audio2feature import Audio2Feature
140
  dim = conf.model.cross_attention_dim
141
+ wp = "small.pt" if dim == 768 else "tiny.pt"
142
  audio_encoder = Audio2Feature(
143
+ model_path=os.path.join(ckpt_dir, "whisper", wp),
144
  device=device,
145
  num_frames=conf.data.num_frames
146
  )
147
 
148
+ # 2.6 load UNet3DConditionModel
149
  from latentsync.models.unet import UNet3DConditionModel
150
  unet, _ = UNet3DConditionModel.from_pretrained(
151
  OmegaConf.to_container(conf.model),
 
154
  )
155
  unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
156
 
157
+ # 2.7 build lipsync pipeline
158
  from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
159
  pipe_sync = LipsyncPipeline(
160
  vae=vae,
 
170
 
171
  out_id = uuid.uuid4().hex
172
  result = f"lipsync_{out_id}.mp4"
173
+ # bắt lỗi face not detected
174
  try:
175
  pipe_sync(
176
  video_path=video_path,
 
186
  )
187
  except RuntimeError as e:
188
  if "Face not detected" in str(e):
189
+ raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video rõ ràng.")
190
  else:
191
  raise
192
  return result
 
257
  title="Text + Video → Lip-Sync"
258
  )
259
 
260
+ # Tabbed interface với queue để cho phép chạy lâu (timeout=3600s)
261
+ demo = gr.TabbedInterface(
262
  [text2audio, video2audio, audio2video, text_video2video],
263
  ["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
264
+ ).queue(request_timeout=3600)
265
+
266
+ if __name__ == "__main__":
267
+ demo.launch(share=True)