Spaces:

LTTEAM
/

Veo3Audio

Paused

App Files Community

LTTEAM committed on Jul 2, 2025

Commit

e17ad9a

verified ·

1 Parent(s): f236d0d

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -17

app.py CHANGED Viewed

@@ -11,12 +11,12 @@ from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
-# ─── 0. Chuyển CWD & thiết lập PYTHONPATH ───────────────────────────
 BASE_DIR = os.path.dirname(__file__)
-# Chuyển working directory vào LatentSync để các đường dẫn relative nội bộ đúng
 os.chdir(os.path.join(BASE_DIR, "LatentSync"))
-# Sau khi chdir, thêm Long_Tieng và LatentSync vào sys.path để import modules
 sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
 sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
@@ -109,17 +109,17 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
     return out_video
 # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
-# 2.1 Tải checkpoints
 REPO_ID = "LTTEAM/Nhep_Mieng"
 ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
 os.makedirs(ckpt_dir, exist_ok=True)
 snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
-# 2.2 Load cấu hình U-Net
 cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
 conf = OmegaConf.load(cfg_path)
-# 2.3 Load scheduler config local, loại bỏ các khóa không hợp lệ
 sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
 with open(sched_path, "r") as f:
     sched_cfg = json.load(f)
@@ -127,26 +127,25 @@ valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
 init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
 scheduler = DDIMScheduler(**init_cfg)
-# 2.4 Load VAE và fix missing shift_factor
 vae = AutoencoderKL.from_pretrained(
     "stabilityai/sd-vae-ft-mse",
     torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
 )
-# Một số VAE config thiếu shift_factor => default về 0.0
 if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
     vae.config.shift_factor = 0.0
-# 2.5 Whisper audio encoder
 from latentsync.whisper.audio2feature import Audio2Feature
 dim = conf.model.cross_attention_dim
-whisper_file = "small.pt" if dim == 768 else "tiny.pt"
 audio_encoder = Audio2Feature(
-    model_path=os.path.join(ckpt_dir, "whisper", whisper_file),
     device=device,
     num_frames=conf.data.num_frames
 )
-# 2.6 Load UNet3DConditionModel
 from latentsync.models.unet import UNet3DConditionModel
 unet, _ = UNet3DConditionModel.from_pretrained(
     OmegaConf.to_container(conf.model),
@@ -155,7 +154,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
 )
 unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
-# 2.7 Build LipsyncPipeline
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
     vae=vae,
@@ -171,7 +170,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
     out_id = uuid.uuid4().hex
     result = f"lipsync_{out_id}.mp4"
     try:
         pipe_sync(
             video_path=video_path,
@@ -187,7 +186,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
         )
     except RuntimeError as e:
         if "Face not detected" in str(e):
-            raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video có khuôn mặt rõ ràng.")
         else:
             raise
     return result
@@ -258,7 +257,11 @@ text_video2video = gr.Interface(
     title="Text + Video → Lip-Sync"
 )
-gr.TabbedInterface(
     [text2audio, video2audio, audio2video, text_video2video],
     ["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
-).launch(share=True)

 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
+# ─── 0. Thiết lập Working Directory & PYTHONPATH ────────────────────
 BASE_DIR = os.path.dirname(__file__)
+# 0.1 chuyển CWD vào LatentSync để tất cả đường dẫn relative nội bộ (mask, configs…) đúng
 os.chdir(os.path.join(BASE_DIR, "LatentSync"))
+# 0.2 thêm Long_Tieng và LatentSync vào sys.path để import modules
 sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
 sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
     return out_video
 # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
+# 2.1 tải checkpoints về local
 REPO_ID = "LTTEAM/Nhep_Mieng"
 ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
 os.makedirs(ckpt_dir, exist_ok=True)
 snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
+# 2.2 load U-Net config
 cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
 conf = OmegaConf.load(cfg_path)
+# 2.3 load scheduler từ config local, lọc bỏ các khóa không hợp lệ
 sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
 with open(sched_path, "r") as f:
     sched_cfg = json.load(f)
 init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
 scheduler = DDIMScheduler(**init_cfg)
+# 2.4 load VAE và đảm bảo có shift_factor
 vae = AutoencoderKL.from_pretrained(
     "stabilityai/sd-vae-ft-mse",
     torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
 )
 if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
     vae.config.shift_factor = 0.0
+# 2.5 load Whisper encoder
 from latentsync.whisper.audio2feature import Audio2Feature
 dim = conf.model.cross_attention_dim
+wp = "small.pt" if dim == 768 else "tiny.pt"
 audio_encoder = Audio2Feature(
+    model_path=os.path.join(ckpt_dir, "whisper", wp),
     device=device,
     num_frames=conf.data.num_frames
 )
+# 2.6 load UNet3DConditionModel
 from latentsync.models.unet import UNet3DConditionModel
 unet, _ = UNet3DConditionModel.from_pretrained(
     OmegaConf.to_container(conf.model),
 )
 unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
+# 2.7 build lipsync pipeline
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
     vae=vae,
     out_id = uuid.uuid4().hex
     result = f"lipsync_{out_id}.mp4"
+    # bắt lỗi face not detected
     try:
         pipe_sync(
             video_path=video_path,
         )
     except RuntimeError as e:
         if "Face not detected" in str(e):
+            raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video rõ ràng.")
         else:
             raise
     return result
     title="Text + Video → Lip-Sync"
 )
+# Tabbed interface với queue để cho phép chạy lâu (timeout=3600s)
+demo = gr.TabbedInterface(
     [text2audio, video2audio, audio2video, text_video2video],
     ["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
+).queue(request_timeout=3600)
+if __name__ == "__main__":
+    demo.launch(share=True)