Spaces:

LTTEAM
/

Veo3Audio

Runtime error

App Files Community

LTTEAM committed on Jul 2, 2025

Commit

21626b4

verified ·

1 Parent(s): e17ad9a

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -21

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import uuid
 import tempfile
 import json
 import inspect
 import torch
 import gradio as gr
@@ -11,12 +12,19 @@ from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
-# ─── 0. Thiết lập Working Directory & PYTHONPATH ────────────────────
 BASE_DIR = os.path.dirname(__file__)
-# 0.1 chuyển CWD vào LatentSync để tất cả đường dẫn relative nội bộ (mask, configs…) đúng
 os.chdir(os.path.join(BASE_DIR, "LatentSync"))
-# 0.2 thêm Long_Tieng và LatentSync vào sys.path để import modules
 sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
 sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
@@ -39,9 +47,7 @@ mma_cfg.download_if_needed()
 setup_eval_logging()
 net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
 net.load_weights(torch.load(
-    mma_cfg.model_path,
-    map_location=device,
-    weights_only=True
 ))
 feature_utils = FeaturesUtils(
     tod_vae_ckpt=mma_cfg.vae_path,
@@ -109,17 +115,17 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
     return out_video
 # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
-# 2.1 tải checkpoints về local
 REPO_ID = "LTTEAM/Nhep_Mieng"
 ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
 os.makedirs(ckpt_dir, exist_ok=True)
 snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
-# 2.2 load U-Net config
 cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
 conf = OmegaConf.load(cfg_path)
-# 2.3 load scheduler từ config local, lọc bỏ các khóa không hợp lệ
 sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
 with open(sched_path, "r") as f:
     sched_cfg = json.load(f)
@@ -127,7 +133,7 @@ valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
 init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
 scheduler = DDIMScheduler(**init_cfg)
-# 2.4 load VAE và đảm bảo có shift_factor
 vae = AutoencoderKL.from_pretrained(
     "stabilityai/sd-vae-ft-mse",
     torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
@@ -135,17 +141,17 @@ vae = AutoencoderKL.from_pretrained(
 if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
     vae.config.shift_factor = 0.0
-# 2.5 load Whisper encoder
 from latentsync.whisper.audio2feature import Audio2Feature
 dim = conf.model.cross_attention_dim
-wp = "small.pt" if dim == 768 else "tiny.pt"
 audio_encoder = Audio2Feature(
-    model_path=os.path.join(ckpt_dir, "whisper", wp),
     device=device,
     num_frames=conf.data.num_frames
 )
-# 2.6 load UNet3DConditionModel
 from latentsync.models.unet import UNet3DConditionModel
 unet, _ = UNet3DConditionModel.from_pretrained(
     OmegaConf.to_container(conf.model),
@@ -154,7 +160,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
 )
 unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
-# 2.7 build lipsync pipeline
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
     vae=vae,
@@ -170,7 +176,6 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
     out_id = uuid.uuid4().hex
     result = f"lipsync_{out_id}.mp4"
-    # bắt lỗi face not detected
     try:
         pipe_sync(
             video_path=video_path,
@@ -186,7 +191,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
         )
     except RuntimeError as e:
         if "Face not detected" in str(e):
-            raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video rõ ràng.")
         else:
             raise
     return result
@@ -257,11 +262,11 @@ text_video2video = gr.Interface(
     title="Text + Video → Lip-Sync"
 )
-# Tabbed interface với queue để cho phép chạy lâu (timeout=3600s)
 demo = gr.TabbedInterface(
     [text2audio, video2audio, audio2video, text_video2video],
     ["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
-).queue(request_timeout=3600)
-if __name__ == "__main__":
-    demo.launch(share=True)

 import tempfile
 import json
 import inspect
+import shutil
 import torch
 import gradio as gr
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
+# ─── 0. Chuyển CWD & thiết lập PYTHONPATH ───────────────────────────
 BASE_DIR = os.path.dirname(__file__)
+# Chuyển working dir vào LatentSync để các đường dẫn relative bên trong đúng
 os.chdir(os.path.join(BASE_DIR, "LatentSync"))
+# Copy mask.png từ assets → latentsync/utils nếu cần
+assets_mask = os.path.join("assets", "mask.png")
+utils_mask  = os.path.join("latentsync", "utils", "mask.png")
+if os.path.exists(assets_mask) and not os.path.exists(utils_mask):
+    shutil.copy(assets_mask, utils_mask)
+# Thêm Long_Tieng và LatentSync vào sys.path để import modules
 sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
 sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
 setup_eval_logging()
 net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
 net.load_weights(torch.load(
+    mma_cfg.model_path, map_location=device, weights_only=True
 ))
 feature_utils = FeaturesUtils(
     tod_vae_ckpt=mma_cfg.vae_path,
     return out_video
 # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
+# 2.1 Download checkpoints
 REPO_ID = "LTTEAM/Nhep_Mieng"
 ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
 os.makedirs(ckpt_dir, exist_ok=True)
 snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
+# 2.2 Load U-Net config
 cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
 conf = OmegaConf.load(cfg_path)
+# 2.3 Load scheduler config locally + filter invalid args
 sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
 with open(sched_path, "r") as f:
     sched_cfg = json.load(f)
 init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
 scheduler = DDIMScheduler(**init_cfg)
+# 2.4 Load VAE and fix missing shift_factor
 vae = AutoencoderKL.from_pretrained(
     "stabilityai/sd-vae-ft-mse",
     torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
 if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
     vae.config.shift_factor = 0.0
+# 2.5 Whisper audio encoder
 from latentsync.whisper.audio2feature import Audio2Feature
 dim = conf.model.cross_attention_dim
+wh = "small.pt" if dim == 768 else "tiny.pt"
 audio_encoder = Audio2Feature(
+    model_path=os.path.join(ckpt_dir, "whisper", wh),
     device=device,
     num_frames=conf.data.num_frames
 )
+# 2.6 Load UNet3DConditionModel
 from latentsync.models.unet import UNet3DConditionModel
 unet, _ = UNet3DConditionModel.from_pretrained(
     OmegaConf.to_container(conf.model),
 )
 unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
+# 2.7 Build LipsyncPipeline
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
     vae=vae,
     out_id = uuid.uuid4().hex
     result = f"lipsync_{out_id}.mp4"
     try:
         pipe_sync(
             video_path=video_path,
         )
     except RuntimeError as e:
         if "Face not detected" in str(e):
+            raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video có khuôn mặt rõ ràng.")
         else:
             raise
     return result
     title="Text + Video → Lip-Sync"
 )
+# Tạo tabbed interface và bật queue (mặc định)
 demo = gr.TabbedInterface(
     [text2audio, video2audio, audio2video, text_video2video],
     ["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
+).queue()
+# Launch với share=True
+demo.launch(share=True)