Spaces:

LTTEAM
/

Veo3Audio

Runtime error

App Files Community

LTTEAM committed on Jul 2, 2025

Commit

07d48bb

verified ·

1 Parent(s): a7d3d06

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -26

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import sys
 import uuid
 import tempfile
 import json
-import shutil
 import inspect
 import torch
@@ -12,17 +11,15 @@ from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
-# ─── 0. Thêm Long_Tieng & LatentSync vào PYTHONPATH ────────────────
-BASE = os.path.dirname(__file__)
-sys.path.insert(0, os.path.join(BASE, "Long_Tieng"))
-sys.path.insert(0, os.path.join(BASE, "LatentSync"))
-# ─── 0.1 Copy mask.png vào latentsync/utils ─────────────────────────
-src_mask = os.path.join(BASE, "LatentSync", "assets", "mask.png")
-dst_utils = os.path.join(BASE, "LatentSync", "latentsync", "utils")
-dst_mask = os.path.join(dst_utils, "mask.png")
-if os.path.exists(src_mask) and not os.path.exists(dst_mask):
-    shutil.copy(src_mask, dst_mask)
 # ─── 1. MMAUDIO (Long_Tieng) setup ─────────────────────────────────
 from mmaudio.eval_utils import (
@@ -36,14 +33,16 @@ from mmaudio.model.utils.features_utils import FeaturesUtils
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-dtype  = torch.bfloat16 if device.type=="cuda" else torch.float32
 mma_cfg: ModelConfig = all_model_cfg["large_44k_v2"]
 mma_cfg.download_if_needed()
 setup_eval_logging()
 net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
 net.load_weights(torch.load(
-    mma_cfg.model_path, map_location=device, weights_only=True
 ))
 feature_utils = FeaturesUtils(
     tod_vae_ckpt=mma_cfg.vae_path,
@@ -58,7 +57,8 @@ seq_cfg: SequenceConfig = mma_cfg.seq_cfg
 @torch.inference_mode()
 def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
     rng = torch.Generator(device=device)
-    if seed >= 0: rng.manual_seed(seed)
     fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
     seq_cfg.duration = duration
     net.update_seq_lengths(
@@ -88,7 +88,8 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
     sync = info.sync_frames.unsqueeze(0)
     rng = torch.Generator(device=device)
-    if seed >= 0: rng.manual_seed(seed)
     fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
     seq_cfg.duration = info.duration_sec
@@ -109,30 +110,31 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
     return out_video
 # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
 REPO_ID = "LTTEAM/Nhep_Mieng"
-ckpt_dir = os.path.join(BASE, "checkpoints")
 os.makedirs(ckpt_dir, exist_ok=True)
 snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
-# 2.1 load U-Net config
-cfg_path = os.path.join(BASE, "LatentSync", "configs", "unet", "second_stage.yaml")
 conf = OmegaConf.load(cfg_path)
-# 2.2 load scheduler from local config, filter out unsupported keys
-sched_path = os.path.join(BASE, "LatentSync", "configs", "scheduler_config.json")
 with open(sched_path, "r") as f:
     sched_cfg = json.load(f)
-valid = inspect.signature(DDIMScheduler.__init__).parameters.keys()
-init_cfg = {k: v for k, v in sched_cfg.items() if k in valid}
 scheduler = DDIMScheduler(**init_cfg)
-# 2.3 load VAE
 vae = AutoencoderKL.from_pretrained(
     "stabilityai/sd-vae-ft-mse",
     torch_dtype=torch.float16 if device.type=="cuda" else torch.float32
 )
-# 2.4 load Whisper audio encoder
 from latentsync.whisper.audio2feature import Audio2Feature
 dim = conf.model.cross_attention_dim
 wp = "small.pt" if dim == 768 else "tiny.pt"
@@ -142,7 +144,7 @@ audio_encoder = Audio2Feature(
     num_frames=conf.data.num_frames
 )
-# 2.5 load UNet3DConditionModel
 from latentsync.models.unet import UNet3DConditionModel
 unet, _ = UNet3DConditionModel.from_pretrained(
     OmegaConf.to_container(conf.model),
@@ -151,7 +153,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
 )
 unet = unet.to(torch.float16) if device.type=="cuda" else unet.to(torch.float32)
-# 2.6 build LipsyncPipeline
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
     vae=vae,

 import uuid
 import tempfile
 import json
 import inspect
 import torch
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
+# ─── 0. Thiết lập Working Directory & PYTHONPATH ────────────────────
+BASE_DIR = os.path.dirname(__file__)
+# Chuyển CWD vào thư mục LatentSync để các đường dẫn relative bên trong LatentSync đúng:
+os.chdir(os.path.join(BASE_DIR, "LatentSync"))
+# Sau khi đã chdir, thêm cả hai thư mục Long_Tieng và LatentSync vào sys.path
+# để Python có thể import mmaudio và latentsync
+sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
+sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
 # ─── 1. MMAUDIO (Long_Tieng) setup ─────────────────────────────────
 from mmaudio.eval_utils import (
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+dtype  = torch.bfloat16 if device.type == "cuda" else torch.float32
 mma_cfg: ModelConfig = all_model_cfg["large_44k_v2"]
 mma_cfg.download_if_needed()
 setup_eval_logging()
 net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
 net.load_weights(torch.load(
+    mma_cfg.model_path,
+    map_location=device,
+    weights_only=True
 ))
 feature_utils = FeaturesUtils(
     tod_vae_ckpt=mma_cfg.vae_path,
 @torch.inference_mode()
 def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
     rng = torch.Generator(device=device)
+    if seed >= 0:
+        rng.manual_seed(seed)
     fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
     seq_cfg.duration = duration
     net.update_seq_lengths(
     sync = info.sync_frames.unsqueeze(0)
     rng = torch.Generator(device=device)
+    if seed >= 0:
+        rng.manual_seed(seed)
     fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
     seq_cfg.duration = info.duration_sec
     return out_video
 # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
+# 2.1 Download checkpoints về local
 REPO_ID = "LTTEAM/Nhep_Mieng"
+ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
 os.makedirs(ckpt_dir, exist_ok=True)
 snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
+# 2.2 Load cấu hình U-Net
+cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
 conf = OmegaConf.load(cfg_path)
+# 2.3 Load scheduler config từ local và lọc bỏ các trường không tương thích
+sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
 with open(sched_path, "r") as f:
     sched_cfg = json.load(f)
+valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
+init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
 scheduler = DDIMScheduler(**init_cfg)
+# 2.4 Load VAE
 vae = AutoencoderKL.from_pretrained(
     "stabilityai/sd-vae-ft-mse",
     torch_dtype=torch.float16 if device.type=="cuda" else torch.float32
 )
+# 2.5 Whisper audio encoder
 from latentsync.whisper.audio2feature import Audio2Feature
 dim = conf.model.cross_attention_dim
 wp = "small.pt" if dim == 768 else "tiny.pt"
     num_frames=conf.data.num_frames
 )
+# 2.6 Load UNet3DConditionModel
 from latentsync.models.unet import UNet3DConditionModel
 unet, _ = UNet3DConditionModel.from_pretrained(
     OmegaConf.to_container(conf.model),
 )
 unet = unet.to(torch.float16) if device.type=="cuda" else unet.to(torch.float32)
+# 2.7 Build LipsyncPipeline
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
     vae=vae,