Update app.py
Browse files
app.py
CHANGED
|
@@ -3,7 +3,6 @@ import sys
|
|
| 3 |
import uuid
|
| 4 |
import tempfile
|
| 5 |
import json
|
| 6 |
-
import shutil
|
| 7 |
import inspect
|
| 8 |
|
| 9 |
import torch
|
|
@@ -12,17 +11,15 @@ from huggingface_hub import snapshot_download
|
|
| 12 |
from omegaconf import OmegaConf
|
| 13 |
from diffusers import AutoencoderKL, DDIMScheduler
|
| 14 |
|
| 15 |
-
# ─── 0.
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
if os.path.exists(src_mask) and not os.path.exists(dst_mask):
|
| 25 |
-
shutil.copy(src_mask, dst_mask)
|
| 26 |
|
| 27 |
# ─── 1. MMAUDIO (Long_Tieng) setup ─────────────────────────────────
|
| 28 |
from mmaudio.eval_utils import (
|
|
@@ -36,14 +33,16 @@ from mmaudio.model.utils.features_utils import FeaturesUtils
|
|
| 36 |
from mmaudio.model.networks import MMAudio, get_my_mmaudio
|
| 37 |
|
| 38 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 39 |
-
dtype = torch.bfloat16 if device.type=="cuda" else torch.float32
|
| 40 |
|
| 41 |
mma_cfg: ModelConfig = all_model_cfg["large_44k_v2"]
|
| 42 |
mma_cfg.download_if_needed()
|
| 43 |
setup_eval_logging()
|
| 44 |
net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
|
| 45 |
net.load_weights(torch.load(
|
| 46 |
-
mma_cfg.model_path,
|
|
|
|
|
|
|
| 47 |
))
|
| 48 |
feature_utils = FeaturesUtils(
|
| 49 |
tod_vae_ckpt=mma_cfg.vae_path,
|
|
@@ -58,7 +57,8 @@ seq_cfg: SequenceConfig = mma_cfg.seq_cfg
|
|
| 58 |
@torch.inference_mode()
|
| 59 |
def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
|
| 60 |
rng = torch.Generator(device=device)
|
| 61 |
-
if seed >= 0:
|
|
|
|
| 62 |
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
| 63 |
seq_cfg.duration = duration
|
| 64 |
net.update_seq_lengths(
|
|
@@ -88,7 +88,8 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
|
|
| 88 |
sync = info.sync_frames.unsqueeze(0)
|
| 89 |
|
| 90 |
rng = torch.Generator(device=device)
|
| 91 |
-
if seed >= 0:
|
|
|
|
| 92 |
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
| 93 |
|
| 94 |
seq_cfg.duration = info.duration_sec
|
|
@@ -109,30 +110,31 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
|
|
| 109 |
return out_video
|
| 110 |
|
| 111 |
# ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
|
|
|
|
| 112 |
REPO_ID = "LTTEAM/Nhep_Mieng"
|
| 113 |
-
ckpt_dir = os.path.join(
|
| 114 |
os.makedirs(ckpt_dir, exist_ok=True)
|
| 115 |
snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
|
| 116 |
|
| 117 |
-
# 2.
|
| 118 |
-
cfg_path = os.path.join(
|
| 119 |
conf = OmegaConf.load(cfg_path)
|
| 120 |
|
| 121 |
-
# 2.
|
| 122 |
-
sched_path = os.path.join(
|
| 123 |
with open(sched_path, "r") as f:
|
| 124 |
sched_cfg = json.load(f)
|
| 125 |
-
|
| 126 |
-
init_cfg = {k: v for k, v in sched_cfg.items() if k in
|
| 127 |
scheduler = DDIMScheduler(**init_cfg)
|
| 128 |
|
| 129 |
-
# 2.
|
| 130 |
vae = AutoencoderKL.from_pretrained(
|
| 131 |
"stabilityai/sd-vae-ft-mse",
|
| 132 |
torch_dtype=torch.float16 if device.type=="cuda" else torch.float32
|
| 133 |
)
|
| 134 |
|
| 135 |
-
# 2.
|
| 136 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 137 |
dim = conf.model.cross_attention_dim
|
| 138 |
wp = "small.pt" if dim == 768 else "tiny.pt"
|
|
@@ -142,7 +144,7 @@ audio_encoder = Audio2Feature(
|
|
| 142 |
num_frames=conf.data.num_frames
|
| 143 |
)
|
| 144 |
|
| 145 |
-
# 2.
|
| 146 |
from latentsync.models.unet import UNet3DConditionModel
|
| 147 |
unet, _ = UNet3DConditionModel.from_pretrained(
|
| 148 |
OmegaConf.to_container(conf.model),
|
|
@@ -151,7 +153,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
|
|
| 151 |
)
|
| 152 |
unet = unet.to(torch.float16) if device.type=="cuda" else unet.to(torch.float32)
|
| 153 |
|
| 154 |
-
# 2.
|
| 155 |
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
|
| 156 |
pipe_sync = LipsyncPipeline(
|
| 157 |
vae=vae,
|
|
|
|
| 3 |
import uuid
|
| 4 |
import tempfile
|
| 5 |
import json
|
|
|
|
| 6 |
import inspect
|
| 7 |
|
| 8 |
import torch
|
|
|
|
| 11 |
from omegaconf import OmegaConf
|
| 12 |
from diffusers import AutoencoderKL, DDIMScheduler
|
| 13 |
|
| 14 |
+
# ─── 0. Thiết lập Working Directory & PYTHONPATH ────────────────────
|
| 15 |
+
BASE_DIR = os.path.dirname(__file__)
|
| 16 |
+
# Chuyển CWD vào thư mục LatentSync để các đường dẫn relative bên trong LatentSync đúng:
|
| 17 |
+
os.chdir(os.path.join(BASE_DIR, "LatentSync"))
|
| 18 |
|
| 19 |
+
# Sau khi đã chdir, thêm cả hai thư mục Long_Tieng và LatentSync vào sys.path
|
| 20 |
+
# để Python có thể import mmaudio và latentsync
|
| 21 |
+
sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
|
| 22 |
+
sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# ─── 1. MMAUDIO (Long_Tieng) setup ─────────────────────────────────
|
| 25 |
from mmaudio.eval_utils import (
|
|
|
|
| 33 |
from mmaudio.model.networks import MMAudio, get_my_mmaudio
|
| 34 |
|
| 35 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 36 |
+
dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
|
| 37 |
|
| 38 |
mma_cfg: ModelConfig = all_model_cfg["large_44k_v2"]
|
| 39 |
mma_cfg.download_if_needed()
|
| 40 |
setup_eval_logging()
|
| 41 |
net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
|
| 42 |
net.load_weights(torch.load(
|
| 43 |
+
mma_cfg.model_path,
|
| 44 |
+
map_location=device,
|
| 45 |
+
weights_only=True
|
| 46 |
))
|
| 47 |
feature_utils = FeaturesUtils(
|
| 48 |
tod_vae_ckpt=mma_cfg.vae_path,
|
|
|
|
| 57 |
@torch.inference_mode()
|
| 58 |
def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
|
| 59 |
rng = torch.Generator(device=device)
|
| 60 |
+
if seed >= 0:
|
| 61 |
+
rng.manual_seed(seed)
|
| 62 |
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
| 63 |
seq_cfg.duration = duration
|
| 64 |
net.update_seq_lengths(
|
|
|
|
| 88 |
sync = info.sync_frames.unsqueeze(0)
|
| 89 |
|
| 90 |
rng = torch.Generator(device=device)
|
| 91 |
+
if seed >= 0:
|
| 92 |
+
rng.manual_seed(seed)
|
| 93 |
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
| 94 |
|
| 95 |
seq_cfg.duration = info.duration_sec
|
|
|
|
| 110 |
return out_video
|
| 111 |
|
| 112 |
# ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
|
| 113 |
+
# 2.1 Download checkpoints về local
|
| 114 |
REPO_ID = "LTTEAM/Nhep_Mieng"
|
| 115 |
+
ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
|
| 116 |
os.makedirs(ckpt_dir, exist_ok=True)
|
| 117 |
snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
|
| 118 |
|
| 119 |
+
# 2.2 Load cấu hình U-Net
|
| 120 |
+
cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
|
| 121 |
conf = OmegaConf.load(cfg_path)
|
| 122 |
|
| 123 |
+
# 2.3 Load scheduler config từ local và lọc bỏ các trường không tương thích
|
| 124 |
+
sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
|
| 125 |
with open(sched_path, "r") as f:
|
| 126 |
sched_cfg = json.load(f)
|
| 127 |
+
valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
|
| 128 |
+
init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
|
| 129 |
scheduler = DDIMScheduler(**init_cfg)
|
| 130 |
|
| 131 |
+
# 2.4 Load VAE
|
| 132 |
vae = AutoencoderKL.from_pretrained(
|
| 133 |
"stabilityai/sd-vae-ft-mse",
|
| 134 |
torch_dtype=torch.float16 if device.type=="cuda" else torch.float32
|
| 135 |
)
|
| 136 |
|
| 137 |
+
# 2.5 Whisper audio encoder
|
| 138 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 139 |
dim = conf.model.cross_attention_dim
|
| 140 |
wp = "small.pt" if dim == 768 else "tiny.pt"
|
|
|
|
| 144 |
num_frames=conf.data.num_frames
|
| 145 |
)
|
| 146 |
|
| 147 |
+
# 2.6 Load UNet3DConditionModel
|
| 148 |
from latentsync.models.unet import UNet3DConditionModel
|
| 149 |
unet, _ = UNet3DConditionModel.from_pretrained(
|
| 150 |
OmegaConf.to_container(conf.model),
|
|
|
|
| 153 |
)
|
| 154 |
unet = unet.to(torch.float16) if device.type=="cuda" else unet.to(torch.float32)
|
| 155 |
|
| 156 |
+
# 2.7 Build LipsyncPipeline
|
| 157 |
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
|
| 158 |
pipe_sync = LipsyncPipeline(
|
| 159 |
vae=vae,
|