Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import uuid
|
|
| 4 |
import tempfile
|
| 5 |
import json
|
| 6 |
import inspect
|
|
|
|
| 7 |
|
| 8 |
import torch
|
| 9 |
import gradio as gr
|
|
@@ -11,12 +12,19 @@ from huggingface_hub import snapshot_download
|
|
| 11 |
from omegaconf import OmegaConf
|
| 12 |
from diffusers import AutoencoderKL, DDIMScheduler
|
| 13 |
|
| 14 |
-
# ─── 0.
|
| 15 |
BASE_DIR = os.path.dirname(__file__)
|
| 16 |
-
|
|
|
|
| 17 |
os.chdir(os.path.join(BASE_DIR, "LatentSync"))
|
| 18 |
|
| 19 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
|
| 21 |
sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
|
| 22 |
|
|
@@ -39,9 +47,7 @@ mma_cfg.download_if_needed()
|
|
| 39 |
setup_eval_logging()
|
| 40 |
net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
|
| 41 |
net.load_weights(torch.load(
|
| 42 |
-
mma_cfg.model_path,
|
| 43 |
-
map_location=device,
|
| 44 |
-
weights_only=True
|
| 45 |
))
|
| 46 |
feature_utils = FeaturesUtils(
|
| 47 |
tod_vae_ckpt=mma_cfg.vae_path,
|
|
@@ -109,17 +115,17 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
|
|
| 109 |
return out_video
|
| 110 |
|
| 111 |
# ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
|
| 112 |
-
# 2.1
|
| 113 |
REPO_ID = "LTTEAM/Nhep_Mieng"
|
| 114 |
ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
|
| 115 |
os.makedirs(ckpt_dir, exist_ok=True)
|
| 116 |
snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
|
| 117 |
|
| 118 |
-
# 2.2
|
| 119 |
cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
|
| 120 |
conf = OmegaConf.load(cfg_path)
|
| 121 |
|
| 122 |
-
# 2.3
|
| 123 |
sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
|
| 124 |
with open(sched_path, "r") as f:
|
| 125 |
sched_cfg = json.load(f)
|
|
@@ -127,7 +133,7 @@ valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
|
|
| 127 |
init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
|
| 128 |
scheduler = DDIMScheduler(**init_cfg)
|
| 129 |
|
| 130 |
-
# 2.4
|
| 131 |
vae = AutoencoderKL.from_pretrained(
|
| 132 |
"stabilityai/sd-vae-ft-mse",
|
| 133 |
torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
|
|
@@ -135,17 +141,17 @@ vae = AutoencoderKL.from_pretrained(
|
|
| 135 |
if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
|
| 136 |
vae.config.shift_factor = 0.0
|
| 137 |
|
| 138 |
-
# 2.5
|
| 139 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 140 |
dim = conf.model.cross_attention_dim
|
| 141 |
-
|
| 142 |
audio_encoder = Audio2Feature(
|
| 143 |
-
model_path=os.path.join(ckpt_dir, "whisper",
|
| 144 |
device=device,
|
| 145 |
num_frames=conf.data.num_frames
|
| 146 |
)
|
| 147 |
|
| 148 |
-
# 2.6
|
| 149 |
from latentsync.models.unet import UNet3DConditionModel
|
| 150 |
unet, _ = UNet3DConditionModel.from_pretrained(
|
| 151 |
OmegaConf.to_container(conf.model),
|
|
@@ -154,7 +160,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
|
|
| 154 |
)
|
| 155 |
unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
|
| 156 |
|
| 157 |
-
# 2.7
|
| 158 |
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
|
| 159 |
pipe_sync = LipsyncPipeline(
|
| 160 |
vae=vae,
|
|
@@ -170,7 +176,6 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
|
|
| 170 |
|
| 171 |
out_id = uuid.uuid4().hex
|
| 172 |
result = f"lipsync_{out_id}.mp4"
|
| 173 |
-
# bắt lỗi face not detected
|
| 174 |
try:
|
| 175 |
pipe_sync(
|
| 176 |
video_path=video_path,
|
|
@@ -186,7 +191,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
|
|
| 186 |
)
|
| 187 |
except RuntimeError as e:
|
| 188 |
if "Face not detected" in str(e):
|
| 189 |
-
raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video rõ ràng.")
|
| 190 |
else:
|
| 191 |
raise
|
| 192 |
return result
|
|
@@ -257,11 +262,11 @@ text_video2video = gr.Interface(
|
|
| 257 |
title="Text + Video → Lip-Sync"
|
| 258 |
)
|
| 259 |
|
| 260 |
-
#
|
| 261 |
demo = gr.TabbedInterface(
|
| 262 |
[text2audio, video2audio, audio2video, text_video2video],
|
| 263 |
["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
|
| 264 |
-
).queue(
|
| 265 |
|
| 266 |
-
|
| 267 |
-
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import json
|
| 6 |
import inspect
|
| 7 |
+
import shutil
|
| 8 |
|
| 9 |
import torch
|
| 10 |
import gradio as gr
|
|
|
|
| 12 |
from omegaconf import OmegaConf
|
| 13 |
from diffusers import AutoencoderKL, DDIMScheduler
|
| 14 |
|
| 15 |
+
# ─── 0. Switch CWD & set up PYTHONPATH ──────────────────────────────
|
| 16 |
# Directory containing this script. Resolved to an absolute path *before* the
# os.chdir() call below: if __file__ were relative (e.g. `python app.py` on
# interpreters older than 3.9), every later os.path.join(BASE_DIR, ...) would be
# re-interpreted against the new working directory and point at the wrong place.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 17 |
+
|
| 18 |
+
# Switch the working directory to LatentSync so the relative paths used inside it resolve correctly
|
| 19 |
os.chdir(os.path.join(BASE_DIR, "LatentSync"))
|
| 20 |
|
| 21 |
+
# Copy mask.png from assets → latentsync/utils if needed
|
| 22 |
+
assets_mask = os.path.join("assets", "mask.png")
|
| 23 |
+
utils_mask = os.path.join("latentsync", "utils", "mask.png")
|
| 24 |
+
if os.path.exists(assets_mask) and not os.path.exists(utils_mask):
|
| 25 |
+
shutil.copy(assets_mask, utils_mask)
|
| 26 |
+
|
| 27 |
+
# Add Long_Tieng and LatentSync to sys.path so their modules can be imported
|
| 28 |
sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
|
| 29 |
sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
|
| 30 |
|
|
|
|
| 47 |
setup_eval_logging()
|
| 48 |
net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
|
| 49 |
net.load_weights(torch.load(
|
| 50 |
+
mma_cfg.model_path, map_location=device, weights_only=True
|
|
|
|
|
|
|
| 51 |
))
|
| 52 |
feature_utils = FeaturesUtils(
|
| 53 |
tod_vae_ckpt=mma_cfg.vae_path,
|
|
|
|
| 115 |
return out_video
|
| 116 |
|
| 117 |
# ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
|
| 118 |
+
# 2.1 Download checkpoints
|
| 119 |
REPO_ID = "LTTEAM/Nhep_Mieng"
|
| 120 |
ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
|
| 121 |
os.makedirs(ckpt_dir, exist_ok=True)
|
| 122 |
snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
|
| 123 |
|
| 124 |
+
# 2.2 Load U-Net config
|
| 125 |
cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
|
| 126 |
conf = OmegaConf.load(cfg_path)
|
| 127 |
|
| 128 |
+
# 2.3 Load scheduler config locally + filter invalid args
|
| 129 |
sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
|
| 130 |
with open(sched_path, "r") as f:
|
| 131 |
sched_cfg = json.load(f)
|
|
|
|
| 133 |
init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
|
| 134 |
scheduler = DDIMScheduler(**init_cfg)
|
| 135 |
|
| 136 |
+
# 2.4 Load VAE and fix missing shift_factor
|
| 137 |
vae = AutoencoderKL.from_pretrained(
|
| 138 |
"stabilityai/sd-vae-ft-mse",
|
| 139 |
torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
|
|
|
|
| 141 |
if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
|
| 142 |
vae.config.shift_factor = 0.0
|
| 143 |
|
| 144 |
+
# 2.5 Whisper audio encoder
|
| 145 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 146 |
dim = conf.model.cross_attention_dim
|
| 147 |
+
wh = "small.pt" if dim == 768 else "tiny.pt"
|
| 148 |
audio_encoder = Audio2Feature(
|
| 149 |
+
model_path=os.path.join(ckpt_dir, "whisper", wh),
|
| 150 |
device=device,
|
| 151 |
num_frames=conf.data.num_frames
|
| 152 |
)
|
| 153 |
|
| 154 |
+
# 2.6 Load UNet3DConditionModel
|
| 155 |
from latentsync.models.unet import UNet3DConditionModel
|
| 156 |
unet, _ = UNet3DConditionModel.from_pretrained(
|
| 157 |
OmegaConf.to_container(conf.model),
|
|
|
|
| 160 |
)
|
| 161 |
unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
|
| 162 |
|
| 163 |
+
# 2.7 Build LipsyncPipeline
|
| 164 |
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
|
| 165 |
pipe_sync = LipsyncPipeline(
|
| 166 |
vae=vae,
|
|
|
|
| 176 |
|
| 177 |
out_id = uuid.uuid4().hex
|
| 178 |
result = f"lipsync_{out_id}.mp4"
|
|
|
|
| 179 |
try:
|
| 180 |
pipe_sync(
|
| 181 |
video_path=video_path,
|
|
|
|
| 191 |
)
|
| 192 |
except RuntimeError as e:
|
| 193 |
if "Face not detected" in str(e):
|
| 194 |
+
raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video có khuôn mặt rõ ràng.")
|
| 195 |
else:
|
| 196 |
raise
|
| 197 |
return result
|
|
|
|
| 262 |
title="Text + Video → Lip-Sync"
|
| 263 |
)
|
| 264 |
|
| 265 |
+
# Create the tabbed interface and enable the queue (default settings)
|
| 266 |
demo = gr.TabbedInterface(
|
| 267 |
[text2audio, video2audio, audio2video, text_video2video],
|
| 268 |
["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
|
| 269 |
+
).queue()
|
| 270 |
|
| 271 |
+
# Launch with share=True
|
| 272 |
+
demo.launch(share=True)
|