Update app.py
Browse files
app.py
CHANGED
|
@@ -11,12 +11,12 @@ from huggingface_hub import snapshot_download
|
|
| 11 |
from omegaconf import OmegaConf
|
| 12 |
from diffusers import AutoencoderKL, DDIMScheduler
|
| 13 |
|
| 14 |
-
# ─── 0.
|
| 15 |
BASE_DIR = os.path.dirname(__file__)
|
| 16 |
-
#
|
| 17 |
os.chdir(os.path.join(BASE_DIR, "LatentSync"))
|
| 18 |
|
| 19 |
-
#
|
| 20 |
sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
|
| 21 |
sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
|
| 22 |
|
|
@@ -109,17 +109,17 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
|
|
| 109 |
return out_video
|
| 110 |
|
| 111 |
# ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
|
| 112 |
-
# 2.1
|
| 113 |
REPO_ID = "LTTEAM/Nhep_Mieng"
|
| 114 |
ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
|
| 115 |
os.makedirs(ckpt_dir, exist_ok=True)
|
| 116 |
snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
|
| 117 |
|
| 118 |
-
# 2.2
|
| 119 |
cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
|
| 120 |
conf = OmegaConf.load(cfg_path)
|
| 121 |
|
| 122 |
-
# 2.3
|
| 123 |
sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
|
| 124 |
with open(sched_path, "r") as f:
|
| 125 |
sched_cfg = json.load(f)
|
|
@@ -127,26 +127,25 @@ valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
|
|
| 127 |
init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
|
| 128 |
scheduler = DDIMScheduler(**init_cfg)
|
| 129 |
|
| 130 |
-
# 2.4
|
| 131 |
vae = AutoencoderKL.from_pretrained(
|
| 132 |
"stabilityai/sd-vae-ft-mse",
|
| 133 |
torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
|
| 134 |
)
|
| 135 |
-
# Một số VAE config thiếu shift_factor => default về 0.0
|
| 136 |
if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
|
| 137 |
vae.config.shift_factor = 0.0
|
| 138 |
|
| 139 |
-
# 2.5 Whisper
|
| 140 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 141 |
dim = conf.model.cross_attention_dim
|
| 142 |
-
|
| 143 |
audio_encoder = Audio2Feature(
|
| 144 |
-
model_path=os.path.join(ckpt_dir, "whisper",
|
| 145 |
device=device,
|
| 146 |
num_frames=conf.data.num_frames
|
| 147 |
)
|
| 148 |
|
| 149 |
-
# 2.6
|
| 150 |
from latentsync.models.unet import UNet3DConditionModel
|
| 151 |
unet, _ = UNet3DConditionModel.from_pretrained(
|
| 152 |
OmegaConf.to_container(conf.model),
|
|
@@ -155,7 +154,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
|
|
| 155 |
)
|
| 156 |
unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
|
| 157 |
|
| 158 |
-
# 2.7
|
| 159 |
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
|
| 160 |
pipe_sync = LipsyncPipeline(
|
| 161 |
vae=vae,
|
|
@@ -171,7 +170,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
|
|
| 171 |
|
| 172 |
out_id = uuid.uuid4().hex
|
| 173 |
result = f"lipsync_{out_id}.mp4"
|
| 174 |
-
|
| 175 |
try:
|
| 176 |
pipe_sync(
|
| 177 |
video_path=video_path,
|
|
@@ -187,7 +186,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, inference_steps):
|
|
| 187 |
)
|
| 188 |
except RuntimeError as e:
|
| 189 |
if "Face not detected" in str(e):
|
| 190 |
-
raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video
|
| 191 |
else:
|
| 192 |
raise
|
| 193 |
return result
|
|
@@ -258,7 +257,11 @@ text_video2video = gr.Interface(
|
|
| 258 |
title="Text + Video → Lip-Sync"
|
| 259 |
)
|
| 260 |
|
| 261 |
-
|
|
|
|
| 262 |
[text2audio, video2audio, audio2video, text_video2video],
|
| 263 |
["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
|
| 264 |
-
).
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from omegaconf import OmegaConf
|
| 12 |
from diffusers import AutoencoderKL, DDIMScheduler
|
| 13 |
|
| 14 |
+
# ─── 0. Thiết lập Working Directory & PYTHONPATH ────────────────────
|
| 15 |
BASE_DIR = os.path.dirname(__file__)
|
| 16 |
+
# 0.1 chuyển CWD vào LatentSync để tất cả đường dẫn relative nội bộ (mask, configs…) đúng
|
| 17 |
os.chdir(os.path.join(BASE_DIR, "LatentSync"))
|
| 18 |
|
| 19 |
+
# 0.2 thêm Long_Tieng và LatentSync vào sys.path để import modules
|
| 20 |
sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
|
| 21 |
sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
|
| 22 |
|
|
|
|
| 109 |
return out_video
|
| 110 |
|
| 111 |
# ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
|
| 112 |
+
# 2.1 tải checkpoints về local
|
| 113 |
REPO_ID = "LTTEAM/Nhep_Mieng"
|
| 114 |
ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
|
| 115 |
os.makedirs(ckpt_dir, exist_ok=True)
|
| 116 |
snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
|
| 117 |
|
| 118 |
+
# 2.2 load U-Net config
|
| 119 |
cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
|
| 120 |
conf = OmegaConf.load(cfg_path)
|
| 121 |
|
| 122 |
+
# 2.3 load scheduler từ config local, lọc bỏ các khóa không hợp lệ
|
| 123 |
sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
|
| 124 |
with open(sched_path, "r") as f:
|
| 125 |
sched_cfg = json.load(f)
|
|
|
|
| 127 |
init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
|
| 128 |
scheduler = DDIMScheduler(**init_cfg)
|
| 129 |
|
| 130 |
+
# 2.4 load VAE và đảm bảo có shift_factor
|
| 131 |
vae = AutoencoderKL.from_pretrained(
|
| 132 |
"stabilityai/sd-vae-ft-mse",
|
| 133 |
torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
|
| 134 |
)
|
|
|
|
| 135 |
if not hasattr(vae.config, "shift_factor") or vae.config.shift_factor is None:
|
| 136 |
vae.config.shift_factor = 0.0
|
| 137 |
|
| 138 |
+
# 2.5 load Whisper encoder
|
| 139 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 140 |
dim = conf.model.cross_attention_dim
|
| 141 |
+
wp = "small.pt" if dim == 768 else "tiny.pt"
|
| 142 |
audio_encoder = Audio2Feature(
|
| 143 |
+
model_path=os.path.join(ckpt_dir, "whisper", wp),
|
| 144 |
device=device,
|
| 145 |
num_frames=conf.data.num_frames
|
| 146 |
)
|
| 147 |
|
| 148 |
+
# 2.6 load UNet3DConditionModel
|
| 149 |
from latentsync.models.unet import UNet3DConditionModel
|
| 150 |
unet, _ = UNet3DConditionModel.from_pretrained(
|
| 151 |
OmegaConf.to_container(conf.model),
|
|
|
|
| 154 |
)
|
| 155 |
unet = unet.to(torch.float16) if device.type == "cuda" else unet.to(torch.float32)
|
| 156 |
|
| 157 |
+
# 2.7 build lipsync pipeline
|
| 158 |
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
|
| 159 |
pipe_sync = LipsyncPipeline(
|
| 160 |
vae=vae,
|
|
|
|
| 170 |
|
| 171 |
out_id = uuid.uuid4().hex
|
| 172 |
result = f"lipsync_{out_id}.mp4"
|
| 173 |
+
# bắt lỗi face not detected
|
| 174 |
try:
|
| 175 |
pipe_sync(
|
| 176 |
video_path=video_path,
|
|
|
|
| 186 |
)
|
| 187 |
except RuntimeError as e:
|
| 188 |
if "Face not detected" in str(e):
|
| 189 |
+
raise ValueError("Không phát hiện khuôn mặt trong video. Vui lòng chọn video rõ ràng.")
|
| 190 |
else:
|
| 191 |
raise
|
| 192 |
return result
|
|
|
|
| 257 |
title="Text + Video → Lip-Sync"
|
| 258 |
)
|
| 259 |
|
| 260 |
+
# Tabbed interface với queue để cho phép chạy lâu (timeout=3600s)
|
| 261 |
+
demo = gr.TabbedInterface(
|
| 262 |
[text2audio, video2audio, audio2video, text_video2video],
|
| 263 |
["Text→Audio","Video→Audio","Audio→LipSync","Text+Video→LipSync"]
|
| 264 |
+
).queue(request_timeout=3600)
|
| 265 |
+
|
| 266 |
+
if __name__ == "__main__":
|
| 267 |
+
demo.launch(share=True)
|