LTTEAM committed on
Commit
07d48bb
·
verified ·
1 Parent(s): a7d3d06

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -26
app.py CHANGED
@@ -3,7 +3,6 @@ import sys
3
  import uuid
4
  import tempfile
5
  import json
6
- import shutil
7
  import inspect
8
 
9
  import torch
@@ -12,17 +11,15 @@ from huggingface_hub import snapshot_download
12
  from omegaconf import OmegaConf
13
  from diffusers import AutoencoderKL, DDIMScheduler
14
 
15
- # ─── 0. Thêm Long_Tieng & LatentSync vào PYTHONPATH ────────────────
16
- BASE = os.path.dirname(__file__)
17
- sys.path.insert(0, os.path.join(BASE, "Long_Tieng"))
18
- sys.path.insert(0, os.path.join(BASE, "LatentSync"))
19
 
20
- # ─── 0.1 Copy mask.png vào latentsync/utils ─────────────────────────
21
- src_mask = os.path.join(BASE, "LatentSync", "assets", "mask.png")
22
- dst_utils = os.path.join(BASE, "LatentSync", "latentsync", "utils")
23
- dst_mask = os.path.join(dst_utils, "mask.png")
24
- if os.path.exists(src_mask) and not os.path.exists(dst_mask):
25
- shutil.copy(src_mask, dst_mask)
26
 
27
  # ─── 1. MMAUDIO (Long_Tieng) setup ─────────────────────────────────
28
  from mmaudio.eval_utils import (
@@ -36,14 +33,16 @@ from mmaudio.model.utils.features_utils import FeaturesUtils
36
  from mmaudio.model.networks import MMAudio, get_my_mmaudio
37
 
38
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
39
- dtype = torch.bfloat16 if device.type=="cuda" else torch.float32
40
 
41
  mma_cfg: ModelConfig = all_model_cfg["large_44k_v2"]
42
  mma_cfg.download_if_needed()
43
  setup_eval_logging()
44
  net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
45
  net.load_weights(torch.load(
46
- mma_cfg.model_path, map_location=device, weights_only=True
 
 
47
  ))
48
  feature_utils = FeaturesUtils(
49
  tod_vae_ckpt=mma_cfg.vae_path,
@@ -58,7 +57,8 @@ seq_cfg: SequenceConfig = mma_cfg.seq_cfg
58
  @torch.inference_mode()
59
  def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
60
  rng = torch.Generator(device=device)
61
- if seed >= 0: rng.manual_seed(seed)
 
62
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
63
  seq_cfg.duration = duration
64
  net.update_seq_lengths(
@@ -88,7 +88,8 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
88
  sync = info.sync_frames.unsqueeze(0)
89
 
90
  rng = torch.Generator(device=device)
91
- if seed >= 0: rng.manual_seed(seed)
 
92
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
93
 
94
  seq_cfg.duration = info.duration_sec
@@ -109,30 +110,31 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, dura
109
  return out_video
110
 
111
  # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
 
112
  REPO_ID = "LTTEAM/Nhep_Mieng"
113
- ckpt_dir = os.path.join(BASE, "checkpoints")
114
  os.makedirs(ckpt_dir, exist_ok=True)
115
  snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
116
 
117
- # 2.1 load U-Net config
118
- cfg_path = os.path.join(BASE, "LatentSync", "configs", "unet", "second_stage.yaml")
119
  conf = OmegaConf.load(cfg_path)
120
 
121
- # 2.2 load scheduler from local config, filter out unsupported keys
122
- sched_path = os.path.join(BASE, "LatentSync", "configs", "scheduler_config.json")
123
  with open(sched_path, "r") as f:
124
  sched_cfg = json.load(f)
125
- valid = inspect.signature(DDIMScheduler.__init__).parameters.keys()
126
- init_cfg = {k: v for k, v in sched_cfg.items() if k in valid}
127
  scheduler = DDIMScheduler(**init_cfg)
128
 
129
- # 2.3 load VAE
130
  vae = AutoencoderKL.from_pretrained(
131
  "stabilityai/sd-vae-ft-mse",
132
  torch_dtype=torch.float16 if device.type=="cuda" else torch.float32
133
  )
134
 
135
- # 2.4 load Whisper audio encoder
136
  from latentsync.whisper.audio2feature import Audio2Feature
137
  dim = conf.model.cross_attention_dim
138
  wp = "small.pt" if dim == 768 else "tiny.pt"
@@ -142,7 +144,7 @@ audio_encoder = Audio2Feature(
142
  num_frames=conf.data.num_frames
143
  )
144
 
145
- # 2.5 load UNet3DConditionModel
146
  from latentsync.models.unet import UNet3DConditionModel
147
  unet, _ = UNet3DConditionModel.from_pretrained(
148
  OmegaConf.to_container(conf.model),
@@ -151,7 +153,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
151
  )
152
  unet = unet.to(torch.float16) if device.type=="cuda" else unet.to(torch.float32)
153
 
154
- # 2.6 build LipsyncPipeline
155
  from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
156
  pipe_sync = LipsyncPipeline(
157
  vae=vae,
 
3
  import uuid
4
  import tempfile
5
  import json
 
6
  import inspect
7
 
8
  import torch
 
11
  from omegaconf import OmegaConf
12
  from diffusers import AutoencoderKL, DDIMScheduler
13
 
14
+ # ─── 0. Thiết lập Working Directory & PYTHONPATH ────────────────────
15
+ BASE_DIR = os.path.dirname(__file__)
16
+ # Chuyển CWD vào thư mục LatentSync để các đường dẫn relative bên trong LatentSync đúng:
17
+ os.chdir(os.path.join(BASE_DIR, "LatentSync"))
18
 
19
+ # Sau khi đã chdir, thêm cả hai thư mục Long_Tieng và LatentSync vào sys.path
20
+ # để Python có thể import mmaudio và latentsync
21
+ sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
22
+ sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
 
 
23
 
24
  # ─── 1. MMAUDIO (Long_Tieng) setup ─────────────────────────────────
25
  from mmaudio.eval_utils import (
 
33
  from mmaudio.model.networks import MMAudio, get_my_mmaudio
34
 
35
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
36
+ dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
37
 
38
  mma_cfg: ModelConfig = all_model_cfg["large_44k_v2"]
39
  mma_cfg.download_if_needed()
40
  setup_eval_logging()
41
  net: MMAudio = get_my_mmaudio(mma_cfg.model_name).to(device, dtype).eval()
42
  net.load_weights(torch.load(
43
+ mma_cfg.model_path,
44
+ map_location=device,
45
+ weights_only=True
46
  ))
47
  feature_utils = FeaturesUtils(
48
  tod_vae_ckpt=mma_cfg.vae_path,
 
57
  @torch.inference_mode()
58
  def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
59
  rng = torch.Generator(device=device)
60
+ if seed >= 0:
61
+ rng.manual_seed(seed)
62
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
63
  seq_cfg.duration = duration
64
  net.update_seq_lengths(
 
88
  sync = info.sync_frames.unsqueeze(0)
89
 
90
  rng = torch.Generator(device=device)
91
+ if seed >= 0:
92
+ rng.manual_seed(seed)
93
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
94
 
95
  seq_cfg.duration = info.duration_sec
 
110
  return out_video
111
 
112
  # ─── 2. LATENTSYNC setup ─────────────────────────────────────────────
113
+ # 2.1 Download checkpoints về local
114
  REPO_ID = "LTTEAM/Nhep_Mieng"
115
+ ckpt_dir = os.path.join(BASE_DIR, "checkpoints")
116
  os.makedirs(ckpt_dir, exist_ok=True)
117
  snapshot_download(repo_id=REPO_ID, local_dir=ckpt_dir)
118
 
119
+ # 2.2 Load cấu hình U-Net
120
+ cfg_path = os.path.join(BASE_DIR, "LatentSync", "configs", "unet", "second_stage.yaml")
121
  conf = OmegaConf.load(cfg_path)
122
 
123
+ # 2.3 Load scheduler config từ local và lọc bỏ các trường không tương thích
124
+ sched_path = os.path.join(BASE_DIR, "LatentSync", "configs", "scheduler_config.json")
125
  with open(sched_path, "r") as f:
126
  sched_cfg = json.load(f)
127
+ valid_args = inspect.signature(DDIMScheduler.__init__).parameters.keys()
128
+ init_cfg = {k: v for k, v in sched_cfg.items() if k in valid_args}
129
  scheduler = DDIMScheduler(**init_cfg)
130
 
131
+ # 2.4 Load VAE
132
  vae = AutoencoderKL.from_pretrained(
133
  "stabilityai/sd-vae-ft-mse",
134
  torch_dtype=torch.float16 if device.type=="cuda" else torch.float32
135
  )
136
 
137
+ # 2.5 Whisper audio encoder
138
  from latentsync.whisper.audio2feature import Audio2Feature
139
  dim = conf.model.cross_attention_dim
140
  wp = "small.pt" if dim == 768 else "tiny.pt"
 
144
  num_frames=conf.data.num_frames
145
  )
146
 
147
+ # 2.6 Load UNet3DConditionModel
148
  from latentsync.models.unet import UNet3DConditionModel
149
  unet, _ = UNet3DConditionModel.from_pretrained(
150
  OmegaConf.to_container(conf.model),
 
153
  )
154
  unet = unet.to(torch.float16) if device.type=="cuda" else unet.to(torch.float32)
155
 
156
+ # 2.7 Build LipsyncPipeline
157
  from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
158
  pipe_sync = LipsyncPipeline(
159
  vae=vae,