LTTEAM committed · Commit a9e5a39 · verified · 1 Parent(s): 185f811

Update app.py

Files changed (1):
  1. app.py +24 -42
app.py CHANGED
@@ -1,20 +1,19 @@
-
 import os
 import sys
 import uuid
 import tempfile
-
 import torch
 import gradio as gr
 from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from diffusers import AutoencoderKL, DDIMScheduler
 
-# ——————————————————————————
-# Let Python "see" the two subdirectories
+# -------------------------------------------------------------------
+# Add paths so Python can find the packages in Long_Tieng and LatentSync
 BASE_DIR = os.path.dirname(__file__)
 sys.path.insert(0, os.path.join(BASE_DIR, "Long_Tieng"))
 sys.path.insert(0, os.path.join(BASE_DIR, "LatentSync"))
+# -------------------------------------------------------------------
 
 # === MMAUDIO (Long_Tieng) setup ===
 from mmaudio.eval_utils import (
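Note: `sys.path.insert(0, ...)` prepends, so the vendored `Long_Tieng` and `LatentSync` trees shadow any installed packages with the same import names. The pattern in isolation:

```python
import os
import sys

BASE_DIR = os.path.dirname(__file__)
# Prepend so the vendored copies win over site-packages at import time.
for sub in ("Long_Tieng", "LatentSync"):
    sys.path.insert(0, os.path.join(BASE_DIR, sub))
```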
@@ -28,16 +27,14 @@ from mmaudio.model.utils.features_utils import FeaturesUtils
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-dtype = torch.bfloat16 if device.type=="cuda" else torch.float32
+dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
 
-# Load MMAudio model
-model: ModelConfig = all_model_cfg["large_44k_v2"]
+# Load mmaudio model
+model: ModelConfig = all_model_cfg['large_44k_v2']
 model.download_if_needed()
 setup_eval_logging()
 net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
-net.load_weights(
-    torch.load(model.model_path, map_location=device, weights_only=True)
-)
+net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
 feature_utils = FeaturesUtils(
     tod_vae_ckpt=model.vae_path,
     synchformer_ckpt=model.synchformer_ckpt,
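Note: the loading block pairs device-dependent dtype selection with `weights_only=True`, which restricts `torch.load` to plain tensors and containers rather than arbitrary pickled objects. The same pattern in isolation (the checkpoint path is a placeholder, not from this repo):

```python
import torch

# bfloat16 is only worthwhile on CUDA; CPU inference stays in float32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if device.type == "cuda" else torch.float32

# weights_only=True refuses pickled code objects in the checkpoint;
# "model.ckpt" is a placeholder path.
state_dict = torch.load("model.ckpt", map_location=device, weights_only=True)
```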
@@ -51,11 +48,8 @@ seq_cfg: SequenceConfig = model.seq_cfg
 @torch.inference_mode()
 def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
     rng = torch.Generator(device=device)
-    if seed >= 0:
-        rng.manual_seed(seed)
-    else:
-        rng.seed()
-    fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
+    if seed >= 0: rng.manual_seed(seed)
+    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
     seq_cfg.duration = duration
     net.update_seq_lengths(
         seq_cfg.latent_seq_len,
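Note: the rewrite drops the `else: rng.seed()` branch. A fresh `torch.Generator` starts from a fixed default seed rather than system entropy, so with a negative seed every run may now sample identically instead of varying. A helper that preserves the old explicit behavior (the `make_rng` name is hypothetical):

```python
import torch

def make_rng(seed: int, device: torch.device) -> torch.Generator:
    # Fixed seed for reproducible runs; otherwise reseed from system
    # entropy, matching the removed else-branch.
    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    return rng
```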
@@ -76,7 +70,6 @@ def text_to_audio_fn(prompt, neg_prompt, seed, num_steps, guidance, duration):
 
 @torch.inference_mode()
 def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, duration):
-    # From Long_Tieng eval_utils
     from mmaudio.eval_utils import load_video, make_video
     from mmaudio.model.flow_matching import FlowMatching
 
@@ -85,11 +78,8 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, duration):
     sync = video_info.sync_frames.unsqueeze(0)
 
     rng = torch.Generator(device=device)
-    if seed >= 0:
-        rng.manual_seed(seed)
-    else:
-        rng.seed()
-    fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
+    if seed >= 0: rng.manual_seed(seed)
+    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
 
     seq_cfg.duration = video_info.duration_sec
     net.update_seq_lengths(
@@ -104,13 +94,13 @@ def video_to_audio_fn(video, prompt, neg_prompt, seed, num_steps, guidance, duration):
         net=net, fm=fm, rng=rng, cfg_strength=guidance
     )
     audio = audios.float().cpu()[0]
-    out_vid = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-    make_video(video_info, out_vid, audio, sampling_rate=seq_cfg.sampling_rate)
-    return out_vid
+    video_out = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+    make_video(video_info, video_out, audio, sampling_rate=seq_cfg.sampling_rate)
+    return video_out
 
 # === LATENTSYNC setup ===
 REPO_ID = "LTTEAM/Nhep_Mieng"
-snapshot_download(repo_id=REPO_ID, local_dir="checkpoints", allow_patterns=["*.pt"])
+snapshot_download(repo_id=REPO_ID, local_dir="checkpoints")
 
 conf = OmegaConf.load("configs/unet/second_stage.yaml")
 vae = AutoencoderKL.from_pretrained(
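Note: dropping `allow_patterns=["*.pt"]` makes `snapshot_download` pull the entire repository snapshot rather than only the `.pt` checkpoints, presumably so that non-checkpoint assets in the repo are fetched as well. The filtered form, for reference:

```python
from huggingface_hub import snapshot_download

# Fetch only the .pt checkpoint files into ./checkpoints; without
# allow_patterns the whole snapshot is downloaded.
snapshot_download(
    repo_id="LTTEAM/Nhep_Mieng",
    local_dir="checkpoints",
    allow_patterns=["*.pt"],
)
```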
@@ -138,11 +128,7 @@ unet, _ = UNet3DConditionModel.from_pretrained(
     "checkpoints/latentsync_unet.pt",
     device=device
 )
-unet = (
-    unet.to(dtype=torch.float16)
-    if device=="cuda" else
-    unet.to(dtype=torch.float32)
-)
+unet = unet.to(dtype=torch.float16) if device=="cuda" else unet.to(dtype=torch.float32)
 
 from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
 pipe_sync = LipsyncPipeline(
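Note: both the old and new expressions compare a `torch.device` against the string "cuda". Depending on the PyTorch version, that comparison can evaluate to False even on a GPU, silently keeping the UNet in float32; `device.type == "cuda"`, as the mmaudio block above already uses, is unambiguous:

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# device == "cuda" compares a torch.device with a str and may fail on
# some PyTorch versions; .type is always the plain string "cuda"/"cpu".
unet_dtype = torch.float16 if device.type == "cuda" else torch.float32
```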
@@ -153,13 +139,9 @@ pipe_sync = LipsyncPipeline(
 ).to(device)
 
 def lipsync_fn(video_path, audio_path, seed, num_frames, steps):
-    # From LatentSync pipelines
     from accelerate.utils import set_seed
     if seed >= 0:
         set_seed(seed)
-    else:
-        torch.seed()
-
     out_id = uuid.uuid4().hex
     out_path = f"out_{out_id}.mp4"
     pipe_sync(
@@ -176,7 +158,7 @@ def lipsync_fn(video_path, audio_path, seed, num_frames, steps):
     )
     return out_path
 
-# === Gradio UI ===
+# === BUILD GRADIO UI ===
 text2audio = gr.Interface(
     fn=text_to_audio_fn,
     inputs=[
@@ -185,7 +167,7 @@ text2audio = gr.Interface(
         gr.Number(label="Seed", value=-1, precision=0),
         gr.Number(label="Num Steps", value=25, precision=0),
         gr.Number(label="Guidance Strength", value=4.5),
-        gr.Number(label="Duration (s)", value=8)
+        gr.Number(label="Duration (s)", value=8),
     ],
     outputs=gr.Audio(label="Generated Audio"),
     title="Text → Audio"
@@ -200,9 +182,9 @@ video2audio = gr.Interface(
         gr.Number(label="Seed", value=-1, precision=0),
         gr.Number(label="Num Steps", value=25, precision=0),
         gr.Number(label="Guidance Strength", value=4.5),
-        gr.Number(label="Duration (s)", value=8)
+        gr.Number(label="Duration (s)", value=8),
     ],
-    outputs=gr.Video(label="Video + Audio"),
+    outputs=gr.Video(label="Video with Audio"),
     title="Video → Audio"
 )
 
@@ -213,7 +195,7 @@ audio2video = gr.Interface(
         gr.Audio(label="Input Audio", type="filepath"),
         gr.Number(label="Seed", value=-1, precision=0),
         gr.Number(label="Num Frames", value=16, precision=0),
-        gr.Number(label="Inference Steps", value=50, precision=0)
+        gr.Number(label="Inference Steps", value=50, precision=0),
     ],
     outputs=gr.Video(label="Lip-Synced Video"),
     title="Audio → Lip-Sync"
@@ -221,8 +203,8 @@ audio2video = gr.Interface(
 
 text_video2video = gr.Interface(
     fn=lambda p,np,sd,ns,gs,du,vid,nf,st: (
-        text_to_audio_fn(p, np, sd, ns, gs, du),
-        lipsync_fn(vid, text_to_audio_fn(p, np, sd, ns, gs, du), sd, nf, st)
+        text_to_audio_fn(p,np,sd,ns,gs,du),
+        lipsync_fn(vid, text_to_audio_fn(p,np,sd,ns,gs,du), sd, nf, st)
     ),
     inputs=[
         gr.Textbox(label="Prompt"),
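Note: the combined endpoint still calls `text_to_audio_fn` twice, once for the preview output and once to feed `lipsync_fn`, doubling generation time; with a negative seed the two results may even differ. A sketch that generates once (assuming `text_to_audio_fn` returns a file path that `lipsync_fn` accepts):

```python
def text_video_to_video(p, np, sd, ns, gs, du, vid, nf, st):
    # Generate the audio once and reuse it for both outputs.
    audio = text_to_audio_fn(p, np, sd, ns, gs, du)
    return audio, lipsync_fn(vid, audio, sd, nf, st)
```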
@@ -233,7 +215,7 @@ text_video2video = gr.Interface(
         gr.Number(label="Duration (s)", value=8),
         gr.Video(label="Input Video"),
         gr.Number(label="Num Frames", value=16, precision=0),
-        gr.Number(label="Inference Steps", value=50, precision=0)
+        gr.Number(label="Inference Steps", value=50, precision=0),
     ],
     outputs=[gr.Audio(label="Synth Audio"), gr.Video(label="Lip-Synced Video")],
     title="Text + Video → Lip-Sync"
 
 