File size: 6,026 Bytes
8164907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6688a4
8164907
 
 
a6688a4
 
8164907
 
a6688a4
 
 
 
 
 
 
8164907
a6688a4
8164907
a6688a4
8164907
 
a6688a4
8164907
 
a6688a4
 
 
 
8164907
a6688a4
 
 
 
 
 
8164907
a6688a4
8164907
a6688a4
8164907
 
 
 
 
a6688a4
038d4d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import logging
from datetime import datetime
from pathlib import Path
import sys

import gradio as gr
import torch
import torchaudio
import os

# Colab detection: true only when the google.colab module is already loaded
# in this interpreter.
IN_COLAB = "google.colab" in sys.modules

# Pick the compute device and the matching tensor dtype together:
# CUDA runs in bfloat16, CPU falls back to float32.
if torch.cuda.is_available():
    device = torch.device("cuda")
    dtype = torch.bfloat16
else:
    device = torch.device("cpu")
    dtype = torch.float32

try:
    import mmaudio
except ImportError:
    # mmaudio (or one of its dependencies) is not importable yet: install the
    # project in editable mode from the current working directory and retry.
    import importlib
    import subprocess

    # Run pip through the interpreter that is executing this script so the
    # package lands in the same environment, and fail loudly (CalledProcessError)
    # if the installation does not succeed instead of silently continuing.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "."])
    # Make the import system notice files created after interpreter start-up.
    importlib.invalidate_caches()
    import mmaudio

from mmaudio.eval_utils import (
    ModelConfig, all_model_cfg, generate, load_video, make_video,
    setup_eval_logging
)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils
import tempfile

# Allow TF32 for CUDA matmul and cuDNN operations (both flags are switched on).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Root logger (no name is passed to getLogger).
log = logging.getLogger()

# Model configuration: use the 'large_44k_v2' entry of all_model_cfg.
model: ModelConfig = all_model_cfg['large_44k_v2']
# Presumably fetches any missing checkpoint files — see mmaudio.eval_utils.
model.download_if_needed()
# NOTE(review): output_dir is defined here but not referenced by the handlers
# in this file, which write to temporary files instead.
output_dir = Path('./output/gradio')
setup_eval_logging()

def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Build the network and feature extractor for the configured model.

    Reads the module-level ``model`` config, ``device`` and ``dtype``.

    Returns:
        A ``(network, feature_utils, seq_cfg)`` tuple: the MMAudio network with
        its weights loaded, the FeaturesUtils helper (both moved to
        ``device``/``dtype`` and switched to eval mode), and ``model.seq_cfg``.
    """
    sequence_config = model.seq_cfg

    # Instantiate the network by name, then place it on the target
    # device/dtype in eval mode before loading the checkpoint into it.
    network: MMAudio = get_my_mmaudio(model.model_name)
    network = network.to(device, dtype).eval()
    state = torch.load(model.model_path, map_location=device, weights_only=True)
    network.load_weights(state)
    log.info(f'Loaded weights from {model.model_path}')

    # NOTE(review): the 16k BigVGAN path is passed although the selected
    # config key is 'large_44k_v2' — confirm it is unused/None for 44k models.
    extractor_options = dict(
        tod_vae_ckpt=model.vae_path,
        synchformer_ckpt=model.synchformer_ckpt,
        enable_conditions=True,
        mode=model.mode,
        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
        need_vae_encoder=False,
    )
    extractor = FeaturesUtils(**extractor_options)
    extractor = extractor.to(device, dtype).eval()

    return network, extractor, sequence_config

# Build the model once at import time; the handlers below use these
# module-level objects on every request.
net, feature_utils, seq_cfg = get_model()

@torch.inference_mode()
def video_to_audio(
    video: gr.Video, prompt: str, negative_prompt: str, seed: int,
    num_steps: int, cfg_strength: float, duration: float
):
    """Generate audio for a video and return the path of the resulting video.

    Args:
        video: Input video as delivered by the Gradio video input; forwarded
            unchanged to ``load_video``.
        prompt: Text prompt passed to ``generate``.
        negative_prompt: Negative text prompt passed to ``generate``.
        seed: Non-negative value seeds the generator deterministically; a
            negative value (or an empty field, i.e. ``None``) draws a random seed.
        num_steps: Number of steps given to the Euler ``FlowMatching`` sampler.
        cfg_strength: Guidance strength passed to ``generate``.
        duration: Requested duration forwarded to ``load_video``; the length
            actually used is ``video_info.duration_sec``.

    Returns:
        Path of a temporary ``.mp4`` file written by ``make_video``.

    Raises:
        gr.Error: If no input video was provided.
    """
    # Submitting the form without a video would otherwise fail inside
    # load_video; report it to the UI instead.
    if video is None:
        raise gr.Error('Vui lòng tải lên video đầu vào.')

    rng = torch.Generator(device=device)
    # An emptied seed field arrives as None; treat it like "random".
    if seed is not None and seed >= 0:
        rng.manual_seed(int(seed))
    else:
        rng.seed()

    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
    video_info = load_video(video, duration)
    # Add a leading batch dimension (batch of one).
    clip_frames = video_info.clip_frames.unsqueeze(0)
    sync_frames = video_info.sync_frames.unsqueeze(0)
    # NOTE: seq_cfg and net are module-level objects shared with
    # text_to_audio; they are reconfigured here on every request.
    seq_cfg.duration = video_info.duration_sec
    net.update_seq_lengths(
        seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len
    )

    audios = generate(
        clip_frames, sync_frames, [prompt],
        negative_text=[negative_prompt],
        feature_utils=feature_utils,
        net=net, fm=fm, rng=rng, cfg_strength=cfg_strength
    )
    # Single prompt in, so take the first item, as float32 on the CPU.
    audio = audios.float().cpu()[0]

    # Reserve a temp path and close the handle right away so make_video can
    # write to it; delete=False keeps the file for Gradio to serve.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp:
        video_save_path = tmp.name
    make_video(video_info, video_save_path, audio,
               sampling_rate=seq_cfg.sampling_rate)
    log.info(f'Saved video to {video_save_path}')
    return video_save_path

@torch.inference_mode()
def text_to_audio(
    prompt: str, negative_prompt: str, seed: int,
    num_steps: int, cfg_strength: float, duration: float
):
    """Generate audio from a text prompt and return the path of the audio file.

    Args:
        prompt: Text prompt passed to ``generate``.
        negative_prompt: Negative text prompt passed to ``generate``.
        seed: Non-negative value seeds the generator deterministically; a
            negative value (or an empty field, i.e. ``None``) draws a random seed.
        num_steps: Number of steps given to the Euler ``FlowMatching`` sampler.
        cfg_strength: Guidance strength passed to ``generate``.
        duration: Value assigned to ``seq_cfg.duration`` before generation.

    Returns:
        Path of a temporary ``.flac`` file written with ``torchaudio.save``.
    """
    rng = torch.Generator(device=device)
    # An emptied seed field arrives as None; treat it like "random".
    if seed is not None and seed >= 0:
        rng.manual_seed(int(seed))
    else:
        rng.seed()

    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
    # NOTE: seq_cfg and net are module-level objects shared with
    # video_to_audio; they are reconfigured here on every request.
    seq_cfg.duration = duration
    net.update_seq_lengths(
        seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len
    )

    # No video features: both frame inputs are None.
    audios = generate(
        None, None, [prompt],
        negative_text=[negative_prompt],
        feature_utils=feature_utils,
        net=net, fm=fm, rng=rng, cfg_strength=cfg_strength
    )
    # Single prompt in, so take the first item, as float32 on the CPU.
    audio = audios.float().cpu()[0]

    # Reserve a temp path and close the handle right away so torchaudio can
    # write to it; delete=False keeps the file for Gradio to serve.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.flac') as tmp:
        audio_save_path = tmp.name
    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
    log.info(f'Saved audio to {audio_save_path}')
    return audio_save_path

# "Video -> Audio" tab. The input widgets are listed in the same order as
# the parameters of video_to_audio.
video_to_audio_tab = gr.Interface(
    title='LTTEAM - Lồng tiếng từ video',
    description="""
     Dự án: Lồng âm thanh cho video.<br>
    <b>Tác giả:</b> Lý Trần &nbsp;|&nbsp; <b>Cộng đồng:</b> LTTEAM
    """,
    fn=video_to_audio,
    inputs=[
        gr.Video(label='Video đầu vào'),
        gr.Text(label='Lời nhắc (Prompt)'),
        gr.Text(label='Lời nhắc tiêu cực', value='music'),
        gr.Number(
            label='Seed (–1: ngẫu nhiên)',
            value=-1,
            minimum=-1,
            precision=0,
        ),
        gr.Number(
            label='Số bước (Num steps)',
            value=25,
            minimum=1,
            precision=0,
        ),
        gr.Number(
            label='Độ mạnh hướng dẫn (Guidance Strength)',
            value=4.5,
            minimum=1,
        ),
        gr.Number(
            label='Thời lượng (giây)',
            value=8,
            minimum=1,
        ),
    ],
    outputs=gr.Video(label='Video kết quả'),
    cache_examples=False,
)

# "Text -> Audio" tab. The input widgets are listed in the same order as
# the parameters of text_to_audio.
text_to_audio_tab = gr.Interface(
    title='LTTEAM - Lồng tiếng từ video',
    description="""
    Dự án: Lồng âm thanh cho video.<br>
    <b>Tác giả:</b> Lý Trần &nbsp;|&nbsp; <b>Cộng đồng:</b> LTTEAM
    """,
    fn=text_to_audio,
    inputs=[
        gr.Text(label='Lời nhắc (Prompt)'),
        gr.Text(label='Lời nhắc tiêu cực'),
        gr.Number(
            label='Seed (–1: ngẫu nhiên)',
            value=-1,
            minimum=-1,
            precision=0,
        ),
        gr.Number(
            label='Số bước (Num steps)',
            value=25,
            minimum=1,
            precision=0,
        ),
        gr.Number(
            label='Độ mạnh hướng dẫn (Guidance Strength)',
            value=4.5,
            minimum=1,
        ),
        gr.Number(
            label='Thời lượng (giây)',
            value=8,
            minimum=1,
        ),
    ],
    outputs=gr.Audio(label='Âm thanh kết quả'),
    cache_examples=False,
)

if __name__ == "__main__":
    # Combine both interfaces into one tabbed app and start it;
    # share=True is passed through to launch().
    tabs = [video_to_audio_tab, text_to_audio_tab]
    tab_titles = ['Video thành Âm thanh', 'Văn bản thành Âm thanh']
    demo = gr.TabbedInterface(tabs, tab_titles)
    demo.launch(share=True)