Spaces:
Paused
Paused
| import logging | |
| from datetime import datetime | |
| from pathlib import Path | |
| import sys | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import os | |
# Detect whether the script is running inside Google Colab.
IN_COLAB = "google.colab" in sys.modules

# Pick the compute device automatically together with its precision:
# bfloat16 when CUDA is available, float32 on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    dtype = torch.bfloat16
else:
    device = torch.device("cpu")
    dtype = torch.float32
# Make sure the ``mmaudio`` package is importable; if it is not, install the
# project in the current directory in editable mode and retry the import.
try:
    import mmaudio
except ImportError:
    import importlib
    import subprocess

    # Invoke pip through the interpreter that is running this script (not
    # whichever ``pip`` happens to be first on PATH) and pass an argument list
    # rather than a shell string. check_call raises CalledProcessError when
    # the install fails instead of silently ignoring the exit status.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "."])
    # Let the import system notice packages installed after startup.
    importlib.invalidate_caches()
    import mmaudio
| from mmaudio.eval_utils import ( | |
| ModelConfig, all_model_cfg, generate, load_video, make_video, | |
| setup_eval_logging | |
| ) | |
| from mmaudio.model.flow_matching import FlowMatching | |
| from mmaudio.model.networks import MMAudio, get_my_mmaudio | |
| from mmaudio.model.sequence_config import SequenceConfig | |
| from mmaudio.model.utils.features_utils import FeaturesUtils | |
| import tempfile | |
# Enable TF32 for CUDA matmul and cuDNN (the original note describes this as
# silencing the TF32 warning when needed).
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

# Root logger used by the functions in this file.
log = logging.getLogger()

# Model configuration: select the 'large_44k_v2' entry and call its
# download_if_needed() hook before anything tries to load the checkpoints.
model: ModelConfig = all_model_cfg['large_44k_v2']
model.download_if_needed()

# NOTE(review): output_dir is not referenced anywhere else in the visible code.
output_dir = Path('./output/gradio')

setup_eval_logging()
def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Build the network and feature utilities for the configured model.

    Reads the module-level ``model`` configuration and places both objects on
    the module-level ``device`` / ``dtype`` in eval mode.

    Returns:
        A ``(net, feature_utils, seq_cfg)`` tuple, where ``seq_cfg`` is
        ``model.seq_cfg``.
    """
    sequence_cfg = model.seq_cfg

    # Network: create it by name, move it to the target device/dtype, switch
    # to eval mode, then load the checkpoint weights.
    network: MMAudio = get_my_mmaudio(model.model_name)
    network = network.to(device, dtype).eval()
    state = torch.load(model.model_path, map_location=device, weights_only=True)
    network.load_weights(state)
    log.info(f'Loaded weights from {model.model_path}')

    # Feature utilities, configured from the same model entry.
    extractor = FeaturesUtils(
        tod_vae_ckpt=model.vae_path,
        synchformer_ckpt=model.synchformer_ckpt,
        enable_conditions=True,
        mode=model.mode,
        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
        need_vae_encoder=False,
    )
    extractor = extractor.to(device, dtype).eval()

    return network, extractor, sequence_cfg
# Load everything once at import time; the request handlers below reuse
# these module-level objects.
(net, feature_utils, seq_cfg) = get_model()
@torch.inference_mode()
def video_to_audio(
    video: gr.Video, prompt: str, negative_prompt: str, seed: int,
    num_steps: int, cfg_strength: float, duration: float
):
    """Generate audio for a video and return the path of the resulting video.

    Runs under ``torch.inference_mode()`` so no autograd state is recorded
    while sampling. Updates the module-level ``seq_cfg.duration`` and the
    sequence lengths of the module-level ``net`` on every call.

    Args:
        video: Input video as received from the Gradio video component.
        prompt: Text prompt passed to ``generate``.
        negative_prompt: Negative text prompt passed to ``generate``.
        seed: Seed for the random generator; a negative value asks the
            generator to pick its own seed via ``rng.seed()``.
        num_steps: Number of steps given to ``FlowMatching``.
        cfg_strength: Guidance strength passed to ``generate``.
        duration: Duration in seconds passed to ``load_video``.

    Returns:
        Path to a temporary ``.mp4`` file written by ``make_video``.

    Raises:
        gr.Error: If no video was provided.
    """
    if video is None:
        # Report a readable message in the UI instead of failing inside
        # load_video with an opaque error.
        raise gr.Error('Vui lòng tải lên một video đầu vào.')

    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    video_info = load_video(video, duration)
    # Add a leading batch dimension of size 1.
    clip_frames = video_info.clip_frames.unsqueeze(0)
    sync_frames = video_info.sync_frames.unsqueeze(0)

    # Use the duration reported for the loaded video, then resize the
    # network's sequence lengths to match.
    seq_cfg.duration = video_info.duration_sec
    net.update_seq_lengths(
        seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len
    )

    audios = generate(
        clip_frames, sync_frames, [prompt],
        negative_text=[negative_prompt],
        feature_utils=feature_utils,
        net=net, fm=fm, rng=rng, cfg_strength=cfg_strength
    )
    audio = audios.float().cpu()[0]

    # Reserve a unique output path. The handle is closed right away (rather
    # than left to garbage collection) so the file can be written by path;
    # delete=False keeps the file on disk for Gradio to serve.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp:
        video_save_path = tmp.name
    make_video(video_info, video_save_path, audio,
               sampling_rate=seq_cfg.sampling_rate)
    log.info(f'Saved video to {video_save_path}')
    return video_save_path
@torch.inference_mode()
def text_to_audio(
    prompt: str, negative_prompt: str, seed: int,
    num_steps: int, cfg_strength: float, duration: float
):
    """Generate audio from a text prompt and return the path of the audio file.

    Runs under ``torch.inference_mode()`` so no autograd state is recorded
    while sampling. Updates the module-level ``seq_cfg.duration`` and the
    sequence lengths of the module-level ``net`` on every call.

    Args:
        prompt: Text prompt passed to ``generate``.
        negative_prompt: Negative text prompt passed to ``generate``.
        seed: Seed for the random generator; a negative value asks the
            generator to pick its own seed via ``rng.seed()``.
        num_steps: Number of steps given to ``FlowMatching``.
        cfg_strength: Guidance strength passed to ``generate``.
        duration: Requested audio duration in seconds, stored in
            ``seq_cfg.duration``.

    Returns:
        Path to a temporary ``.flac`` file written with ``torchaudio.save``.
    """
    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    # Apply the requested duration, then resize the network's sequence
    # lengths to match.
    seq_cfg.duration = duration
    net.update_seq_lengths(
        seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len
    )

    # No video conditioning: both frame inputs are None.
    audios = generate(
        None, None, [prompt],
        negative_text=[negative_prompt],
        feature_utils=feature_utils,
        net=net, fm=fm, rng=rng, cfg_strength=cfg_strength
    )
    audio = audios.float().cpu()[0]

    # Reserve a unique output path. The handle is closed right away (rather
    # than left to garbage collection) so the file can be written by path;
    # delete=False keeps the file on disk for Gradio to serve.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.flac') as tmp:
        audio_save_path = tmp.name
    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
    log.info(f'Saved audio to {audio_save_path}')
    return audio_save_path
# Tab: Video -> Audio
video_to_audio_tab = gr.Interface(
    title='LTTEAM - Lồng tiếng từ video',
    description="""
Dự án: Lồng âm thanh cho video.<br>
<b>Tác giả:</b> Lý Trần | <b>Cộng đồng:</b> LTTEAM
""",
    fn=video_to_audio,
    # One component per parameter of video_to_audio, in the same order.
    inputs=[
        gr.Video(label='Video đầu vào'),                      # video
        gr.Text(label='Lời nhắc (Prompt)'),                   # prompt
        gr.Text(label='Lời nhắc tiêu cực', value='music'),    # negative_prompt
        gr.Number(                                            # seed
            label='Seed (–1: ngẫu nhiên)',
            value=-1,
            precision=0,
            minimum=-1,
        ),
        gr.Number(                                            # num_steps
            label='Số bước (Num steps)',
            value=25,
            precision=0,
            minimum=1,
        ),
        gr.Number(                                            # cfg_strength
            label='Độ mạnh hướng dẫn (Guidance Strength)',
            value=4.5,
            minimum=1,
        ),
        gr.Number(                                            # duration
            label='Thời lượng (giây)',
            value=8,
            minimum=1,
        ),
    ],
    outputs=gr.Video(label='Video kết quả'),
    cache_examples=False,
)
# Tab: Text -> Audio
text_to_audio_tab = gr.Interface(
    title='LTTEAM - Lồng tiếng từ video',
    description="""
Dự án: Lồng âm thanh cho video.<br>
<b>Tác giả:</b> Lý Trần | <b>Cộng đồng:</b> LTTEAM
""",
    fn=text_to_audio,
    # One component per parameter of text_to_audio, in the same order.
    inputs=[
        gr.Text(label='Lời nhắc (Prompt)'),                   # prompt
        gr.Text(label='Lời nhắc tiêu cực'),                   # negative_prompt
        gr.Number(                                            # seed
            label='Seed (–1: ngẫu nhiên)',
            value=-1,
            precision=0,
            minimum=-1,
        ),
        gr.Number(                                            # num_steps
            label='Số bước (Num steps)',
            value=25,
            precision=0,
            minimum=1,
        ),
        gr.Number(                                            # cfg_strength
            label='Độ mạnh hướng dẫn (Guidance Strength)',
            value=4.5,
            minimum=1,
        ),
        gr.Number(                                            # duration
            label='Thời lượng (giây)',
            value=8,
            minimum=1,
        ),
    ],
    outputs=gr.Audio(label='Âm thanh kết quả'),
    cache_examples=False,
)
if __name__ == "__main__":
    # Combine both interfaces into a single tabbed app and launch it with
    # share=True.
    tabs = [video_to_audio_tab, text_to_audio_tab]
    tab_titles = ['Video thành Âm thanh', 'Văn bản thành Âm thanh']
    app = gr.TabbedInterface(tabs, tab_titles)
    app.launch(share=True)