Spaces:

LTTEAM
/

Long_Tieng

Paused

App Files Files Community

Long_Tieng / app.py

LTTEAM

Update app.py

a6688a4 verified 6 months ago

raw

history blame contribute delete

6.03 kB

	import logging
	from datetime import datetime
	from pathlib import Path
	import sys

	import gradio as gr
	import torch
	import torchaudio
	import os

	# Detect whether we are running inside Google Colab.
	IN_COLAB = "google.colab" in sys.modules

	# Pick the compute device automatically together with its matching dtype:
	# bfloat16 on a CUDA GPU, float32 on CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	    dtype = torch.bfloat16
	else:
	    device = torch.device("cpu")
	    dtype = torch.float32

	try:
	    import mmaudio
	except ImportError:
	    # mmaudio is not importable yet: install the project in the current
	    # directory in editable mode, then retry the import.
	    import importlib
	    import subprocess

	    # Use the running interpreter's pip (a bare `pip` on PATH may belong to
	    # another environment) and fail loudly if the installation fails.
	    subprocess.run(
	        [sys.executable, "-m", "pip", "install", "-e", "."],
	        check=True,
	    )
	    # Make the import system notice files created while the process is running.
	    importlib.invalidate_caches()
	    import mmaudio

	from mmaudio.eval_utils import (
	ModelConfig, all_model_cfg, generate, load_video, make_video,
	setup_eval_logging
	)
	from mmaudio.model.flow_matching import FlowMatching
	from mmaudio.model.networks import MMAudio, get_my_mmaudio
	from mmaudio.model.sequence_config import SequenceConfig
	from mmaudio.model.utils.features_utils import FeaturesUtils
	import tempfile

	# Allow TF32 for CUDA matmul and cuDNN operations.
	# NOTE(review): the original comment said "turn off the TF32 warning if
	# needed", but these two lines enable TF32.
	torch.backends.cuda.matmul.allow_tf32 = True
	torch.backends.cudnn.allow_tf32 = True

	# Root logger used by the functions in this file.
	log = logging.getLogger()

	# Model configuration: select the 'large_44k_v2' entry and fetch its files
	# if they are not present yet.
	model: ModelConfig = all_model_cfg['large_44k_v2']
	model.download_if_needed()
	# NOTE(review): output_dir is not referenced anywhere else in the visible code.
	output_dir = Path('./output/gradio')
	setup_eval_logging()

	def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
	    """Build the network and feature utilities for the configured model.

	    Returns:
	        A ``(net, feature_utils, seq_cfg)`` tuple: the MMAudio network with
	        weights loaded from ``model.model_path``, the ``FeaturesUtils``
	        instance, and ``model.seq_cfg``. Both modules are moved to the
	        module-level ``device``/``dtype`` and switched to eval mode.
	    """
	    # Create the network, place it on the target device/dtype, then load weights.
	    network: MMAudio = get_my_mmaudio(model.model_name)
	    network = network.to(device, dtype).eval()
	    state = torch.load(model.model_path, map_location=device, weights_only=True)
	    network.load_weights(state)
	    log.info(f'Loaded weights from {model.model_path}')

	    features = FeaturesUtils(
	        tod_vae_ckpt=model.vae_path,
	        synchformer_ckpt=model.synchformer_ckpt,
	        enable_conditions=True,
	        mode=model.mode,
	        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
	        need_vae_encoder=False,
	    )
	    features = features.to(device, dtype).eval()

	    return network, features, model.seq_cfg

	# Load the network, feature utilities and sequence config once at import time;
	# the inference functions in this file use these module-level objects.
	net, feature_utils, seq_cfg = get_model()

	@torch.inference_mode()
	def video_to_audio(
	    video: gr.Video, prompt: str, negative_prompt: str, seed: int,
	    num_steps: int, cfg_strength: float, duration: float
	):
	    """Generate audio for a video and return the path of the resulting video.

	    Updates the module-level ``seq_cfg.duration`` and the network's sequence
	    lengths before generating.

	    Args:
	        video: Value supplied by the ``gr.Video`` input; forwarded unchanged
	            to ``load_video`` (presumably a file path -- confirm).
	        prompt: Text prompt passed to ``generate``.
	        negative_prompt: Negative text prompt passed to ``generate``.
	        seed: A value >= 0 seeds the generator with ``manual_seed``; a
	            negative value lets the generator choose its own seed.
	        num_steps: Number of steps given to ``FlowMatching``.
	        cfg_strength: Guidance strength passed to ``generate``.
	        duration: Duration forwarded to ``load_video``; the sequence config
	            then uses ``video_info.duration_sec``.

	    Returns:
	        Path of a temporary ``.mp4`` file written by ``make_video``. The file
	        is not deleted by this function.
	    """
	    rng = torch.Generator(device=device)
	    if seed >= 0:
	        rng.manual_seed(seed)
	    else:
	        rng.seed()

	    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
	    video_info = load_video(video, duration)
	    # Add a leading dimension of size 1 to both frame tensors.
	    clip_frames = video_info.clip_frames.unsqueeze(0)
	    sync_frames = video_info.sync_frames.unsqueeze(0)
	    seq_cfg.duration = video_info.duration_sec
	    net.update_seq_lengths(
	        seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len
	    )

	    audios = generate(
	        clip_frames, sync_frames, [prompt],
	        negative_text=[negative_prompt],
	        feature_utils=feature_utils,
	        net=net, fm=fm, rng=rng, cfg_strength=cfg_strength
	    )
	    audio = audios.float().cpu()[0]

	    # Only the path is needed: close the handle before make_video writes to
	    # the same path, instead of leaving it open until garbage collection.
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
	        video_save_path = tmp_file.name
	    make_video(video_info, video_save_path, audio,
	               sampling_rate=seq_cfg.sampling_rate)
	    log.info(f'Saved video to {video_save_path}')
	    return video_save_path

	@torch.inference_mode()
	def text_to_audio(
	    prompt: str, negative_prompt: str, seed: int,
	    num_steps: int, cfg_strength: float, duration: float
	):
	    """Generate audio from a text prompt and return the path of the audio file.

	    Updates the module-level ``seq_cfg.duration`` and the network's sequence
	    lengths before generating.

	    Args:
	        prompt: Text prompt passed to ``generate``.
	        negative_prompt: Negative text prompt passed to ``generate``.
	        seed: A value >= 0 seeds the generator with ``manual_seed``; a
	            negative value lets the generator choose its own seed.
	        num_steps: Number of steps given to ``FlowMatching``.
	        cfg_strength: Guidance strength passed to ``generate``.
	        duration: Value assigned to ``seq_cfg.duration`` before generation.

	    Returns:
	        Path of a temporary ``.flac`` file written with ``torchaudio.save``.
	        The file is not deleted by this function.
	    """
	    rng = torch.Generator(device=device)
	    if seed >= 0:
	        rng.manual_seed(seed)
	    else:
	        rng.seed()

	    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
	    seq_cfg.duration = duration
	    net.update_seq_lengths(
	        seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len
	    )

	    # No video conditioning: both frame inputs are None.
	    audios = generate(
	        None, None, [prompt],
	        negative_text=[negative_prompt],
	        feature_utils=feature_utils,
	        net=net, fm=fm, rng=rng, cfg_strength=cfg_strength
	    )
	    audio = audios.float().cpu()[0]

	    # Only the path is needed: close the handle before torchaudio writes to
	    # the same path, instead of leaving it open until garbage collection.
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.flac') as tmp_file:
	        audio_save_path = tmp_file.name
	    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
	    log.info(f'Saved audio to {audio_save_path}')
	    return audio_save_path

	# Video -> Audio tab: wraps video_to_audio; the inputs are listed in the same
	# order as that function's parameters.
	video_to_audio_tab = gr.Interface(
	    fn=video_to_audio,
	    # A plain '|' is used as the separator: the previous '\|' was an invalid
	    # escape sequence in a non-raw string literal.
	    description="""
	Dự án: Lồng âm thanh cho video.<br>
	<b>Tác giả:</b> Lý Trần  |  <b>Cộng đồng:</b> LTTEAM
	""",
	    inputs=[
	        gr.Video(label='Video đầu vào'),
	        gr.Text(label='Lời nhắc (Prompt)'),
	        gr.Text(label='Lời nhắc tiêu cực', value='music'),
	        gr.Number(label='Seed (–1: ngẫu nhiên)', value=-1, precision=0, minimum=-1),
	        gr.Number(label='Số bước (Num steps)', value=25, precision=0, minimum=1),
	        gr.Number(label='Độ mạnh hướng dẫn (Guidance Strength)', value=4.5, minimum=1),
	        gr.Number(label='Thời lượng (giây)', value=8, minimum=1),
	    ],
	    outputs=gr.Video(label='Video kết quả'),
	    cache_examples=False,
	    title='LTTEAM - Lồng tiếng từ video',
	)

	# Text -> Audio tab: wraps text_to_audio; the inputs are listed in the same
	# order as that function's parameters.
	# NOTE(review): the title is identical to the video tab's title
	# ('... từ video'); confirm that this is intended for the text tab.
	text_to_audio_tab = gr.Interface(
	    fn=text_to_audio,
	    # A plain '|' is used as the separator: the previous '\|' was an invalid
	    # escape sequence in a non-raw string literal.
	    description="""
	Dự án: Lồng âm thanh cho video.<br>
	<b>Tác giả:</b> Lý Trần  |  <b>Cộng đồng:</b> LTTEAM
	""",
	    inputs=[
	        gr.Text(label='Lời nhắc (Prompt)'),
	        gr.Text(label='Lời nhắc tiêu cực'),
	        gr.Number(label='Seed (–1: ngẫu nhiên)', value=-1, precision=0, minimum=-1),
	        gr.Number(label='Số bước (Num steps)', value=25, precision=0, minimum=1),
	        gr.Number(label='Độ mạnh hướng dẫn (Guidance Strength)', value=4.5, minimum=1),
	        gr.Number(label='Thời lượng (giây)', value=8, minimum=1),
	    ],
	    outputs=gr.Audio(label='Âm thanh kết quả'),
	    cache_examples=False,
	    title='LTTEAM - Lồng tiếng từ video',
	)

	if __name__ == "__main__":
	    # Combine both interfaces into one tabbed app and launch it with sharing
	    # enabled.
	    # NOTE(review): IN_COLAB is defined in this file but not used here;
	    # share=True is applied unconditionally.
	    tabs = [video_to_audio_tab, text_to_audio_tab]
	    tab_titles = ['Video thành Âm thanh', 'Văn bản thành Âm thanh']
	    app = gr.TabbedInterface(tabs, tab_titles)
	    app.launch(share=True)