# testSpeech / app_gradio.py
# Uploaded to the Hugging Face Hub via huggingface_hub (commit e3c2b9c).
import gradio as gr
import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
import librosa
import argparse
import librosa.display
import matplotlib.pyplot as plt
import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
#from model_old_mel_style import SynthesizerTrn
from models_mel_style import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from mel_processing import spectrogram_torch, spec_to_mel_torch
from scipy.io.wavfile import write
# Directory holding the reference .wav files
AUDIO_DIR = "wav/wav_1"


def list_wav_files():
    """Return the names of all ``.wav`` files present in ``AUDIO_DIR``."""
    names = []
    for entry in os.listdir(AUDIO_DIR):
        if entry.endswith(".wav"):
            names.append(entry)
    return names
# Resolve a selected file name to its full path inside the audio directory.
def get_audio_file(file_name):
    """Return the path of *file_name* joined onto ``AUDIO_DIR``."""
    return os.path.join(AUDIO_DIR, file_name)
def get_text(text, hps):
    """Convert raw text to a ``LongTensor`` of symbol IDs.

    Applies the text cleaners configured in *hps*; when ``add_blank`` is
    set, a blank token (0) is interleaved between every symbol.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
# Generate speech with the model, conditioned on a reference audio file.
def generate_voice(prompt_text, ref_audio_filename):
    """Synthesize speech for *prompt_text* in the style of a reference clip.

    Args:
        prompt_text: Text to synthesize.
        ref_audio_filename: Name of a ``.wav`` file inside ``wav/wav_1``
            whose mel spectrogram conditions the generated voice.

    Returns:
        Path of the generated ``.wav`` file under ``infer_result/``.
    """
    # NOTE(review): the config and checkpoint are reloaded on every call;
    # caching the model at module level would speed up repeated requests.
    class Args:
        checkpoint_path = "logs/large_audio/G_504000.pth"
        config = "configs/vn_base.json"
        save_path = "infer_result/"
        ref_audio = os.path.join("wav/wav_1", ref_audio_filename)
        text = prompt_text
    args = Args()

    hps = utils.get_hparams_from_file(args.config)
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=0,
        **hps.model,
    )
    net_g.eval()
    utils.load_checkpoint(args.checkpoint_path, net_g, None)

    # Reference audio -> linear spectrogram -> mel, used as the style prompt.
    audio, _ = librosa.load(args.ref_audio, sr=hps.data.sampling_rate)
    audio = torch.from_numpy(audio).unsqueeze(0)
    spec = spectrogram_torch(audio, hps.data.filter_length, hps.data.sampling_rate,
                             hps.data.hop_length, hps.data.win_length, center=False)
    spec = torch.squeeze(spec, 0)
    mel = spec_to_mel_torch(spec, hps.data.filter_length, hps.data.n_mel_channels,
                            hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax)

    stn_tst = get_text(args.text, hps)
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])

    with torch.no_grad():
        audio_gen = net_g.infer(x_tst, x_tst_lengths, mel.unsqueeze(0),
                                sid=None, noise_scale=0.1,
                                noise_scale_w=0.1, length_scale=1.1)[0][0, 0].data.cpu().float().numpy()

    os.makedirs(args.save_path, exist_ok=True)
    # Pick the first unused index instead of trusting the raw file count,
    # which can collide with an existing result after deletions.
    index = len(os.listdir(args.save_path))
    output_file = os.path.join(args.save_path, f'test_{index}.wav')
    while os.path.exists(output_file):
        index += 1
        output_file = os.path.join(args.save_path, f'test_{index}.wav')
    write(output_file, hps.data.sampling_rate, audio_gen)
    return output_file
# with gr.Blocks() as demo:
# gr.Markdown("<center># <h1>Demo Model Text to Speech</h1></center>")
# prompt = gr.Textbox(label="Prompt", placeholder="Type something here...")
# wav_files = sorted(list_wav_files())
# if not wav_files:
# gr.Markdown("⚠️ Không tìm thấy file .wav nào trong thư mục!")
# gr.Markdown("## 🎧 Chọn và nghe file âm thanh gốc")
# with gr.Row():
# file_dropdown = gr.Dropdown(choices=wav_files, label="Chọn file WAV")
# audio_output = gr.Audio(type="filepath", label="Nghe tại đây")
# file_dropdown.change(fn=get_audio_file, inputs=file_dropdown, outputs=audio_output)
# generate_button = gr.Button("Generate Voice")
# generated_audio_output = gr.Audio(type="filepath", label="🔊 Kết quả sinh giọng nói")
# generate_button.click(fn=generate_voice, inputs=[prompt, file_dropdown], outputs=generated_audio_output)
with gr.Blocks() as demo:
    gr.Markdown("<center># <h1>Demo Model Text to Speech</h1></center>")
    prompt = gr.Textbox(label="Prompt", placeholder="Type something here...")
    gr.Markdown("## 🎧 Chọn hoặc ghi âm giọng nói tham chiếu")

    with gr.Tab("📁 Chọn từ file"):
        wav_files = sorted(list_wav_files())
        file_dropdown = gr.Dropdown(choices=wav_files, label="Chọn file WAV có sẵn")
        audio_output = gr.Audio(type="filepath", label="Nghe tại đây")
        file_dropdown.change(fn=get_audio_file, inputs=file_dropdown, outputs=audio_output)

    with gr.Tab("🎙️ Ghi âm mới"):
        recorded_audio = gr.Audio(label="Ghi âm hoặc chọn file", type="filepath")

    # Button that triggers speech generation
    generate_button = gr.Button("Generate Voice")
    generated_audio_output = gr.Audio(type="filepath", label="🔊 Kết quả sinh giọng nói")

    def process_inputs(prompt_text, file_choice, recorded_path):
        """Choose the reference audio (a new recording wins over the
        dropdown selection), persist it under AUDIO_DIR, and synthesize.

        Raises:
            gr.Error: if neither a recording nor a dropdown choice is given.
        """
        if recorded_path is not None:
            import shutil
            os.makedirs(AUDIO_DIR, exist_ok=True)
            # Pick a name that does not collide with an existing recording;
            # the raw directory count can repeat after deletions.
            index = len(os.listdir(AUDIO_DIR))
            filename = f"user_recording_{index}.wav"
            while os.path.exists(os.path.join(AUDIO_DIR, filename)):
                index += 1
                filename = f"user_recording_{index}.wav"
            saved_path = os.path.join(AUDIO_DIR, filename)
            # shutil.move works across filesystems; os.rename raises OSError
            # when Gradio's temp dir lives on a different mount.
            shutil.move(recorded_path, saved_path)
            ref_file = filename
        elif file_choice:
            ref_file = file_choice
        else:
            raise gr.Error("Bạn cần chọn hoặc ghi âm một file giọng nói.")
        return generate_voice(prompt_text, ref_file)

    generate_button.click(
        fn=process_inputs,
        inputs=[prompt, file_dropdown, recorded_audio],
        outputs=generated_audio_output
    )

demo.launch()