# testSpeech / app_gradio.py
# Uploaded to the Hugging Face Hub via huggingface_hub (commit e3c2b9c).
import gradio as gr
import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
import librosa
import argparse
import librosa.display
import matplotlib.pyplot as plt
import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
#from model_old_mel_style import SynthesizerTrn
from models_mel_style import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from mel_processing import spectrogram_torch, spec_to_mel_torch
from scipy.io.wavfile import write
# Directory holding the reference .wav files
AUDIO_DIR = "wav/wav_1"


def list_wav_files():
    """Return the names of all ``.wav`` files present in ``AUDIO_DIR``."""
    names = []
    for entry in os.listdir(AUDIO_DIR):
        if entry.endswith(".wav"):
            names.append(entry)
    return names
# Resolve a selected file name to its full path inside the audio directory.
def get_audio_file(file_name):
    """Return the path of *file_name* joined onto ``AUDIO_DIR``."""
    return os.path.join(AUDIO_DIR, file_name)
def get_text(text, hps):
    """Convert raw text to a ``LongTensor`` of symbol IDs.

    Applies the text cleaners configured in *hps*; when ``add_blank`` is
    set, a blank token (0) is interleaved between every symbol.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
# Generate speech with the model, conditioned on a reference audio file.
def generate_voice(prompt_text, ref_audio_filename):
    """Synthesize speech for *prompt_text* in the style of a reference clip.

    Args:
        prompt_text: Text to synthesize.
        ref_audio_filename: Name of a ``.wav`` file inside ``wav/wav_1``
            whose mel spectrogram conditions the generated voice.

    Returns:
        Path of the generated ``.wav`` file under ``infer_result/``.
    """
    # NOTE(review): the config and checkpoint are reloaded on every call;
    # caching the model at module level would speed up repeated requests.
    class Args:
        checkpoint_path = "logs/large_audio/G_504000.pth"
        config = "configs/vn_base.json"
        save_path = "infer_result/"
        ref_audio = os.path.join("wav/wav_1", ref_audio_filename)
        text = prompt_text
    args = Args()

    hps = utils.get_hparams_from_file(args.config)
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=0,
        **hps.model,
    )
    net_g.eval()
    utils.load_checkpoint(args.checkpoint_path, net_g, None)

    # Reference audio -> linear spectrogram -> mel, used as the style prompt.
    audio, _ = librosa.load(args.ref_audio, sr=hps.data.sampling_rate)
    audio = torch.from_numpy(audio).unsqueeze(0)
    spec = spectrogram_torch(audio, hps.data.filter_length, hps.data.sampling_rate,
                             hps.data.hop_length, hps.data.win_length, center=False)
    spec = torch.squeeze(spec, 0)
    mel = spec_to_mel_torch(spec, hps.data.filter_length, hps.data.n_mel_channels,
                            hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax)

    stn_tst = get_text(args.text, hps)
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])

    with torch.no_grad():
        audio_gen = net_g.infer(x_tst, x_tst_lengths, mel.unsqueeze(0),
                                sid=None, noise_scale=0.1,
                                noise_scale_w=0.1, length_scale=1.1)[0][0, 0].data.cpu().float().numpy()

    os.makedirs(args.save_path, exist_ok=True)
    # Pick the first unused index instead of trusting the raw file count,
    # which can collide with an existing result after deletions.
    index = len(os.listdir(args.save_path))
    output_file = os.path.join(args.save_path, f'test_{index}.wav')
    while os.path.exists(output_file):
        index += 1
        output_file = os.path.join(args.save_path, f'test_{index}.wav')
    write(output_file, hps.data.sampling_rate, audio_gen)
    return output_file
# with gr.Blocks() as demo:
# gr.Markdown("<center># <h1>Demo Model Text to Speech</h1></center>")
# prompt = gr.Textbox(label="Prompt", placeholder="Type something here...")
# wav_files = sorted(list_wav_files())
# if not wav_files:
# gr.Markdown("⚠️ Không tìm thấy file .wav nào trong thư mục!")
# gr.Markdown("## 🎧 Chọn và nghe file âm thanh gốc")
# with gr.Row():
# file_dropdown = gr.Dropdown(choices=wav_files, label="Chọn file WAV")
# audio_output = gr.Audio(type="filepath", label="Nghe tại đây")
# file_dropdown.change(fn=get_audio_file, inputs=file_dropdown, outputs=audio_output)
# generate_button = gr.Button("Generate Voice")
# generated_audio_output = gr.Audio(type="filepath", label="🔊 Kết quả sinh giọng nói")
# generate_button.click(fn=generate_voice, inputs=[prompt, file_dropdown], outputs=generated_audio_output)
with gr.Blocks() as demo:
    gr.Markdown("<center># <h1>Demo Model Text to Speech</h1></center>")
    prompt = gr.Textbox(label="Prompt", placeholder="Type something here...")
    gr.Markdown("## 🎧 Chọn hoặc ghi âm giọng nói tham chiếu")

    with gr.Tab("📁 Chọn từ file"):
        wav_files = sorted(list_wav_files())
        file_dropdown = gr.Dropdown(choices=wav_files, label="Chọn file WAV có sẵn")
        audio_output = gr.Audio(type="filepath", label="Nghe tại đây")
        file_dropdown.change(fn=get_audio_file, inputs=file_dropdown, outputs=audio_output)

    with gr.Tab("🎙️ Ghi âm mới"):
        recorded_audio = gr.Audio(label="Ghi âm hoặc chọn file", type="filepath")

    # Button that triggers speech generation
    generate_button = gr.Button("Generate Voice")
    generated_audio_output = gr.Audio(type="filepath", label="🔊 Kết quả sinh giọng nói")

    def process_inputs(prompt_text, file_choice, recorded_path):
        """Choose the reference audio (a new recording wins over the
        dropdown selection), persist it under AUDIO_DIR, and synthesize.

        Raises:
            gr.Error: if neither a recording nor a dropdown choice is given.
        """
        if recorded_path is not None:
            import shutil
            os.makedirs(AUDIO_DIR, exist_ok=True)
            # Pick a name that does not collide with an existing recording;
            # the raw directory count can repeat after deletions.
            index = len(os.listdir(AUDIO_DIR))
            filename = f"user_recording_{index}.wav"
            while os.path.exists(os.path.join(AUDIO_DIR, filename)):
                index += 1
                filename = f"user_recording_{index}.wav"
            saved_path = os.path.join(AUDIO_DIR, filename)
            # shutil.move works across filesystems; os.rename raises OSError
            # when Gradio's temp dir lives on a different mount.
            shutil.move(recorded_path, saved_path)
            ref_file = filename
        elif file_choice:
            ref_file = file_choice
        else:
            raise gr.Error("Bạn cần chọn hoặc ghi âm một file giọng nói.")
        return generate_voice(prompt_text, ref_file)

    generate_button.click(
        fn=process_inputs,
        inputs=[prompt, file_dropdown, recorded_audio],
        outputs=generated_audio_output
    )

demo.launch()