|
|
import gradio as gr |
|
|
import os |
|
|
import json |
|
|
import math |
|
|
import torch |
|
|
from torch import nn |
|
|
from torch.nn import functional as F |
|
|
import librosa |
|
|
import argparse |
|
|
import librosa.display |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
|
import commons |
|
|
import utils |
|
|
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate |
|
|
|
|
|
from models_mel_style import SynthesizerTrn |
|
|
from text.symbols import symbols |
|
|
from text import text_to_sequence |
|
|
from mel_processing import spectrogram_torch, spec_to_mel_torch |
|
|
from scipy.io.wavfile import write |
|
|
|
|
|
|
|
|
# Directory holding the reference speaker recordings offered in the UI.
AUDIO_DIR = "wav/wav_1"


def list_wav_files():
    """Return the names of all ``.wav`` files in ``AUDIO_DIR``.

    Returns an empty list when the directory does not exist yet (e.g. on a
    fresh checkout before any reference audio has been recorded), instead of
    letting ``os.listdir`` raise ``FileNotFoundError`` at UI start-up.
    """
    if not os.path.isdir(AUDIO_DIR):
        return []
    return [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]
|
|
|
|
|
|
|
|
def get_audio_file(file_name):
    """Resolve *file_name* to its full path inside ``AUDIO_DIR``."""
    return os.path.join(AUDIO_DIR, file_name)
|
|
|
|
|
def get_text(text, hps):
    """Convert raw *text* into a ``torch.LongTensor`` of symbol ids.

    Applies the text cleaners named in the hyper-parameter config; when the
    config asks for blank tokens, a 0 is interspersed between every symbol.
    """
    seq = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        seq = commons.intersperse(seq, 0)
    return torch.LongTensor(seq)
|
|
|
|
|
|
|
|
def generate_voice(prompt_text, ref_audio_filename):
    """Synthesize speech for *prompt_text* in the voice of a reference clip.

    Parameters
    ----------
    prompt_text : str
        Text to synthesize.
    ref_audio_filename : str
        File name (inside ``AUDIO_DIR``) of the reference speaker recording.

    Returns
    -------
    str
        Path of the generated ``.wav`` file under ``infer_result/``.
    """
    # Fixed inference configuration (was previously wrapped in a throwaway
    # ``Args`` class alongside a dead ``import argparse``).
    checkpoint_path = "logs/large_audio/G_504000.pth"
    config_path = "configs/vn_base.json"
    save_path = "infer_result/"
    ref_audio_path = os.path.join(AUDIO_DIR, ref_audio_filename)

    # Build the synthesizer from the training config and load its weights.
    # NOTE(review): the model is reloaded on every call — consider caching.
    hps = utils.get_hparams_from_file(config_path)
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=0,
        **hps.model
    )
    _ = net_g.eval()
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    # Reference audio -> linear spectrogram -> mel, used as the style prompt.
    audio, _ = librosa.load(ref_audio_path, sr=hps.data.sampling_rate)
    audio = torch.from_numpy(audio).unsqueeze(0)
    spec = spectrogram_torch(audio, hps.data.filter_length, hps.data.sampling_rate,
                             hps.data.hop_length, hps.data.win_length, center=False)
    spec = torch.squeeze(spec, 0)
    mel = spec_to_mel_torch(spec, hps.data.filter_length, hps.data.n_mel_channels,
                            hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax)

    # Text -> symbol-id tensor plus its length, batched for the model.
    stn_tst = get_text(prompt_text, hps)
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])

    with torch.no_grad():
        audio_gen = net_g.infer(x_tst, x_tst_lengths, mel.unsqueeze(0),
                                sid=None, noise_scale=0.1,
                                noise_scale_w=0.1, length_scale=1.1)[0][0, 0].data.cpu().float().numpy()

    # NOTE(review): naming by directory size races if two requests overlap.
    os.makedirs(save_path, exist_ok=True)
    output_file = os.path.join(save_path, f'test_{len(os.listdir(save_path))}.wav')
    write(output_file, hps.data.sampling_rate, audio_gen)

    return output_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: a prompt textbox, two ways to supply a reference voice (pick an
# existing wav or record a new one), and a button that runs the synthesizer.
with gr.Blocks() as demo:
    gr.Markdown("<center># <h1>Demo Model Text to Speech</h1></center>")

    prompt = gr.Textbox(label="Prompt", placeholder="Type something here...")

    gr.Markdown("## 🎧 Chọn hoặc ghi âm giọng nói tham chiếu")

    # Tab 1: choose an existing reference recording from AUDIO_DIR.
    with gr.Tab("📁 Chọn từ file"):
        wav_files = sorted(list_wav_files())
        file_dropdown = gr.Dropdown(choices=wav_files, label="Chọn file WAV có sẵn")
        audio_output = gr.Audio(type="filepath", label="Nghe tại đây")
        file_dropdown.change(fn=get_audio_file, inputs=file_dropdown, outputs=audio_output)

    # Tab 2: record a brand-new reference clip in the browser.
    with gr.Tab("🎙️ Ghi âm mới"):
        recorded_audio = gr.Audio(label="Ghi âm hoặc chọn file", type="filepath")

    generate_button = gr.Button("Generate Voice")
    generated_audio_output = gr.Audio(type="filepath", label="🔊 Kết quả sinh giọng nói")

    def process_inputs(prompt_text, file_choice, recorded_path):
        """Route the UI inputs to ``generate_voice``.

        A freshly recorded clip (if any) takes precedence over the dropdown
        choice; it is first moved into ``AUDIO_DIR`` so later sessions can
        reuse it. Raises ``gr.Error`` when neither source is provided.
        """
        if recorded_path is not None:
            import shutil
            os.makedirs(AUDIO_DIR, exist_ok=True)
            filename = f"user_recording_{len(os.listdir(AUDIO_DIR))}.wav"
            saved_path = os.path.join(AUDIO_DIR, filename)
            # shutil.move, not os.rename: Gradio records into a temp dir that
            # may live on a different filesystem, where os.rename raises
            # "Invalid cross-device link".
            shutil.move(recorded_path, saved_path)
            ref_file = filename
        elif file_choice:
            ref_file = file_choice
        else:
            raise gr.Error("Bạn cần chọn hoặc ghi âm một file giọng nói.")

        return generate_voice(prompt_text, ref_file)

    generate_button.click(
        fn=process_inputs,
        inputs=[prompt, file_dropdown, recorded_audio],
        outputs=generated_audio_output
    )

demo.launch()
|
|
|