import gradio as gr
import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
import librosa
import argparse
import librosa.display
import matplotlib.pyplot as plt

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
# from model_old_mel_style import SynthesizerTrn
from models_mel_style import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from mel_processing import spectrogram_torch, spec_to_mel_torch
from scipy.io.wavfile import write

# Directory containing the reference .wav files
AUDIO_DIR = "wav/wav_1"


def list_wav_files():
    """Return the names of all .wav files found in AUDIO_DIR."""
    return [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]


def get_audio_file(file_name):
    """Return the full path of the selected wav file inside AUDIO_DIR."""
    file_path = os.path.join(AUDIO_DIR, file_name)
    return file_path


def get_text(text, hps):
    """Convert raw text to a LongTensor of symbol ids.

    Cleans the text with the cleaners configured in ``hps.data.text_cleaners``
    and, when ``hps.data.add_blank`` is set, intersperses blank (0) tokens
    between symbols.
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


def generate_voice(prompt_text, ref_audio_filename):
    """Synthesize speech for *prompt_text* conditioned on a reference audio.

    Loads the SynthesizerTrn checkpoint, computes the mel-spectrogram of the
    reference wav (used as a style/speaker prompt), runs inference, writes the
    result under ``infer_result/`` and returns the output file path.

    NOTE(review): the model and checkpoint are re-loaded on every call — fine
    for a demo, but worth caching at module level for a real deployment.
    """
    # Lightweight stand-in for argparse results; values are fixed for the demo.
    class Args:
        checkpoint_path = "logs/large_audio/G_504000.pth"
        config = "configs/vn_base.json"
        save_path = "infer_result/"
        # Use the shared AUDIO_DIR constant (same value as the hard-coded
        # "wav/wav_1" this replaced) so the directory is defined in one place.
        ref_audio = os.path.join(AUDIO_DIR, ref_audio_filename)
        text = prompt_text

    args = Args()
    hps = utils.get_hparams_from_file(args.config)

    # Build the synthesizer in single-speaker mode (n_speakers=0).
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=0,
        **hps.model,
    )
    _ = net_g.eval()
    _ = utils.load_checkpoint(args.checkpoint_path, net_g, None)

    # Reference audio -> linear spectrogram -> mel spectrogram (style prompt).
    audio, _ = librosa.load(args.ref_audio, sr=hps.data.sampling_rate)
    audio = torch.from_numpy(audio).unsqueeze(0)
    spec = spectrogram_torch(
        audio,
        hps.data.filter_length,
        hps.data.sampling_rate,
        hps.data.hop_length,
        hps.data.win_length,
        center=False,
    )
    spec = torch.squeeze(spec, 0)
    mel = spec_to_mel_torch(
        spec,
        hps.data.filter_length,
        hps.data.n_mel_channels,
        hps.data.sampling_rate,
        hps.data.mel_fmin,
        hps.data.mel_fmax,
    )

    stn_tst = get_text(args.text, hps)
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    # (A dead `sid = torch.LongTensor([4])` was removed here: inference is
    # explicitly called with sid=None, so the tensor was never used.)
    with torch.no_grad():
        audio_gen = net_g.infer(
            x_tst,
            x_tst_lengths,
            mel.unsqueeze(0),
            sid=None,
            noise_scale=0.1,
            noise_scale_w=0.1,
            length_scale=1.1,
        )[0][0, 0].data.cpu().float().numpy()

    os.makedirs(args.save_path, exist_ok=True)
    # NOTE(review): naming by directory entry count can collide if files are
    # ever deleted — acceptable for a demo, but not collision-safe.
    output_file = os.path.join(args.save_path, f'test_{str(len(os.listdir(args.save_path)))}.wav')
    write(output_file, hps.data.sampling_rate, audio_gen)
    return output_file


# Truncated commented-out gradio UI kept from the original source:
# with gr.Blocks() as demo:
#     gr.Markdown("