"""Gradio demo: Vietnamese text-to-speech conditioned on a reference voice.

The user types a prompt and either picks an existing reference WAV from
``AUDIO_DIR`` or records a new one; the synthesizer generates speech whose
style is taken from the reference audio's mel-spectrogram.
"""

import argparse
import json
import math
import os
import shutil

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torch
from scipy.io.wavfile import write
from torch import nn
from torch.nn import functional as F

import commons
import utils
from data_utils import (
    TextAudioCollate,
    TextAudioLoader,
    TextAudioSpeakerCollate,
    TextAudioSpeakerLoader,
)
from mel_processing import spec_to_mel_torch, spectrogram_torch
# from model_old_mel_style import SynthesizerTrn
from models_mel_style import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

# Pretrained generator checkpoint, its config, and where results are written.
CHECKPOINT_PATH = "logs/large_audio/G_504000.pth"
CONFIG_PATH = "configs/vn_base.json"
SAVE_PATH = "infer_result/"

# Directory holding the selectable reference wav files.
AUDIO_DIR = "wav/wav_1"


def list_wav_files():
    """Return the names of all .wav files in AUDIO_DIR ([] if dir missing)."""
    if not os.path.isdir(AUDIO_DIR):
        return []
    return [f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav")]


def get_audio_file(file_name):
    """Return the full path of the selected wav file inside AUDIO_DIR."""
    return os.path.join(AUDIO_DIR, file_name)


def get_text(text, hps):
    """Convert raw text to a LongTensor of symbol ids per the model config."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave blank tokens (id 0) between symbols, matching training.
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)


# Lazily-loaded (hps, net_g) pair: the original code rebuilt the model and
# re-read the checkpoint on every button click, which is very slow.
_MODEL_CACHE = None


def _load_model():
    """Load hparams and the synthesizer once; reuse the pair on later calls."""
    global _MODEL_CACHE
    if _MODEL_CACHE is None:
        hps = utils.get_hparams_from_file(CONFIG_PATH)
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=0,
            **hps.model,
        )
        net_g.eval()
        utils.load_checkpoint(CHECKPOINT_PATH, net_g, None)
        _MODEL_CACHE = (hps, net_g)
    return _MODEL_CACHE


def generate_voice(prompt_text, ref_audio_filename):
    """Synthesize ``prompt_text`` in the voice of the given reference wav.

    Args:
        prompt_text: Text to speak.
        ref_audio_filename: Name of a wav file inside AUDIO_DIR.

    Returns:
        Path of the generated wav file written under SAVE_PATH.
    """
    hps, net_g = _load_model()

    # Reference audio -> linear spectrogram -> mel: the style conditioning.
    ref_path = os.path.join(AUDIO_DIR, ref_audio_filename)
    audio, _ = librosa.load(ref_path, sr=hps.data.sampling_rate)
    audio = torch.from_numpy(audio).unsqueeze(0)
    spec = spectrogram_torch(
        audio,
        hps.data.filter_length,
        hps.data.sampling_rate,
        hps.data.hop_length,
        hps.data.win_length,
        center=False,
    )
    spec = torch.squeeze(spec, 0)
    mel = spec_to_mel_torch(
        spec,
        hps.data.filter_length,
        hps.data.n_mel_channels,
        hps.data.sampling_rate,
        hps.data.mel_fmin,
        hps.data.mel_fmax,
    )

    # Prompt text -> padded symbol-id batch of size 1.
    stn_tst = get_text(prompt_text, hps)
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])

    with torch.no_grad():
        audio_gen = net_g.infer(
            x_tst,
            x_tst_lengths,
            mel.unsqueeze(0),
            sid=None,  # speaker id unused: conditioning comes from the mel
            noise_scale=0.1,
            noise_scale_w=0.1,
            length_scale=1.1,
        )[0][0, 0].data.cpu().float().numpy()

    os.makedirs(SAVE_PATH, exist_ok=True)
    # NOTE(review): naming by directory size can collide if files are deleted.
    output_file = os.path.join(SAVE_PATH, f"test_{len(os.listdir(SAVE_PATH))}.wav")
    write(output_file, hps.data.sampling_rate, audio_gen)
    return output_file


with gr.Blocks() as demo:
    gr.Markdown("# Demo Model Text to Speech")

    prompt = gr.Textbox(label="Prompt", placeholder="Type something here...")

    gr.Markdown("## 🎧 Chọn hoặc ghi âm giọng nói tham chiếu")

    with gr.Tab("📁 Chọn từ file"):
        wav_files = sorted(list_wav_files())
        file_dropdown = gr.Dropdown(choices=wav_files, label="Chọn file WAV có sẵn")
        audio_output = gr.Audio(type="filepath", label="Nghe tại đây")
        file_dropdown.change(
            fn=get_audio_file, inputs=file_dropdown, outputs=audio_output
        )

    with gr.Tab("🎙️ Ghi âm mới"):
        recorded_audio = gr.Audio(label="Ghi âm hoặc chọn file", type="filepath")

    # Synthesis trigger and result player.
    generate_button = gr.Button("Generate Voice")
    generated_audio_output = gr.Audio(
        type="filepath", label="🔊 Kết quả sinh giọng nói"
    )

    def process_inputs(prompt_text, file_choice, recorded_path):
        """Resolve the reference audio (recording wins over dropdown), then synthesize."""
        if recorded_path is not None:
            # Persist the recording into AUDIO_DIR so generate_voice can find it.
            filename = f"user_recording_{len(os.listdir(AUDIO_DIR))}.wav"
            saved_path = os.path.join(AUDIO_DIR, filename)
            # shutil.move, not os.rename: Gradio temp files may live on a
            # different filesystem, where os.rename raises OSError (EXDEV).
            shutil.move(recorded_path, saved_path)
            ref_file = filename
        elif file_choice:
            ref_file = file_choice
        else:
            raise gr.Error("Bạn cần chọn hoặc ghi âm một file giọng nói.")
        return generate_voice(prompt_text, ref_file)

    generate_button.click(
        fn=process_inputs,
        inputs=[prompt, file_dropdown, recorded_audio],
        outputs=generated_audio_output,
    )

demo.launch()