# import pandas as pd
# import os
# import shutil
# import subprocess
# import multiprocessing
# from tqdm import tqdm
# from pathlib import Path
# import numpy as np
# from sklearn.model_selection import train_test_split
# from vits2.text.symbols import symbols
# import vits2.utils as utils
import torch
import json
from phonemizer.backend import EspeakBackend
from vits2.text import text_to_sequence
import vits2.commons as commons
# TODO: EDITED BY fathos82
# ADD tts/
# Load the training hyperparameters for the pt-BR VITS2 model.
# json.load reads and parses in one step (no intermediate string needed);
# encoding is pinned so the result does not depend on the platform default.
with open('ms_pt_br_vits2.json', 'r', encoding='utf-8') as f:
    training_hparams_json = json.load(f)
# TODO: Edited by @fathos82: Unnecessary code, it causes a problem with pytorch from source dependency
# from resemblyzer import VoiceEncoder
# encoder = VoiceEncoder()

# Phonemizer backend for Brazilian Portuguese. Punctuation and stress marks
# are preserved so the downstream text cleaners receive the full prosodic
# information from the input sentences.
backend = EspeakBackend(
    'pt-br',
    preserve_punctuation=True,
    with_stress=True,
)
def text_to_IPA(sentence):
    """Phonemize a single sentence to IPA (pt-BR) via the module-level backend.

    The backend operates on a batch, so the sentence is wrapped in a
    one-element list; trailing whitespace is stripped from the result.
    """
    phonemized_batch = backend.phonemize([sentence])
    return phonemized_batch[0].rstrip()
def get_text(text, hps):
    """Convert raw text into a 1-D LongTensor of symbol ids.

    Cleans/tokenizes *text* with the cleaners named in ``hps.data.text_cleaners``
    and, when ``hps.data.add_blank`` is set, interleaves a blank token (id 0)
    between every symbol — the standard VITS input format.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
# TODO: EDITED BY fathos82
# WARN: The functions below are not used anywhere in this codebase; they are
# kept commented out for reference only.
# def preprocess_tts_training(csv_path, output_folder,audio_column='wav_paths',text_column='texts',spk_ids_column='', pretrained_g=None, pretrained_d=None):
# data = []
# read_df = pd.read_csv(csv_path)
# os.makedirs(output_folder, exist_ok=True)
# if os.path.exists(pretrained_g):
# os.makedirs(output_folder + '/logs', exist_ok=True)
# shutil.copy(pretrained_g, output_folder + '/logs/G_1.pth')
# if os.path.exists(pretrained_d):
# os.makedirs(output_folder + '/logs', exist_ok=True)
# shutil.copy(pretrained_d, output_folder + '/logs/D_1.pth')
#
# audiopaths = read_df[audio_column].tolist()
# new_audiopaths = convert_wavs(audiopaths,output_folder)
# text = read_df[text_column].tolist()
# if not spk_ids_column:
# spk_ids = None
# else:
# spk_ids = read_df[spk_ids_column].tolist()
# audiopaths_per_spk_id = get_audiopaths_per_spk_id(new_audiopaths, spk_ids)
# generate_embeddings_for_training(audiopaths_per_spk_id ,output_folder)
# generate_formatted_metadata(new_audiopaths,text,output_folder,spk_ids=spk_ids)
# print('Preprocessing done!')
# gr.Info("Preprocessing finished, you can start training!")
# return None
#
# def get_audiopaths_per_spk_id(audiopaths, spk_ids= None):
# audiopaths_per_spk_id = {}
# if not spk_ids:
# spk_ids = [0] * len(audiopaths)
# for i, audiopath in enumerate(audiopaths):
# audiopaths_per_spk_id[spk_ids[i]] = audiopaths_per_spk_id.get(spk_ids[i], []) + [audiopath]
# return audiopaths_per_spk_id
#
#
# def convert_wav(audiopath, basename, wavs_folder):
# subprocess.Popen(f"sox {audiopath} -b 16 -r 22050 -c 1 norm -1 {wavs_folder}/{basename}", shell=True, stdout=subprocess.PIPE).stdout.read()
#
# def convert_wavs(audiopaths,output_folder):
# print('converting wavs to 22050 Hz / 16 bit / mono...')
# basenames = [os.path.basename(audiopath) for audiopath in audiopaths]
# wavs_folder = output_folder + '/wavs'
# os.makedirs(wavs_folder, exist_ok=True)
# pool = multiprocessing.Pool()
# results = []
# for audiopath, basename in tqdm(zip(audiopaths, basenames), total=len(audiopaths)):
# result = pool.apply_async(convert_wav, (audiopath, basename, wavs_folder))
# results.append(result)
#
# pool.close()
# pool.join()
#
# for result in results:
# result.get()
# return [f'{wavs_folder}/{os.path.basename(audiopath)}' for audiopath in audiopaths]
#
# def generate_embeddings_for_training(audiopaths_per_spk_id,output_folder):
# print('generating speaker embeddings...')
# speaker_embeddings = []
# for spk_id in audiopaths_per_spk_id.keys():
# audiopaths = audiopaths_per_spk_id[spk_id]
# embeds = []
# for audiopath in tqdm(audiopaths[:50]):
# fpath = Path(audiopath)
# wav = preprocess_wav(fpath)
# embeds.append(encoder.embed_utterance(wav))
# if len(embeds) > 1:
# custom_embedding = torch.tensor(np.median(embeds,axis=0))
# else:
# custom_embedding = torch.tensor(embeds[0])
# speaker_embeddings.append(torch.FloatTensor(custom_embedding))
# speaker_embeddings = torch.stack(speaker_embeddings)
# torch.save(speaker_embeddings, output_folder + '/speaker_embeddings.pt')
# return None
#
# def generate_formatted_metadata(audiopaths,text,output_folder, spk_ids = None):
# print('converting csvs...')
#
#
# with multiprocessing.Pool() as pool:
# phonemes = list(tqdm(pool.imap(text_to_IPA, text), total=len(text)))
# df = pd.DataFrame()
# df['paths'] = audiopaths
# if not spk_ids:
# df['spk_id'] = [0] * len(audiopaths)
# else:
# df['spk_id'] = spk_ids
# df['phonemes'] = phonemes
# train_df, test_df = train_test_split(df, test_size=0.05, random_state=42)
# test_df, val_df = train_test_split(test_df, test_size=0.9, random_state=42)
# train_df.to_csv(output_folder + '/filelist_train.txt', index=False, header=False, sep='|')
# test_df.to_csv(output_folder + '/filelist_test.txt', index=False, header=False, sep='|')
# val_df.to_csv(output_folder + '/filelist_val.txt', index=False, header=False, sep='|')
# training_hparams_json['data']['training_files'] = output_folder + '/filelist_train.txt'
# training_hparams_json['data']["validation_files"] = output_folder + '/filelist_val.txt'
# training_hparams_json['model']['speaker_embeddings_path'] = output_folder + '/speaker_embeddings.pt'
# json.dump(training_hparams_json, open(output_folder + '/config.json', 'w'))
#
# return None
#