"""Text preprocessing utilities for VITS2 TTS training (Brazilian Portuguese).

Loads the training hyper-parameters from ``ms_pt_br_vits2.json`` at import
time and exposes helpers to phonemize text (espeak, pt-br) and to convert
raw text into the integer symbol-ID tensor expected by the VITS2 model.
"""
import json

import torch
from phonemizer.backend import EspeakBackend

import vits2.commons as commons
from vits2.text import text_to_sequence

# Training hyper-parameters, loaded once at import time.
# NOTE(review): path is relative to the current working directory — confirm
# callers always run from the project root.
with open('ms_pt_br_vits2.json', 'r', encoding='utf-8') as f:
    training_hparams_json = json.load(f)

# NOTE: resemblyzer's VoiceEncoder was removed here because it conflicted
# with the from-source PyTorch dependency (see repository history).

# Espeak backend for Brazilian Portuguese; punctuation and stress marks are
# preserved so prosody information survives phonemization.
backend = EspeakBackend('pt-br', preserve_punctuation=True, with_stress=True)


def text_to_IPA(sentence):
    """Return the IPA phonemization of *sentence*, trailing whitespace stripped."""
    return backend.phonemize([sentence])[0].rstrip()


def get_text(text, hps):
    """Convert *text* to a ``torch.LongTensor`` of symbol IDs.

    Applies the cleaners configured in ``hps.data.text_cleaners`` and, when
    ``hps.data.add_blank`` is set, intersperses a blank (0) token between
    consecutive symbol IDs via ``commons.intersperse``.
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)

# NOTE: The commented-out functions below (preprocess_tts_training and its
# helpers) are unused dead code retained for reference only.
# def preprocess_tts_training(csv_path, output_folder,audio_column='wav_paths',text_column='texts',spk_ids_column='', pretrained_g=None, pretrained_d=None): # data = [] # read_df = pd.read_csv(csv_path) # os.makedirs(output_folder, exist_ok=True) # if os.path.exists(pretrained_g): # os.makedirs(output_folder + '/logs', exist_ok=True) # shutil.copy(pretrained_g, output_folder + '/logs/G_1.pth') # if os.path.exists(pretrained_d): # os.makedirs(output_folder + '/logs', exist_ok=True) # shutil.copy(pretrained_d, output_folder + '/logs/D_1.pth') # # audiopaths = read_df[audio_column].tolist() # new_audiopaths = convert_wavs(audiopaths,output_folder) # text = read_df[text_column].tolist() # if not spk_ids_column: # spk_ids = None # else: # spk_ids = read_df[spk_ids_column].tolist() # audiopaths_per_spk_id = get_audiopaths_per_spk_id(new_audiopaths, spk_ids) # generate_embeddings_for_training(audiopaths_per_spk_id ,output_folder) # generate_formatted_metadata(new_audiopaths,text,output_folder,spk_ids=spk_ids) # print('Preprocessing done!') # gr.Info("Preprocessing finished, you can start training!") # return None # # def get_audiopaths_per_spk_id(audiopaths, spk_ids= None): # audiopaths_per_spk_id = {} # if not spk_ids: # spk_ids = [0] * len(audiopaths) # for i, audiopath in enumerate(audiopaths): # audiopaths_per_spk_id[spk_ids[i]] = audiopaths_per_spk_id.get(spk_ids[i], []) + [audiopath] # return audiopaths_per_spk_id # # # def convert_wav(audiopath, basename, wavs_folder): # subprocess.Popen(f"sox {audiopath} -b 16 -r 22050 -c 1 norm -1 {wavs_folder}/{basename}", shell=True, stdout=subprocess.PIPE).stdout.read() # # def convert_wavs(audiopaths,output_folder): # print('converting wavs to 22050 Hz / 16 bit / mono...') # basenames = [os.path.basename(audiopath) for audiopath in audiopaths] # wavs_folder = output_folder + '/wavs' # os.makedirs(wavs_folder, exist_ok=True) # pool = multiprocessing.Pool() # results = [] # for audiopath, basename in tqdm(zip(audiopaths, 
basenames), total=len(audiopaths)): # result = pool.apply_async(convert_wav, (audiopath, basename, wavs_folder)) # results.append(result) # # pool.close() # pool.join() # # for result in results: # result.get() # return [f'{wavs_folder}/{os.path.basename(audiopath)}' for audiopath in audiopaths] # # def generate_embeddings_for_training(audiopaths_per_spk_id,output_folder): # print('generating speaker embeddings...') # speaker_embeddings = [] # for spk_id in audiopaths_per_spk_id.keys(): # audiopaths = audiopaths_per_spk_id[spk_id] # embeds = [] # for audiopath in tqdm(audiopaths[:50]): # fpath = Path(audiopath) # wav = preprocess_wav(fpath) # embeds.append(encoder.embed_utterance(wav)) # if len(embeds) > 1: # custom_embedding = torch.tensor(np.median(embeds,axis=0)) # else: # custom_embedding = torch.tensor(embeds[0]) # speaker_embeddings.append(torch.FloatTensor(custom_embedding)) # speaker_embeddings = torch.stack(speaker_embeddings) # torch.save(speaker_embeddings, output_folder + '/speaker_embeddings.pt') # return None # # def generate_formatted_metadata(audiopaths,text,output_folder, spk_ids = None): # print('converting csvs...') # # # with multiprocessing.Pool() as pool: # phonemes = list(tqdm(pool.imap(text_to_IPA, text), total=len(text))) # df = pd.DataFrame() # df['paths'] = audiopaths # if not spk_ids: # df['spk_id'] = [0] * len(audiopaths) # else: # df['spk_id'] = spk_ids # df['phonemes'] = phonemes # train_df, test_df = train_test_split(df, test_size=0.05, random_state=42) # test_df, val_df = train_test_split(test_df, test_size=0.9, random_state=42) # train_df.to_csv(output_folder + '/filelist_train.txt', index=False, header=False, sep='|') # test_df.to_csv(output_folder + '/filelist_test.txt', index=False, header=False, sep='|') # val_df.to_csv(output_folder + '/filelist_val.txt', index=False, header=False, sep='|') # training_hparams_json['data']['training_files'] = output_folder + '/filelist_train.txt' # 
training_hparams_json['data']["validation_files"] = output_folder + '/filelist_val.txt' # training_hparams_json['model']['speaker_embeddings_path'] = output_folder + '/speaker_embeddings.pt' # json.dump(training_hparams_json, open(output_folder + '/config.json', 'w')) # # return None #