# NOTE(review): the three lines that were here ("Spaces:", "Runtime error",
# "Runtime error") were paste/export residue, not code — replaced with this note.
| # import pandas as pd | |
| # import os | |
| # import shutil | |
| # import subprocess | |
| # import multiprocessing | |
| # from tqdm import tqdm | |
| # from pathlib import Path | |
| # import numpy as np | |
| # from sklearn.model_selection import train_test_split | |
| # from vits2.text.symbols import symbols | |
| # import vits2.utils as utils | |
| import torch | |
| import json | |
| from phonemizer.backend import EspeakBackend | |
| from vits2.text import text_to_sequence | |
| import vits2.commons as commons | |
| # TODO: EDITED BY fathos82 | |
| # ADD tts/ | |
# Load the VITS2 training hyperparameters once at import time.
# json.load parses straight from the file handle (no intermediate read()),
# and the explicit encoding avoids platform-default codepage surprises.
with open('ms_pt_br_vits2.json', 'r', encoding='utf-8') as f:
    training_hparams_json = json.load(f)
# TODO: Edited by @fathos82: Unnecessary code, it causes a problem with pytorch from source dependency
# from resemblyzer import VoiceEncoder
# encoder = VoiceEncoder()

# Module-level espeak phonemizer for Brazilian Portuguese; punctuation and
# stress marks are kept so they survive into the phoneme transcription.
backend = EspeakBackend(
    'pt-br',
    preserve_punctuation=True,
    with_stress=True,
)
def text_to_IPA(sentence):
    """Phonemize a single sentence to IPA via the module-level espeak backend.

    The backend operates on batches, so the sentence is wrapped in a
    one-element list; trailing whitespace on the result is stripped.
    """
    phonemized = backend.phonemize([sentence])
    return phonemized[0].rstrip()
def get_text(text, hps):
    """Encode a text string into a LongTensor of symbol ids.

    Cleans/encodes with the cleaners named in ``hps.data.text_cleaners``;
    when ``hps.data.add_blank`` is truthy, a blank token (id 0) is
    interspersed between every pair of ids before tensor conversion.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
| # TODO: EDITED BY fathos82 | |
| # WARN: Below is a list of functions that are not used in rose hocd, that is, useless. | |
| # def preprocess_tts_training(csv_path, output_folder,audio_column='wav_paths',text_column='texts',spk_ids_column='', pretrained_g=None, pretrained_d=None): | |
| # data = [] | |
| # read_df = pd.read_csv(csv_path) | |
| # os.makedirs(output_folder, exist_ok=True) | |
| # if os.path.exists(pretrained_g): | |
| # os.makedirs(output_folder + '/logs', exist_ok=True) | |
| # shutil.copy(pretrained_g, output_folder + '/logs/G_1.pth') | |
| # if os.path.exists(pretrained_d): | |
| # os.makedirs(output_folder + '/logs', exist_ok=True) | |
| # shutil.copy(pretrained_d, output_folder + '/logs/D_1.pth') | |
| # | |
| # audiopaths = read_df[audio_column].tolist() | |
| # new_audiopaths = convert_wavs(audiopaths,output_folder) | |
| # text = read_df[text_column].tolist() | |
| # if not spk_ids_column: | |
| # spk_ids = None | |
| # else: | |
| # spk_ids = read_df[spk_ids_column].tolist() | |
| # audiopaths_per_spk_id = get_audiopaths_per_spk_id(new_audiopaths, spk_ids) | |
| # generate_embeddings_for_training(audiopaths_per_spk_id ,output_folder) | |
| # generate_formatted_metadata(new_audiopaths,text,output_folder,spk_ids=spk_ids) | |
| # print('Preprocessing done!') | |
| # gr.Info("Preprocessing finished, you can start training!") | |
| # return None | |
| # | |
| # def get_audiopaths_per_spk_id(audiopaths, spk_ids= None): | |
| # audiopaths_per_spk_id = {} | |
| # if not spk_ids: | |
| # spk_ids = [0] * len(audiopaths) | |
| # for i, audiopath in enumerate(audiopaths): | |
| # audiopaths_per_spk_id[spk_ids[i]] = audiopaths_per_spk_id.get(spk_ids[i], []) + [audiopath] | |
| # return audiopaths_per_spk_id | |
| # | |
| # | |
| # def convert_wav(audiopath, basename, wavs_folder): | |
| # subprocess.Popen(f"sox {audiopath} -b 16 -r 22050 -c 1 norm -1 {wavs_folder}/{basename}", shell=True, stdout=subprocess.PIPE).stdout.read() | |
| # | |
| # def convert_wavs(audiopaths,output_folder): | |
| # print('converting wavs to 22050 Hz / 16 bit / mono...') | |
| # basenames = [os.path.basename(audiopath) for audiopath in audiopaths] | |
| # wavs_folder = output_folder + '/wavs' | |
| # os.makedirs(wavs_folder, exist_ok=True) | |
| # pool = multiprocessing.Pool() | |
| # results = [] | |
| # for audiopath, basename in tqdm(zip(audiopaths, basenames), total=len(audiopaths)): | |
| # result = pool.apply_async(convert_wav, (audiopath, basename, wavs_folder)) | |
| # results.append(result) | |
| # | |
| # pool.close() | |
| # pool.join() | |
| # | |
| # for result in results: | |
| # result.get() | |
| # return [f'{wavs_folder}/{os.path.basename(audiopath)}' for audiopath in audiopaths] | |
| # | |
| # def generate_embeddings_for_training(audiopaths_per_spk_id,output_folder): | |
| # print('generating speaker embeddings...') | |
| # speaker_embeddings = [] | |
| # for spk_id in audiopaths_per_spk_id.keys(): | |
| # audiopaths = audiopaths_per_spk_id[spk_id] | |
| # embeds = [] | |
| # for audiopath in tqdm(audiopaths[:50]): | |
| # fpath = Path(audiopath) | |
| # wav = preprocess_wav(fpath) | |
| # embeds.append(encoder.embed_utterance(wav)) | |
| # if len(embeds) > 1: | |
| # custom_embedding = torch.tensor(np.median(embeds,axis=0)) | |
| # else: | |
| # custom_embedding = torch.tensor(embeds[0]) | |
| # speaker_embeddings.append(torch.FloatTensor(custom_embedding)) | |
| # speaker_embeddings = torch.stack(speaker_embeddings) | |
| # torch.save(speaker_embeddings, output_folder + '/speaker_embeddings.pt') | |
| # return None | |
| # | |
| # def generate_formatted_metadata(audiopaths,text,output_folder, spk_ids = None): | |
| # print('converting csvs...') | |
| # | |
| # | |
| # with multiprocessing.Pool() as pool: | |
| # phonemes = list(tqdm(pool.imap(text_to_IPA, text), total=len(text))) | |
| # df = pd.DataFrame() | |
| # df['paths'] = audiopaths | |
| # if not spk_ids: | |
| # df['spk_id'] = [0] * len(audiopaths) | |
| # else: | |
| # df['spk_id'] = spk_ids | |
| # df['phonemes'] = phonemes | |
| # train_df, test_df = train_test_split(df, test_size=0.05, random_state=42) | |
| # test_df, val_df = train_test_split(test_df, test_size=0.9, random_state=42) | |
| # train_df.to_csv(output_folder + '/filelist_train.txt', index=False, header=False, sep='|') | |
| # test_df.to_csv(output_folder + '/filelist_test.txt', index=False, header=False, sep='|') | |
| # val_df.to_csv(output_folder + '/filelist_val.txt', index=False, header=False, sep='|') | |
| # training_hparams_json['data']['training_files'] = output_folder + '/filelist_train.txt' | |
| # training_hparams_json['data']["validation_files"] = output_folder + '/filelist_val.txt' | |
| # training_hparams_json['model']['speaker_embeddings_path'] = output_folder + '/speaker_embeddings.pt' | |
| # json.dump(training_hparams_json, open(output_folder + '/config.json', 'w')) | |
| # | |
| # return None | |
| # | |