# Novo / preprocess.py
# (HuggingFace upload metadata — uploader: fathos82, "Upload 117 files", commit 3e7a652 verified)
# import pandas as pd
# import os
# import shutil
# import subprocess
# import multiprocessing
# from tqdm import tqdm
# from pathlib import Path
# import numpy as np
# from sklearn.model_selection import train_test_split
# from vits2.text.symbols import symbols
# import vits2.utils as utils
import torch
import json
from phonemizer.backend import EspeakBackend
from vits2.text import text_to_sequence
import vits2.commons as commons
# TODO: EDITED BY fathos82
# ADD tts/
# Load the training hyperparameters for the pt-BR VITS2 model.
# Use json.load on the open file handle (no intermediate read) and pin the
# encoding so the parse does not depend on the platform default.
with open('ms_pt_br_vits2.json', "r", encoding="utf-8") as f:
    training_hparams_json = json.load(f)
# TODO: Edited by @fathos82: Unnecessary code, it causes a problem with pytorch from source dependency
# from resemblyzer import VoiceEncoder
# encoder = VoiceEncoder()
# eSpeak-based phonemizer for Brazilian Portuguese. Punctuation and stress
# marks are preserved so the TTS front-end receives them in the IPA output.
backend = EspeakBackend(
    'pt-br',
    preserve_punctuation=True,
    with_stress=True,
)
def text_to_IPA(sentence):
    """Phonemize a single sentence to IPA (pt-BR) and strip trailing whitespace."""
    # backend.phonemize works on batches; wrap the sentence and unwrap the result.
    phonemized = backend.phonemize([sentence])
    return phonemized[0].rstrip()
def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids.

    Cleans *text* with the cleaners named in ``hps.data.text_cleaners`` and,
    when ``hps.data.add_blank`` is set, interleaves a blank token (id 0)
    between symbols, as the VITS recipe expects.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
# TODO: EDITED BY fathos82
# WARN: The functions below are unused in this project ("rose hocd") and are kept commented out for reference only.
# def preprocess_tts_training(csv_path, output_folder,audio_column='wav_paths',text_column='texts',spk_ids_column='', pretrained_g=None, pretrained_d=None):
# data = []
# read_df = pd.read_csv(csv_path)
# os.makedirs(output_folder, exist_ok=True)
# if os.path.exists(pretrained_g):
# os.makedirs(output_folder + '/logs', exist_ok=True)
# shutil.copy(pretrained_g, output_folder + '/logs/G_1.pth')
# if os.path.exists(pretrained_d):
# os.makedirs(output_folder + '/logs', exist_ok=True)
# shutil.copy(pretrained_d, output_folder + '/logs/D_1.pth')
#
# audiopaths = read_df[audio_column].tolist()
# new_audiopaths = convert_wavs(audiopaths,output_folder)
# text = read_df[text_column].tolist()
# if not spk_ids_column:
# spk_ids = None
# else:
# spk_ids = read_df[spk_ids_column].tolist()
# audiopaths_per_spk_id = get_audiopaths_per_spk_id(new_audiopaths, spk_ids)
# generate_embeddings_for_training(audiopaths_per_spk_id ,output_folder)
# generate_formatted_metadata(new_audiopaths,text,output_folder,spk_ids=spk_ids)
# print('Preprocessing done!')
# gr.Info("Preprocessing finished, you can start training!")
# return None
#
# def get_audiopaths_per_spk_id(audiopaths, spk_ids= None):
# audiopaths_per_spk_id = {}
# if not spk_ids:
# spk_ids = [0] * len(audiopaths)
# for i, audiopath in enumerate(audiopaths):
# audiopaths_per_spk_id[spk_ids[i]] = audiopaths_per_spk_id.get(spk_ids[i], []) + [audiopath]
# return audiopaths_per_spk_id
#
#
# def convert_wav(audiopath, basename, wavs_folder):
# subprocess.Popen(f"sox {audiopath} -b 16 -r 22050 -c 1 norm -1 {wavs_folder}/{basename}", shell=True, stdout=subprocess.PIPE).stdout.read()
#
# def convert_wavs(audiopaths,output_folder):
# print('converting wavs to 22050 Hz / 16 bit / mono...')
# basenames = [os.path.basename(audiopath) for audiopath in audiopaths]
# wavs_folder = output_folder + '/wavs'
# os.makedirs(wavs_folder, exist_ok=True)
# pool = multiprocessing.Pool()
# results = []
# for audiopath, basename in tqdm(zip(audiopaths, basenames), total=len(audiopaths)):
# result = pool.apply_async(convert_wav, (audiopath, basename, wavs_folder))
# results.append(result)
#
# pool.close()
# pool.join()
#
# for result in results:
# result.get()
# return [f'{wavs_folder}/{os.path.basename(audiopath)}' for audiopath in audiopaths]
#
# def generate_embeddings_for_training(audiopaths_per_spk_id,output_folder):
# print('generating speaker embeddings...')
# speaker_embeddings = []
# for spk_id in audiopaths_per_spk_id.keys():
# audiopaths = audiopaths_per_spk_id[spk_id]
# embeds = []
# for audiopath in tqdm(audiopaths[:50]):
# fpath = Path(audiopath)
# wav = preprocess_wav(fpath)
# embeds.append(encoder.embed_utterance(wav))
# if len(embeds) > 1:
# custom_embedding = torch.tensor(np.median(embeds,axis=0))
# else:
# custom_embedding = torch.tensor(embeds[0])
# speaker_embeddings.append(torch.FloatTensor(custom_embedding))
# speaker_embeddings = torch.stack(speaker_embeddings)
# torch.save(speaker_embeddings, output_folder + '/speaker_embeddings.pt')
# return None
#
# def generate_formatted_metadata(audiopaths,text,output_folder, spk_ids = None):
# print('converting csvs...')
#
#
# with multiprocessing.Pool() as pool:
# phonemes = list(tqdm(pool.imap(text_to_IPA, text), total=len(text)))
# df = pd.DataFrame()
# df['paths'] = audiopaths
# if not spk_ids:
# df['spk_id'] = [0] * len(audiopaths)
# else:
# df['spk_id'] = spk_ids
# df['phonemes'] = phonemes
# train_df, test_df = train_test_split(df, test_size=0.05, random_state=42)
# test_df, val_df = train_test_split(test_df, test_size=0.9, random_state=42)
# train_df.to_csv(output_folder + '/filelist_train.txt', index=False, header=False, sep='|')
# test_df.to_csv(output_folder + '/filelist_test.txt', index=False, header=False, sep='|')
# val_df.to_csv(output_folder + '/filelist_val.txt', index=False, header=False, sep='|')
# training_hparams_json['data']['training_files'] = output_folder + '/filelist_train.txt'
# training_hparams_json['data']["validation_files"] = output_folder + '/filelist_val.txt'
# training_hparams_json['model']['speaker_embeddings_path'] = output_folder + '/speaker_embeddings.pt'
# json.dump(training_hparams_json, open(output_folder + '/config.json', 'w'))
#
# return None
#