File size: 5,939 Bytes
3e7a652
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# import pandas as pd
# import os
# import shutil
# import subprocess
# import multiprocessing
# from tqdm import tqdm
# from pathlib import Path
# import numpy as np
# from sklearn.model_selection import train_test_split
# from vits2.text.symbols import symbols
# import vits2.utils as utils

import torch
import json
from phonemizer.backend import EspeakBackend

from vits2.text import text_to_sequence
import vits2.commons as commons

# TODO: EDITED BY fathos82
#  ADD  tts/
# Load the VITS2 training hyperparameters once at import time.
# Use json.load on the file object directly (no intermediate read) and an
# explicit encoding so the config parses identically on every platform.
with open('ms_pt_br_vits2.json', 'r', encoding='utf-8') as f:
    training_hparams_json = json.load(f)

# TODO:  Edited by @fathos82: Unnecessary code, it causes a problem with pytorch from source dependency
# from resemblyzer import VoiceEncoder
# encoder = VoiceEncoder()

backend = EspeakBackend('pt-br',preserve_punctuation=True,with_stress=True)

def text_to_IPA(sentence):
    """Phonemize a single sentence to IPA via the module-level espeak backend.

    Trailing whitespace appended by the phonemizer is stripped from the result.
    """
    phonemized = backend.phonemize([sentence])
    return phonemized[0].rstrip()
def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids for VITS2.

    The text is cleaned/tokenized with the cleaners named in
    ``hps.data.text_cleaners``; when ``hps.data.add_blank`` is set, a blank
    token (id 0) is interspersed between every pair of symbols.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)

# TODO: EDITED BY fathos82
#  WARN: The functions below are not used anywhere in this project (dead code);
#  they are kept commented out for reference only.

# def preprocess_tts_training(csv_path, output_folder,audio_column='wav_paths',text_column='texts',spk_ids_column='', pretrained_g=None, pretrained_d=None):
#     data = []
#     read_df = pd.read_csv(csv_path)
#     os.makedirs(output_folder, exist_ok=True)
#     if os.path.exists(pretrained_g):
#         os.makedirs(output_folder + '/logs', exist_ok=True)
#         shutil.copy(pretrained_g, output_folder + '/logs/G_1.pth')
#     if os.path.exists(pretrained_d):
#         os.makedirs(output_folder + '/logs', exist_ok=True)
#         shutil.copy(pretrained_d, output_folder + '/logs/D_1.pth')
#
#     audiopaths = read_df[audio_column].tolist()
#     new_audiopaths = convert_wavs(audiopaths,output_folder)
#     text = read_df[text_column].tolist()
#     if not spk_ids_column:
#         spk_ids = None
#     else:
#         spk_ids = read_df[spk_ids_column].tolist()
#     audiopaths_per_spk_id = get_audiopaths_per_spk_id(new_audiopaths, spk_ids)
#     generate_embeddings_for_training(audiopaths_per_spk_id ,output_folder)
#     generate_formatted_metadata(new_audiopaths,text,output_folder,spk_ids=spk_ids)
#     print('Preprocessing done!')
#     gr.Info("Preprocessing finished, you can start training!")
#     return None
#
# def get_audiopaths_per_spk_id(audiopaths, spk_ids= None):
#     audiopaths_per_spk_id = {}
#     if not spk_ids:
#         spk_ids = [0] * len(audiopaths)
#     for i, audiopath in enumerate(audiopaths):
#         audiopaths_per_spk_id[spk_ids[i]] = audiopaths_per_spk_id.get(spk_ids[i], []) + [audiopath]
#     return audiopaths_per_spk_id
#
#
# def convert_wav(audiopath, basename, wavs_folder):
#         subprocess.Popen(f"sox {audiopath} -b 16 -r 22050 -c 1 norm -1 {wavs_folder}/{basename}", shell=True, stdout=subprocess.PIPE).stdout.read()
#
# def convert_wavs(audiopaths,output_folder):
#     print('converting wavs to 22050 Hz / 16 bit / mono...')
#     basenames = [os.path.basename(audiopath) for audiopath in audiopaths]
#     wavs_folder = output_folder + '/wavs'
#     os.makedirs(wavs_folder, exist_ok=True)
#     pool = multiprocessing.Pool()
#     results = []
#     for audiopath, basename in tqdm(zip(audiopaths, basenames), total=len(audiopaths)):
#         result = pool.apply_async(convert_wav, (audiopath, basename, wavs_folder))
#         results.append(result)
#
#     pool.close()
#     pool.join()
#
#     for result in results:
#         result.get()
#     return [f'{wavs_folder}/{os.path.basename(audiopath)}' for audiopath in audiopaths]
#
# def generate_embeddings_for_training(audiopaths_per_spk_id,output_folder):
#     print('generating speaker embeddings...')
#     speaker_embeddings = []
#     for spk_id in audiopaths_per_spk_id.keys():
#         audiopaths = audiopaths_per_spk_id[spk_id]
#         embeds = []
#         for audiopath in tqdm(audiopaths[:50]):
#             fpath = Path(audiopath)
#             wav = preprocess_wav(fpath)
#             embeds.append(encoder.embed_utterance(wav))
#         if len(embeds) > 1:
#             custom_embedding = torch.tensor(np.median(embeds,axis=0))
#         else:
#             custom_embedding = torch.tensor(embeds[0])
#         speaker_embeddings.append(torch.FloatTensor(custom_embedding))
#     speaker_embeddings = torch.stack(speaker_embeddings)
#     torch.save(speaker_embeddings, output_folder + '/speaker_embeddings.pt')
#     return None
#
# def generate_formatted_metadata(audiopaths,text,output_folder, spk_ids = None):
#     print('converting csvs...')
#
#
#     with multiprocessing.Pool() as pool:
#         phonemes = list(tqdm(pool.imap(text_to_IPA, text), total=len(text)))
#     df = pd.DataFrame()
#     df['paths'] = audiopaths
#     if not spk_ids:
#         df['spk_id'] = [0] * len(audiopaths)
#     else:
#         df['spk_id'] = spk_ids
#     df['phonemes'] = phonemes
#     train_df, test_df = train_test_split(df, test_size=0.05, random_state=42)
#     test_df, val_df = train_test_split(test_df, test_size=0.9, random_state=42)
#     train_df.to_csv(output_folder + '/filelist_train.txt', index=False, header=False, sep='|')
#     test_df.to_csv(output_folder + '/filelist_test.txt', index=False, header=False, sep='|')
#     val_df.to_csv(output_folder + '/filelist_val.txt', index=False, header=False, sep='|')
#     training_hparams_json['data']['training_files'] = output_folder + '/filelist_train.txt'
#     training_hparams_json['data']["validation_files"] = output_folder + '/filelist_val.txt'
#     training_hparams_json['model']['speaker_embeddings_path'] = output_folder + '/speaker_embeddings.pt'
#     json.dump(training_hparams_json, open(output_folder + '/config.json', 'w'))
#
#     return None
#