# -*- coding: utf-8 -*-
# This project combines the TN and G2P functions of https://github.com/RVC-Boss/GPT-SoVITS and https://github.com/wenet-e2e/WeTextProcessing
# Huawei Technologies Co., Ltd. (authors: Xiao Chen)
from text.cleaner import clean_text
import LangSegment
from text import symbols as symbols_v1
from TN_processors import punct_normalization, MultilingualTN, alphabet_normalization
import sys
class BaseG2P:
    """Base grapheme-to-phoneme helper holding shared symbol tables.

    Attributes:
        sil_symbol: silence marker token.
        punct_to_symbol: maps ASCII punctuation (',', '.', '!', '?') to its
            bracketed symbol token.
        er_mapping: maps pinyin erhua finals (e.g. 'er1') to a
            (base vowel, 'rr') pair.
    """

    def __init__(self):
        self.sil_symbol = '[SIL]'
        self.comma_symbol = '[CM]'        # ,
        self.peroid_symbol = '[PD]'       # .  (original attribute spelling kept for compatibility)
        self.question_symbol = '[QN]'     # ?
        self.exclamation_symbol = '[EX]'  # !
        self.punct_to_symbol = {
            ',': self.comma_symbol,
            '.': self.peroid_symbol,
            '!': self.exclamation_symbol,
            '?': self.question_symbol,
        }
        # Erhua (儿化) pinyin finals -> (base vowel, 'rr') pairs.
        self.er_mapping = {
            'er1': ('e1', 'rr'), 'er2': ('e2', 'rr'), 'er3': ('e3', 'rr'),
            'er4': ('e4', 'rr'), 'er5': ('e5', 'rr'), 'r5': ('e5', 'rr'),
        }

    def replace_punct_with_symbol(self, phone_list):
        """Return a copy of phone_list with punctuation replaced by symbol tokens.

        Tokens not present in ``punct_to_symbol`` are kept unchanged.
        """
        return [self.punct_to_symbol.get(ph, ph) for ph in phone_list]
class MultilingualG2P(BaseG2P):
    """Text normalization (TN) + grapheme-to-phoneme (G2P) for Chinese/English input.

    The TN backend is selected by ``module``: "wenet" uses MultilingualTN
    (WeTextProcessing); any other value ("baidu") falls back to ``clean_text``.
    """

    def __init__(self, module="wenet", remove_interjections=False, remove_erhua=True):
        """
        Args:
            module: TN backend, "wenet" or "baidu".
            remove_interjections, remove_erhua: forwarded to MultilingualTN
                when the "wenet" backend is selected.
        """
        BaseG2P.__init__(self)
        self.tn_module = module
        # language code -> submodule name under the ``text`` package
        self.language_module_map = {"zh": "chinese", "en": "english"}
        self.version = "v1"
        self.output_eng_word_boundary = False
        self.output_chn_word_boundary = False
        if self.tn_module == "wenet":
            self.tn_wenet = MultilingualTN("wenet", remove_interjections, remove_erhua)

    def set_output_eng_word_boundary(self, output_eng_word_boundary):
        # Toggle word-boundary markers in the English phoneme output.
        self.output_eng_word_boundary = output_eng_word_boundary

    def set_output_chn_word_boundary(self, output_chn_word_boundary):
        # NOTE(review): stored but never read in this file — presumably consumed elsewhere.
        self.output_chn_word_boundary = output_chn_word_boundary

    def g2p_for_norm_text(self, norm_text, language):
        """Convert already-normalized text to a phoneme list.

        Returns:
            (phones, word2ph): ``word2ph`` holds per-character phone counts
            for "zh" and is None otherwise. Phones missing from the symbol
            inventory are replaced by 'UNK'.
        """
        symbols = symbols_v1.symbols
        if language not in self.language_module_map:
            # Unknown language codes fall back to Chinese.
            language = "zh"
        module_name = self.language_module_map[language]
        language_module = __import__("text." + module_name, fromlist=[module_name])
        if language == "zh":
            phones, word2ph = language_module.g2p(norm_text)
            assert len(phones) == sum(word2ph)
            assert len(norm_text) == len(word2ph)
        elif language == "en":
            if self.output_eng_word_boundary:
                phones = language_module.g2p_with_boundary(norm_text)
            else:
                phones = language_module.g2p(norm_text)
            word2ph = None
        else:
            phones = language_module.g2p(norm_text)
            word2ph = None
        # Map any phone outside the symbol inventory to 'UNK'.
        phones = ['UNK' if ph not in symbols else ph for ph in phones]
        return phones, word2ph

    def text_normalization_and_g2p(self, text, language, with_lang_prefix=False, normalize_punct=False):
        '''
        language in {en, zh}; if language == "zh", this method supports a mixture
        of English and Chinese input. If language == "en", it supports English
        input only.

        Returns:
            (phones, all_norm_text): flat phoneme list and the normalized text.
        '''
        if normalize_punct:
            text = punct_normalization(text)
        text = alphabet_normalization(text)
        text = text.lower()
        if language in {"en"}:
            language = language.replace("all_", "")  # parity with upstream "all_*" codes
            if language == "en":
                LangSegment.setfilters(["en"])
                formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
            else:
                # Han characters can't be disambiguated across CJK; trust the caller.
                formattext = text
            # BUGFIX: collapse runs of spaces. The original replaced " " with " "
            # inside a `while " " in formattext` loop, which never terminates for
            # any text containing a space; the intended form collapses doubles.
            while "  " in formattext:
                formattext = formattext.replace("  ", " ")
            if self.tn_module == "baidu":
                phones, word2ph, norm_text = clean_text(
                    formattext, language, self.version)
                all_norm_text = norm_text
            else:
                norm_formattext = self.tn_wenet.normalize_segment(formattext, language, normalize_punct)
                phones, word2ph = self.g2p_for_norm_text(norm_formattext, language)
                all_norm_text = norm_formattext
        elif language in {"zh", "auto"}:
            textlist = []
            langlist = []
            LangSegment.setfilters(["en", "zh", "ja", "ko"])
            for tmp in LangSegment.getTexts(text):
                if language == "auto":
                    langlist.append(tmp["lang"])
                elif tmp["lang"] == "en":
                    langlist.append(tmp["lang"])
                else:
                    # Han characters can't be disambiguated across CJK; trust the caller.
                    langlist.append(language)
                textlist.append(tmp["text"])
            # Merge consecutive segments that share the same language.
            merged_textlist = []
            merged_langlist = []
            for seg_text, seg_lang in zip(textlist, langlist):
                if merged_langlist and merged_langlist[-1] == seg_lang:
                    merged_textlist[-1] += " " + seg_text
                else:
                    merged_textlist.append(seg_text)
                    merged_langlist.append(seg_lang)
            textlist = merged_textlist
            langlist = merged_langlist
            assert len(textlist) == len(langlist)
            phones_list = []
            norm_text_list = []
            for seg_text, lang in zip(textlist, langlist):
                if self.tn_module == "wenet":
                    norm_text = self.tn_wenet.normalize_segment(seg_text, lang, normalize_punct)
                    phones, word2ph = self.g2p_for_norm_text(norm_text, lang)
                else:
                    phones, word2ph, norm_text = clean_text(
                        seg_text, lang, self.version)
                if with_lang_prefix:
                    # Language prefix is only meaningful in mixed-language mode;
                    # prefix alphabetic phones, leave punctuation untouched.
                    phones = [lang + '_' + ph if ph[0].isalpha() else ph for ph in phones]
                phones_list.append(phones)
                norm_text_list.append(norm_text)
            phones = sum(phones_list, [])
            all_norm_text = ' '.join(norm_text_list)
        if normalize_punct:
            phones = self.replace_punct_with_symbol(phones)
        return phones, all_norm_text
if __name__ == '__main__':
    # Interactive driver: read lines from stdin, print the normalized text and
    # the phoneme sequence for each line until "exit()" is entered.
    # 'zh' treats all non-English text as Chinese; 'en' treats everything as English.
    language = 'zh'
    mG2P = MultilingualG2P("wenet", remove_interjections=False, remove_erhua=False)  # 'baidu' or 'wenet'
    mG2P.set_output_eng_word_boundary(True)

    def _prompt():
        sys.stdout.write("Input: ")
        sys.stdout.flush()

    _prompt()
    for raw_line in sys.stdin:
        stripped = raw_line.strip()
        if stripped == "exit()":
            exit()
        if not stripped:
            _prompt()
            continue
        phones, norm_text = mG2P.text_normalization_and_g2p(
            stripped, language, with_lang_prefix=True, normalize_punct=True)
        sys.stdout.write("Norm Text: " + norm_text + "\n")
        sys.stdout.write("phonemes: " + " ".join(phones) + "\n")
        _prompt()