# -*- coding: utf-8 -*-
# This project combines the TN and G2P functions of https://github.com/RVC-Boss/GPT-SoVITS and https://github.com/wenet-e2e/WeTextProcessing
# Huawei Technologies Co., Ltd. (authors: Xiao Chen)
"""Multilingual text normalization (TN) front end.

Text is split into per-language segments with ``LangSegment`` and each segment
is normalized with either the WeTextProcessing normalizers ("wenet") or the
``text.<language>`` modules ("baidu").
"""

from re import M
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer
import LangSegment
from text import symbols as symbols_v1
from text.chinese import replace_consecutive_punctuation, replace_punctuation_with_en, replace_punctuation
import sys

# Names of the supported normalization back ends.
TN_MODULES = {'baidu', 'wenet'}

# Distance between an ASCII character (U+0021..U+007E) and its full-width
# form (U+FF01..U+FF5E).
_FULLWIDTH_OFFSET = 0xFEE0

# Punctuation character -> replacement character.
PUNCT_NORMALIZE = {
    ',': ',', '。': '.', '、': ',', ';': ',', '‘': ',', '【': ',', '】': ',',
    '·': ',', '《': ',', '》': ',', '?': '?', ':': ',', '“': ',', '”': ',',
    '!': '!', '…': ',', ')': ',', '(': ',', '〃': ',', '〈': ',', '〉': ',',
    '「': ',', '」': ',', '『': ',', '』': ',', '〖': ',', '〔': ',', '〕': ',',
    '〗': ',', '〞': ',', '〝': ',', '﹚': ',', '﹙': ',', '﹛': ',', '﹜': ',',
    '﹝': ',', '﹞': ',', '"': ',',
    # This key was an unparseable ''' in the source. It cannot be the ASCII
    # apostrophe: the table maps the right single quote to "'" below, so "'"
    # itself has to survive. The full-width apostrophe is used instead.
    '\uff07': ',',
    '︐': ',', '︑': ',', '︒': ',', '︔': ',', '︓': ',', '︕': '!', '︖': '?',
    '︗': ',', '︘': ',', '︙': ',', '︰': ',', '︱': ',', '︳': ',', '︵': ',',
    '︶': ',', '︷': ',', '︸': ',', '︹': ',', '︺': ',', '︻': ',', '︼': ',',
    '︽': ',', '︾': ',', '︿': ',', '﹀': ',', '﹁': ',', '﹂': ',', '﹃': ',',
    '﹄': ',', '[': ',', ']': ',', '`': ',', '{': ',', '}': ',', '~': ',',
    '_': '"', '’': '\'', '^': ',', '﹔': ',',
}
# NOTE(review): several ASCII keys above map to themselves (',', '?', '!'),
# which is a no-op and suggests the table was meant to hold the full-width
# forms. The full-width counterpart of every ASCII key is added so that both
# spellings are folded -- confirm against the upstream table.
PUNCT_NORMALIZE.update({
    chr(ord(ch) + _FULLWIDTH_OFFSET): target
    for ch, target in list(PUNCT_NORMALIZE.items())
    if '!' <= ch <= '~'
})

_ASCII_LETTERS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Letter -> ASCII letter. The ASCII identity entries are kept so the table
# still contains every key it had before; the full-width letters are what
# actually get rewritten.
ALPHABET_NORM = {ch: ch for ch in _ASCII_LETTERS}
ALPHABET_NORM.update(
    {chr(ord(ch) + _FULLWIDTH_OFFSET): ch for ch in _ASCII_LETTERS})


def punct_normalization(sent: str) -> str:
    """Return *sent* with every character found in PUNCT_NORMALIZE replaced.

    Characters that are not keys of the table are copied unchanged.
    """
    return ''.join(PUNCT_NORMALIZE.get(ch, ch) for ch in sent)


def alphabet_normalization(sent: str) -> str:
    """Return *sent* with every character found in ALPHABET_NORM replaced.

    Characters that are not keys of the table are copied unchanged.
    """
    return ''.join(ALPHABET_NORM.get(ch, ch) for ch in sent)


class MultilingualTN():
    """Normalize mixed-language text segment by segment.

    Args:
        module: Back end name. "wenet" builds ``ZhNormalizer`` /
            ``EnNormalizer`` instances; "baidu" imports ``text.chinese`` and
            ``text.english``. Any other value loads no normalizer, and the
            back-end step of ``_do_tn`` leaves the text unchanged.
        remove_interjections: Forwarded to ``ZhNormalizer`` ("wenet" only).
        remove_erhua: Forwarded to ``ZhNormalizer`` ("wenet" only).
    """

    def __init__(self, module="wenet", remove_interjections=False, remove_erhua=True):
        self.tn_module = module
        # Language code -> name of the sub-module under ``text``.
        self.language_module_map = {"zh": "chinese", "en": "english"}
        # Language code -> object that performs the normalization.
        self.tn_implements = dict()
        if self.tn_module not in TN_MODULES:
            return
        # NOTE(review): the source layout did not preserve indentation; the
        # "else" is read as the alternative to the "baidu" check (i.e. the
        # "wenet" branch), not as a for/else clause -- confirm.
        if self.tn_module == "baidu":
            for lang, name in self.language_module_map.items():
                self.tn_implements[lang] = __import__(
                    "text." + name, fromlist=[name])
        else:
            for lang in self.language_module_map:
                if lang == "en":
                    self.tn_implements[lang] = EnNormalizer(
                        overwrite_cache=True)
                else:
                    self.tn_implements[lang] = ZhNormalizer(
                        remove_erhua=remove_erhua,
                        remove_interjections=remove_interjections,
                        overwrite_cache=True)

    def _preprocess(self, text, normalize_punct):
        """Apply the character-level clean-up shared by the public methods."""
        # NOTE(review): the source layout did not preserve indentation; only
        # the punctuation step is assumed to be gated by the flag, with letter
        # folding and lower-casing always applied -- confirm.
        if normalize_punct:
            text = punct_normalization(text)
        text = alphabet_normalization(text)
        return text.lower()

    def _do_tn(self, text, language="zh"):
        """Normalize one single-language segment.

        Args:
            text: The segment to normalize.
            language: Language code of the segment. Codes missing from
                ``language_module_map`` use the "zh" normalizer.

        Returns:
            The normalized segment with runs of spaces collapsed to one space.
        """
        key = language if language in self.language_module_map else "zh"
        # .get() rather than indexing: when no back end was loaded the lookup
        # yields None, both hasattr() checks fail and the text passes through
        # instead of raising KeyError.
        module = self.tn_implements.get(key)
        if self.tn_module == "baidu" and hasattr(module, "text_normalize"):
            norm_text = module.text_normalize(text)
        elif self.tn_module == "wenet" and hasattr(module, "normalize"):
            norm_text = module.normalize(text)
        else:
            norm_text = text
        # NOTE(review): the source layout did not preserve indentation; both
        # punctuation helpers (imported from text.chinese) are assumed to be
        # applied to "zh" segments only -- confirm.
        if language == "zh":
            norm_text = replace_punctuation_with_en(norm_text)
            norm_text = replace_consecutive_punctuation(norm_text)
        # Collapse runs of spaces. The search string must be TWO spaces:
        # replacing a single space with a single space never terminates.
        while "  " in norm_text:
            norm_text = norm_text.replace("  ", " ")
        return norm_text

    def normalize_segment(self, text, language, normalize_punct=False):
        """Normalize *text* as a single segment written in *language*.

        Args:
            text: Input text; it is not split by language.
            language: Language code passed to ``_do_tn``.
            normalize_punct: When true, apply ``punct_normalization`` first.

        Returns:
            The normalized text.
        """
        return self._do_tn(self._preprocess(text, normalize_punct), language)

    def normalize(self, text, language, normalize_punct=False):
        """Split *text* by language and normalize every segment.

        Args:
            text: Input text, possibly mixing languages.
            language: "auto" to use the language LangSegment reports for each
                segment; otherwise the language assumed for every segment
                that LangSegment does not report as "en".
            normalize_punct: When true, apply ``punct_normalization`` first.

        Returns:
            The normalized segments concatenated in their original order.
        """
        text = self._preprocess(text, normalize_punct)
        LangSegment.setfilters(["zh", "ja", "en", "ko"])
        norm_text_list = []
        for segment in LangSegment.getTexts(text):
            if language == "auto" or segment["lang"] == "en":
                segment_lang = segment["lang"]
            else:
                # Chinese, Japanese and Korean ideographs cannot be told
                # apart, so the caller-supplied language wins.
                segment_lang = language
            norm_text_list.append(self._do_tn(segment["text"], segment_lang))
        return ''.join(norm_text_list)


def _prompt():
    """Write the interactive prompt to stderr and make sure it is shown."""
    sys.stderr.write("Input: ")
    sys.stderr.flush()


def _main():
    """Interactive test loop: normalize each stdin line as Chinese text.

    Blank lines are skipped and the line ``exit()`` terminates the loop.
    """
    # Sample inputs for manual testing:
    # text = '1983年2月,旅行了2天的儿童和长翅膀的女孩儿:“︘菜单修订后有鱼香肉丝儿、『王道椒香鸡腿〕和川蜀鸡翅?……”it\'s a test.王会计会计算机。which had been in force since 1760.调查员决定调节调查的难度。Article VI, Qing government would be charged an annual interest rate of 5% for the money'
    # text = 'Just Do It系列广告是哪个品牌的?从以下生物中选择出属于“植物”类的生物:\n\nA. 人 \nB. 杨树 \nC. 猫 \nD. 月季花 \nE. 细菌\nF. 真菌\nG. 灌木\n80/20法则是什么?NHTSA将自动驾驶分为多少个级别?√2和π是不是无理数?'
    language = 'zh'
    normalizer = MultilingualTN()
    _prompt()
    for line in sys.stdin:
        line = line.strip()
        if line == "exit()":
            sys.exit()
        if line:
            sys.stdout.write("{}\n".format(normalizer.normalize(
                line, language=language, normalize_punct=True)))
            # Flush so the result is visible even when stdout is a pipe.
            sys.stdout.flush()
        _prompt()


if __name__ == '__main__':
    _main()