| | |
| |
|
| | |
| | |
| |
|
| | from re import M |
| | from tn.chinese.normalizer import Normalizer as ZhNormalizer |
| | from tn.english.normalizer import Normalizer as EnNormalizer |
| | import LangSegment |
| | from text import symbols as symbols_v1 |
| | from text.chinese import replace_consecutive_punctuation, replace_punctuation_with_en, replace_punctuation |
| | import sys |
| |
|
| |
|
# TN back-ends this module can drive: "baidu" = project-local text.* modules,
# "wenet" = WeTextProcessing Zh/En normalizers.
TN_MODULES = {'baidu', 'wenet'}


# Full-width / CJK punctuation -> ASCII replacement table.
# NOTE(review): the source was extraction-garbled — it contained a bare ''' key
# (reconstructed below as the full-width apostrophe '＇'); also '_': '"' looks
# suspicious — confirm both against the upstream file.
PUNCT_NORMALIZE = {',': ',', '。': '.', '、': ',', ';': ',', '‘': ',', '【': ',', '】': ',', '·': ',', '《': ',', '》': ',', '?': '?',
                   ':': ',', '“': ',', '”': ',', '!': '!', '…': ',', ')': ',', '(': ',', '〃': ',', '〈': ',', '〉': ',',
                   '「': ',', '」': ',', '『': ',', '』': ',', '〖': ',', '〔': ',', '〕': ',', '〗': ',', '〞': ',', '〝': ',', '﹚': ',',
                   '﹙': ',', '﹛': ',', '﹜': ',', '﹝': ',', '﹞': ',', '!': '!', '"': ',', '＇': ',', '︐': ',', '︑': ',', '︒': ',',
                   '︔': ',', '︓': ',', '︕': '!', '︖': '?', '︗': ',', '︘': ',', '︙': ',', '︰': ',', '︱': ',', '︳': ',', '︵': ',',
                   '︶': ',', '︷': ',', '︸': ',', '︹': ',', '︺': ',', '︻': ',', '︼': ',', '︽': ',', '︾': ',', '︿': ',', '﹀': ',',
                   '﹁': ',', '﹂': ',', '﹃': ',', '﹄': ',', ';': ',', '[': ',', ']': ',', '`': ',', ':': ',', '"': ',',
                   '{': ',', '}': ',', '~': ',', ')': ',', '(': ',', '_': '"', '’': '\'', '^': ',', '﹔': ','}


# Full-width Latin letters (U+FF21..U+FF3A, U+FF41..U+FF5A) -> ASCII letters.
# NOTE(review): the original literal showed identity ASCII pairs ('a': 'a', ...),
# almost certainly an extraction artifact that flattened full-width keys to
# ASCII. Rebuilt programmatically; ASCII input is unaffected either way because
# the lookup in alphabet_normalization falls through for missing keys.
# TODO confirm against the upstream source.
ALPHABET_NORM = {chr(0xFF41 + i): chr(ord('a') + i) for i in range(26)}
ALPHABET_NORM.update({chr(0xFF21 + i): chr(ord('A') + i) for i in range(26)})
| |
|
| |
|
def punct_normalization(sent):
    """Replace full-width/CJK punctuation in *sent* with ASCII equivalents.

    Characters not present in PUNCT_NORMALIZE pass through unchanged.

    Args:
        sent: input string (may be empty).
    Returns:
        The normalized string, same length relationship as the original
        (each character maps to exactly one character).
    """
    # ''.join over a generator is O(n); the original built the result with
    # repeated string concatenation, which is quadratic in the worst case.
    return ''.join(PUNCT_NORMALIZE.get(ch, ch) for ch in sent)
| |
|
| |
|
def alphabet_normalization(sent):
    """Map letters found in ALPHABET_NORM to their ASCII forms.

    Characters not present in ALPHABET_NORM pass through unchanged.

    Args:
        sent: input string (may be empty).
    Returns:
        The normalized string.
    """
    # Same fix as punct_normalization: join instead of quadratic '+='.
    return ''.join(ALPHABET_NORM.get(ch, ch) for ch in sent)
| |
|
| |
|
class MultilingualTN():
    """Text-normalization (TN) front-end for mixed Chinese/English input.

    Depending on ``module`` it delegates to either the project-local
    "baidu" ``text.*`` modules or the WeTextProcessing ("wenet")
    Zh/En normalizers, and splits mixed-language input with LangSegment
    so each segment is normalized by the matching back-end.
    """

    def __init__(self, module="wenet", remove_interjections=False, remove_erhua=True):
        """Instantiate the per-language normalizers.

        Args:
            module: TN back-end, one of TN_MODULES ("baidu" or "wenet").
                Any other value leaves ``tn_implements`` empty, and _do_tn
                will raise KeyError on lookup (same as the original code).
            remove_interjections: forwarded to ZhNormalizer.
            remove_erhua: forwarded to ZhNormalizer.
        """
        self.tn_module = module
        self.language_module_map = {"zh": "chinese", "en": "english"}
        self.tn_implements = {}
        if self.tn_module in TN_MODULES:
            if self.tn_module == "baidu":
                # Late-import the project's text.<name> module per language.
                for lang, mod_name in self.language_module_map.items():
                    self.tn_implements[lang] = __import__(
                        "text." + mod_name, fromlist=[mod_name])
            else:  # "wenet"
                for lang in self.language_module_map:
                    if lang == "en":
                        self.tn_implements[lang] = EnNormalizer(
                            overwrite_cache=True)
                    else:
                        self.tn_implements[lang] = ZhNormalizer(
                            remove_erhua=remove_erhua,
                            remove_interjections=remove_interjections,
                            overwrite_cache=True)

    def _do_tn(self, text, language="zh"):
        """Run the configured TN back-end on *text*.

        Unknown languages fall back to the "zh" normalizer; back-ends
        missing the expected API pass the text through unchanged.
        """
        lang = language if language in self.language_module_map else "zh"
        module = self.tn_implements[lang]
        if self.tn_module == "baidu" and hasattr(module, "text_normalize"):
            norm_text = module.text_normalize(text)
        elif self.tn_module == "wenet" and hasattr(module, "normalize"):
            norm_text = module.normalize(text)
        else:
            norm_text = text

        if language == "zh":
            # Unify punctuation to ASCII forms and squash punctuation runs.
            norm_text = replace_punctuation_with_en(norm_text)
            norm_text = replace_consecutive_punctuation(norm_text)
        # Collapse runs of spaces. BUG FIX: the original compared and
        # replaced a single space with itself ('while " " in s:
        # s = s.replace(" ", " ")'), an infinite loop whenever a space was
        # present — almost certainly a garbled double-space literal.
        while "  " in norm_text:
            norm_text = norm_text.replace("  ", " ")
        return norm_text

    def normalize_segment(self, text, language, normalize_punct=False):
        """Normalize a single-language segment (no language splitting)."""
        if normalize_punct:
            # assumes the full character clean-up (punct + alphabet + lower)
            # is tied to this flag; the original indentation was ambiguous —
            # TODO confirm
            text = punct_normalization(text)
            text = alphabet_normalization(text)
            text = text.lower()
        return self._do_tn(text, language)

    def normalize(self, text, language, normalize_punct=False):
        """Split *text* by language and normalize each piece.

        ``language == "auto"`` trusts LangSegment's per-segment detection;
        otherwise every non-English segment is forced to ``language``.
        Returns the concatenation of the normalized segments.
        """
        if normalize_punct:
            # Same hedged assumption as normalize_segment — TODO confirm.
            text = punct_normalization(text)
            text = alphabet_normalization(text)
            text = text.lower()

        LangSegment.setfilters(["zh", "ja", "en", "ko"])
        pieces = []
        for seg in LangSegment.getTexts(text):
            if language == "auto" or seg["lang"] == "en":
                seg_lang = seg["lang"]
            else:
                seg_lang = language
            pieces.append(self._do_tn(seg["text"], seg_lang))
        # (Removed the unused phones_list from the original.)
        return ''.join(pieces)
| |
|
| |
|
if __name__ == '__main__':
    # Interactive smoke test: read lines from stdin, print the normalized
    # result to stdout; "exit()" quits, empty lines just re-prompt.
    language = 'zh'
    TN = MultilingualTN()
    sys.stderr.write("Input: ")
    for line in sys.stdin:
        text = line.strip()
        if text == "exit()":
            exit()
        if not text:
            sys.stderr.write("Input: ")
            continue
        # BUG FIX: the original defined `language` but then passed the
        # hard-coded literal "zh"; use the variable (same value today).
        sys.stdout.write("{}\n".format(TN.normalize(
            text, language=language, normalize_punct=True)))
        sys.stderr.write("Input: ")
|