| from text import cleaned_text_to_sequence |
| import os |
| |
| |
| |
| |
| |
| |
|
|
| from text import symbols as symbols_v1 |
| from text import symbols2 as symbols_v2 |
|
|
# Special silence-segment markers: (marker character, language, target symbol).
# When the marker appears in text of the matching language, clean_text()
# routes the input through clean_special(), which maps the marker to the
# given "SP*" silence symbol.
special = [
    ("¥", "zh", "SP2"),
    ("^", "zh", "SP3"),
]
|
|
|
|
def clean_text(text, language, version=None):
    """Normalize *text* and convert it to a phoneme sequence.

    Args:
        text: Raw input text.
        language: Language code ("zh", "ja", "en", "ko", "yue").  Unknown
            codes fall back to English with a blank sentence.
        version: Symbol-set version ("v1" or "v2"); defaults to the
            ``version`` environment variable, then "v2".

    Returns:
        Tuple ``(phones, word2ph, norm_text)``.  ``word2ph`` maps each
        normalized character to its phoneme count for zh/yue and is
        ``None`` for other languages.
    """
    if version is None:
        version = os.environ.get('version', 'v2')
    # v1 and v2 ship different symbol inventories and language front-ends.
    if version == "v1":
        symbols = symbols_v1.symbols
        language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
    else:
        symbols = symbols_v2.symbols
        language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}

    # Unsupported language: degrade to English with a blank sentence.
    if language not in language_module_map:
        language = "en"
        text = " "

    # Special silence markers get a dedicated handling path.
    for special_s, special_l, target_symbol in special:
        if special_s in text and language == special_l:
            return clean_special(text, language, special_s, target_symbol, version)

    language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
    if hasattr(language_module, "text_normalize"):
        norm_text = language_module.text_normalize(text)
    else:
        norm_text = text

    if language == "zh" or language == "yue":
        phones, word2ph = language_module.g2p(norm_text)
        assert len(phones) == sum(word2ph)
        assert len(norm_text) == len(word2ph)
    elif language == "en":
        phones = language_module.g2p(norm_text)
        # Pad very short sequences so downstream consumers always see
        # at least 4 phonemes.
        if len(phones) < 4:
            phones = [','] * (4 - len(phones)) + phones
        word2ph = None
    else:
        phones = language_module.g2p(norm_text)
        word2ph = None

    # Map out-of-vocabulary phonemes to 'UNK'.  The original wrapped this
    # comprehension in a redundant outer `for ph in phones` loop, rebuilding
    # the list len(phones) times (accidental O(n^2)); one pass suffices.
    phones = ['UNK' if ph not in symbols else ph for ph in phones]
    return phones, word2ph, norm_text
|
|
|
|
def clean_special(text, language, special_s, target_symbol, version=None):
    """Handle a special silence-segment (SP) marker in *text*.

    The marker *special_s* is replaced with "," so the normal g2p pipeline
    can run, then every "," phoneme in the output is rewritten to
    *target_symbol* (e.g. "SP2"/"SP3").

    (Fix: the original left this description as a dead string expression in
    the middle of the body; it now sits in the proper docstring position.)

    Returns:
        Tuple ``(phones, word2ph, norm_text)``, mirroring clean_text().
    """
    if version is None:
        version = os.environ.get('version', 'v2')
    if version == "v1":
        symbols = symbols_v1.symbols
        language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
    else:
        symbols = symbols_v2.symbols
        language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}

    # Substitute the marker with a comma so text_normalize/g2p accept it.
    text = text.replace(special_s, ",")
    language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
    norm_text = language_module.text_normalize(text)
    # zh/yue g2p returns (phones, word2ph); both parts are forwarded.
    phones = language_module.g2p(norm_text)
    new_ph = []
    for ph in phones[0]:
        assert ph in symbols
        if ph == ",":
            new_ph.append(target_symbol)
        else:
            new_ph.append(ph)
    return new_ph, phones[1], norm_text
|
|
|
|
def text_to_sequence(text, language, version=None):
    """Convert raw *text* to a sequence of symbol IDs.

    Args:
        text: Raw input text.
        language: Language code forwarded to clean_text().
        version: Symbol-set version; the ``version`` environment variable
            takes precedence, then "v2".
    """
    version = os.environ.get('version', version)
    if version is None:
        version = 'v2'
    # Bug fix: the original called clean_text(text) without the required
    # `language` argument (TypeError) and would have passed the whole
    # (phones, word2ph, norm_text) tuple on to cleaned_text_to_sequence.
    phones, word2ph, norm_text = clean_text(text, language, version)
    return cleaned_text_to_sequence(phones, version)
|
|
|
|
if __name__ == "__main__":
    # Smoke test: phonemize a Chinese sentence (contains a "%" character
    # to exercise text normalization).
    print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
|
|