Spaces:
Runtime error
Runtime error
| import re | |
| from pypinyin import lazy_pinyin, Style | |
| from .custom_pypinyin_dict import phrase_pinyin_data | |
| phrase_pinyin_data.load() | |
| import jieba | |
| from .cn2an import an2cn | |
# Punctuation normalization: map CJK / typographic punctuation to a small
# ASCII set.
#
# Keys are written as \u escapes so the table survives being re-encoded.
# Every key must be exactly one character: str.maketrans() raises ValueError
# for longer string keys, and duplicate keys would silently overwrite each
# other.
# NOTE(review): the non-ASCII keys were reconstructed from a mis-decoded copy
# of this table. The CJK bracket pairs (U+300A..U+3011) are the most likely
# originals but should be confirmed against the upstream file.
punc_map = {
    "\uff1a": ",",  # fullwidth colon
    "\uff1b": ",",  # fullwidth semicolon
    "\uff0c": ",",  # fullwidth comma
    "\u3002": ".",  # ideographic full stop
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "\u00b7": ",",  # middle dot
    "\u3001": ",",  # ideographic comma
    "$": ".",
    "\u201c": "'",  # left double quotation mark
    "\u201d": "'",  # right double quotation mark
    '"': "'",
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "(": "'",
    ")": "'",
    "\u300a": "'",  # left double angle bracket
    "\u300b": "'",  # right double angle bracket
    "\u3010": "'",  # left black lenticular bracket
    "\u3011": "'",  # right black lenticular bracket
    "[": "'",
    "]": "'",
    "\u2014": "-",  # em dash
    "\uff5e": "~",  # fullwidth tilde
    "\u300c": "'",  # left corner bracket
    "\u300d": "'",  # right corner bracket
    "\u300e": "'",  # left white corner bracket
    "\u300f": "'",  # right white corner bracket
}

# Translation table consumed by str.translate().
punc_table = str.maketrans(punc_map)
# Number normalization
# Matches a run of digits, optionally followed by a fractional part.
number_pattern = re.compile(r'\d+(?:\.?\d+)?')


def replace_number(match):
    """Return the ``an2cn`` conversion of the text matched by *match*.

    Intended as the replacement callback for ``number_pattern``.
    ``an2cn`` is imported from the local ``cn2an`` module; presumably it
    spells an Arabic numeral out in Chinese -- confirm against that module.
    """
    matched_text = match.group()
    return an2cn(matched_text)


def normalize_number(text):
    """Replace every ``number_pattern`` match in *text* via ``replace_number``.

    Text that contains no digits is returned unchanged.
    """
    return re.sub(number_pattern, replace_number, text)
# get symbols of phones
def load_pinyin_symbols(path):
    """Load a pinyin-to-phones table and print every toned phone symbol.

    Each non-blank line of the file is ``<pinyin>,<phones>``, where the
    phones are separated by whitespace. Blank lines are skipped. Anything
    after a second comma on a line is ignored.

    As a side effect, every distinct phone is suffixed with the tone digits
    1-5 and the resulting symbols are printed, ordered by length and then
    alphabetically so the output is reproducible between runs.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        dict mapping each pinyin string to its list of phones.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    pinyin_dict = {}
    unique_phones = set()
    with open(path, "r", encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank / trailing empty lines in the table.
                continue
            cuts = line.split(',')
            # split() without an argument collapses repeated spaces, so no
            # empty-string "phone" can leak into the table or symbol list.
            phones = cuts[1].split()
            pinyin_dict[cuts[0]] = phones
            unique_phones.update(phones)
    tone = [phone + str(i) for phone in unique_phones for i in range(1, 6)]
    # Secondary alphabetical key: set iteration order is not stable.
    print(sorted(tone, key=lambda symbol: (len(symbol), symbol)))
    return pinyin_dict
def load_pinyin_dict(path):
    """Load a pinyin-to-phones lookup table from a text file.

    Each non-blank line is ``<pinyin>,<phones>``; the part after the first
    comma is split on whitespace into the phone list. Blank lines are
    skipped so a stray empty line cannot abort loading.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        dict mapping each pinyin string to its list of phones.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    pinyin_dict = {}
    with open(path, "r", encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            key, value = line.split(',', 1)
            pinyin_dict[key] = value.split()
    return pinyin_dict
# Syllable -> phones table used by chinese_to_cnm3, loaded once at import.
# NOTE(review): the path is relative, so it resolves against the process
# working directory rather than this file's location -- confirm that callers
# always start from the expected directory.
pinyin_dict = load_pinyin_dict("text/cnm3/ds_CNM3.txt")
def chinese_to_cnm3(text: str):
    """Convert *text* into a flat list of tone-suffixed phone symbols.

    Steps, in order:
      1. punctuation is mapped through ``punc_table``;
      2. numbers are rewritten by ``normalize_number``;
      3. the text is segmented with ``jieba.lcut(..., cut_all=False)``;
      4. each segment is converted with ``lazy_pinyin`` using
         ``Style.TONE3`` and ``neutral_tone_with_five=True``.

    For every item returned by ``lazy_pinyin``:
      * if it ends in a digit, that digit is the tone; the remainder is
        looked up in ``pinyin_dict`` and the tone is appended to each phone;
      * if it ends in an alphabetic character, it is dropped;
      * otherwise each of its characters is appended as its own symbol.

    Returns:
        list of phone / symbol strings.

    Raises:
        KeyError: If a toned syllable is missing from ``pinyin_dict``.
    """
    normalized = normalize_number(text.translate(punc_table))
    result = []
    for segment in jieba.lcut(normalized, cut_all=False):
        for item in lazy_pinyin(segment, style=Style.TONE3, neutral_tone_with_five=True):
            last_char = item[-1]
            if last_char.isdigit():
                # Trailing digit is the tone number; the rest is the syllable.
                syllable_phones = pinyin_dict[item[:-1]]
                result.extend([unit + last_char for unit in syllable_phones])
                continue
            if not last_char.isalpha():
                # Extending with a string adds one entry per character.
                result.extend(item)
    return result