Spaces:
Running
Running
| import re | |
| from typing import Dict, List | |
| from pypinyin import lazy_pinyin, Style | |
| from .custom_pypinyin_dict import phrase_pinyin_data | |
| import jieba | |
| from .cn2an import an2cn | |
# Load the custom pinyin dictionary data.
# NOTE(review): presumably this registers the phrase overrides with pypinyin
# before any lazy_pinyin() call below -- confirm in custom_pypinyin_dict.
phrase_pinyin_data.load()
# Punctuation / symbol normalization table.
# Every key is a single character; str.translate() replaces it with the mapped
# string (which may be longer than one character).
PUNC_MAP: Dict[str, str] = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "$": ".",
    # NOTE: "/" used to be listed here as "," but was always overwritten by the
    # "/" -> "每" entry in the update() below; the dead entry has been removed.
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "~",
    "「": "'",
    "」": "'",
    "『": "'",
    "』": "'",
    # Full-width forms of the ASCII punctuation above, written as escapes so
    # they cannot be silently folded to their ASCII look-alikes.
    # NOTE(review): the identity entries (",", "!", "?", "~") and the duplicated
    # "(" / ")" pair suggest these were the intended keys -- confirm upstream.
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "\uff08": "'",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "'",  # FULLWIDTH RIGHT PARENTHESIS
    "\uff5e": "~",  # FULLWIDTH TILDE
}
# from GPT_SoVITS.text.zh_normalization.text_normlization
PUNC_MAP.update({
    "/": "每",
    "①": "一",
    "②": "二",
    "③": "三",
    "④": "四",
    "⑤": "五",
    "⑥": "六",
    "⑦": "七",
    "⑧": "八",
    "⑨": "九",
    "⑩": "十",
    "α": "阿尔法",
    "β": "贝塔",
    "γ": "伽玛",
    "Γ": "伽玛",
    "δ": "德尔塔",
    "Δ": "德尔塔",
    "ε": "艾普西龙",
    "ζ": "捷塔",
    "η": "依塔",
    "θ": "西塔",
    "Θ": "西塔",
    "ι": "艾欧塔",
    "κ": "喀帕",
    "λ": "拉姆达",
    "Λ": "拉姆达",
    "μ": "缪",
    "ν": "拗",
    "ξ": "克西",
    "Ξ": "克西",
    "ο": "欧米克伦",
    "π": "派",
    "Π": "派",
    "ρ": "肉",
    "ς": "西格玛",
    "σ": "西格玛",
    "Σ": "西格玛",
    "τ": "套",
    "υ": "宇普西龙",
    "φ": "服艾",
    "Φ": "服艾",
    "χ": "器",
    "ψ": "普赛",
    "Ψ": "普赛",
    "ω": "欧米伽",
    "Ω": "欧米伽",
    "+": "加",
    "-": "减",
    "×": "乘",
    "÷": "除",
    "=": "等",
    "嗯": "恩",
    "呣": "母",
})
# Translation table consumed by str.translate(); built once at import time.
PUNC_TABLE = str.maketrans(PUNC_MAP)
# Number normalization: matches a run of digits, optionally followed by a
# second run of digits with an optional "." in between (e.g. "12", "3.14").
NUMBER_PATTERN: re.Pattern = re.compile(r'\d+(?:\.?\d+)?')
# Convert Arabic numerals to Chinese characters.
def replace_number(match: re.Match) -> str:
    """Substitution callback: pass the whole matched text to ``an2cn``.

    Args:
        match: A regex match whose full text is the number to convert.

    Returns:
        Whatever ``an2cn`` returns for the matched text.
    """
    matched_number = match.group(0)
    return an2cn(matched_number)
def normalize_number(text: str) -> str:
    """Return *text* with every ``NUMBER_PATTERN`` match rewritten.

    Each match is replaced by the result of ``replace_number``; text outside
    the matches is left untouched.
    """
    converted = re.sub(NUMBER_PATTERN, replace_number, text)
    return converted
# Get the symbols of the phones; not used.
def load_pinyin_symbols(path: str) -> Dict[str, List[str]]:
    """Read a ``syllable,phones`` table and print every phone/tone symbol.

    Each non-blank line is split on ","; the first field is the syllable and
    the second field is split on single spaces into its phones. For every
    distinct phone the tone-suffixed variants ``phone1`` .. ``phone5`` are
    built, and the full list is printed sorted by length.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        Mapping from syllable to its list of phones.

    Raises:
        IndexError: If a non-blank line contains no ",".
    """
    pinyin_dict: Dict[str, List[str]] = {}
    distinct_phones = set()
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no entry.
            if not entry:
                continue
            cuts = entry.split(',')
            phones = cuts[1].split(' ')
            pinyin_dict[cuts[0]] = phones
            distinct_phones.update(phones)
    # Iterate the phones in sorted order so the printed list is reproducible
    # between runs instead of depending on set iteration order.
    tone = [phone + str(i) for phone in sorted(distinct_phones) for i in range(1, 6)]
    print(sorted(tone, key=len))
    return pinyin_dict
def load_pinyin_dict(path: str) -> Dict[str, List[str]]:
    """Load a ``syllable,phones`` table into a dictionary.

    Each non-blank line is split at its first ","; the left part is the key
    and the right part is split on whitespace into the list of phones.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        Mapping from syllable to its list of phones.

    Raises:
        ValueError: If a non-blank line contains no ",".
    """
    pinyin_dict: Dict[str, List[str]] = {}
    with open(path, "r", encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            entry = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no entry.
            if not entry:
                continue
            key, sep, value = entry.partition(',')
            if not sep:
                raise ValueError(
                    f"{path}:{line_no}: expected 'syllable,phones', got {entry!r}"
                )
            pinyin_dict[key] = value.split()
    return pinyin_dict
| import os | |
# Syllable -> phone-list table, loaded once at import time from
# cnm3/ds_CNM3.txt in the directory that contains this module.
pinyin_dict = load_pinyin_dict(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cnm3', 'ds_CNM3.txt'))
# Earlier variant that resolved the path relative to the working directory:
# pinyin_dict = load_pinyin_dict('text/cnm3/ds_CNM3.txt')
def chinese_to_cnm3(text: str) -> List[str]:
    """Convert Chinese text into a flat list of tone-tagged phone symbols.

    The text is normalized with ``PUNC_TABLE`` and ``normalize_number``,
    stripped of a few special characters, segmented with jieba, and each word
    is converted with ``lazy_pinyin`` (TONE3 style, neutral tone written as 5).
    Every resulting token is then handled by its last character:

    * digit  -- the token is a syllable plus tone number; its phones are looked
      up in ``pinyin_dict`` and the tone digit is appended to each phone.
      Tokens whose syllable is not in ``pinyin_dict`` are skipped.
    * letter -- the token is skipped.
    * anything else -- each character of the token is emitted as its own
      symbol.

    Args:
        text: Input text.

    Returns:
        The phone symbols in reading order.
    """
    # Normalize punctuation and numbers.
    text = text.translate(PUNC_TABLE)
    text = normalize_number(text)
    # Filter out special characters.
    text = re.sub(r'[#&@“”^_|\\]', '', text)
    words = jieba.lcut(text, cut_all=False)
    phones: List[str] = []
    for word in words:
        pinyin_list: List[str] = lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        for pinyin in pinyin_list:
            # Guard against an empty token so pinyin[-1] cannot raise.
            if not pinyin:
                continue
            last_char = pinyin[-1]
            if last_char.isdigit():
                syllable_phones = pinyin_dict.get(pinyin[:-1])
                # A digit-terminated token whose stem is not a known syllable
                # (e.g. a stray superscript digit) used to raise KeyError; skip
                # it the same way letter-terminated tokens are skipped.
                if syllable_phones is None:
                    continue
                phones.extend(ph + last_char for ph in syllable_phones)
            elif last_char.isalpha():
                continue
            else:
                # extend() on a str adds one symbol per character.
                phones.extend(pinyin)
    return phones