| | import os |
| | import re |
| |
|
| | import cn2an |
| | from pypinyin import lazy_pinyin, Style |
| |
|
| | from .symbols import punctuation |
| | from .tone_sandhi import ToneSandhi |
| |
|
# Directory that contains this module; data files are resolved relative to it.
current_file_path = os.path.dirname(__file__)

# Lookup table built from "opencpop-strict.txt": the first tab-separated field
# of each line is the key and the second field is the value. _g2p() looks
# pinyin syllables up here and splits the value on spaces to get its phones.
# The file is opened in a `with` block so the handle is closed deterministically,
# with an explicit encoding so the result does not depend on the locale.
with open(
    os.path.join(current_file_path, "opencpop-strict.txt"), encoding="utf-8"
) as _mapping_file:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1]
        for line in _mapping_file
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing the whole import with an IndexError.
        if line.strip()
    }
| |
|
| | import jieba.posseg as psg |
| |
|
| |
|
# Punctuation normalisation table used by replace_punctuation(): every key
# found in the input text is replaced by its value.
#
# Each ASCII mark is listed once, followed by its full-width (CJK) counterpart
# written as an explicit escape so the two forms cannot be confused or silently
# merged into duplicate dictionary keys.
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    ",": ",",
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "!": "!",
    "\uff01": "!",  # fullwidth exclamation mark
    "?": "?",
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "\uff5e": "-",  # fullwidth tilde
    "「": "'",
    "」": "'",
}
| |
|
# Single module-level ToneSandhi instance; _g2p() calls its
# pre_merge_for_modify() and modified_tone() methods for every segment.
tone_modifier = ToneSandhi()
| |
|
| |
|
def replace_punctuation(text):
    """Normalise punctuation and strip every unsupported character.

    Steps, in order:

    1. Replace the characters "嗯" and "呣" with "恩" and "母".
    2. Replace every occurrence of a ``rep_map`` key with its mapped value.
    3. Delete every run of characters that is neither in the range
       U+4E00..U+9FA5 nor one of the symbols in ``punctuation``.

    Args:
        text: Input string.

    Returns:
        The cleaned string, containing only characters from the range above
        and symbols from ``punctuation``.
    """
    # NOTE(review): presumably these two characters are swapped for
    # equivalents whose pinyin exists in the symbol table -- confirm.
    text = text.replace("嗯", "恩").replace("呣", "母")

    # Longest keys first so that a multi-character key such as "..." can never
    # be pre-empted by a shorter key that happens to be its prefix.
    keys = sorted(rep_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(p) for p in keys))
    replaced_text = pattern.sub(lambda m: rep_map[m.group()], text)

    # Escape each symbol before placing it inside the character class;
    # otherwise a "-", "]", "^" or "\" in `punctuation` would be read as regex
    # syntax (a range, the end of the class, ...) instead of a literal.
    allowed = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(r"[^\u4e00-\u9fa5" + allowed + r"]+", "", replaced_text)

    return replaced_text
| |
|
| |
|
def g2p(text):
    """Convert text into phones, tones and a per-character phone count.

    The text is split into sentences after every symbol in ``punctuation``
    and handed to ``_g2p``. The three resulting sequences are then padded
    with one extra entry at each end: ``"_"`` for phones, ``0`` for tones and
    ``1`` for ``word2ph``.

    Args:
        text: Input string. Every character must yield exactly one
            ``word2ph`` entry (presumably the output of ``text_normalize``
            -- confirm against callers).

    Returns:
        Tuple ``(phones, tones, word2ph)`` of lists, including the padding.

    Raises:
        AssertionError: If the phone count does not equal ``sum(word2ph)`` or
            if ``word2ph`` does not have one entry per character of ``text``.
    """
    # Zero-width lookbehind: split right after a punctuation mark and swallow
    # any whitespace that follows it. Symbols are escaped so that characters
    # such as "-" or "]" stay literal inside the character class.
    pattern = r"(?<=[{0}])\s*".format("".join(re.escape(p) for p in punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
    phones, tones, word2ph = _g2p(sentences)
    assert sum(word2ph) == len(phones), (sum(word2ph), len(phones))
    assert len(word2ph) == len(text), (len(word2ph), len(text), text)
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
| |
|
| |
|
def _get_initials_finals(word):
    """Return the initials and the tone-numbered finals of ``word``.

    Both lists come from ``lazy_pinyin`` (``Style.INITIALS`` and
    ``Style.FINALS_TONE3``, with ``neutral_tone_with_five=True``) and are
    paired position by position, so the two returned lists always have the
    same length.

    Args:
        word: The string to convert.

    Returns:
        Tuple ``(initials, finals)`` of two equally long lists of strings.
    """
    shared_options = {"neutral_tone_with_five": True}
    raw_initials = lazy_pinyin(word, style=Style.INITIALS, **shared_options)
    raw_finals = lazy_pinyin(word, style=Style.FINALS_TONE3, **shared_options)

    # zip() stops at the shorter list, keeping the two outputs aligned.
    paired = list(zip(raw_initials, raw_finals))
    initials = [initial for initial, _ in paired]
    finals = [final for _, final in paired]
    return initials, finals
| |
|
| |
|
# Finals that are respelled when they follow a non-empty initial.
_FINAL_AFTER_INITIAL_MAP = {
    "uei": "ui",
    "iou": "iu",
    "uen": "un",
}

# Whole syllables without an initial that are respelled as a unit.
_ZERO_INITIAL_SYLLABLE_MAP = {
    "ing": "ying",
    "i": "yi",
    "in": "yin",
    "u": "wu",
}

# First-letter rewrite for the remaining syllables without an initial.
_ZERO_INITIAL_HEAD_MAP = {
    "v": "yu",
    "e": "e",
    "i": "y",
    "u": "w",
}


def _to_symbol_table_pinyin(initial, final_without_tone):
    """Respell an (initial, toneless final) pair as a key of pinyin_to_symbol_map."""
    if initial:
        return initial + _FINAL_AFTER_INITIAL_MAP.get(
            final_without_tone, final_without_tone
        )
    pinyin = final_without_tone
    if pinyin in _ZERO_INITIAL_SYLLABLE_MAP:
        return _ZERO_INITIAL_SYLLABLE_MAP[pinyin]
    head = pinyin[0]
    return _ZERO_INITIAL_HEAD_MAP.get(head, head) + pinyin[1:]


def _g2p(segments):
    """Convert sentence segments into flat phone, tone and word2ph lists.

    For every segment, runs of ASCII letters are removed, the rest is cut
    into (word, pos) pairs with ``psg.lcut``, merged by
    ``tone_modifier.pre_merge_for_modify``, and each word is converted to
    initials/finals whose tones are adjusted by ``tone_modifier.modified_tone``.
    Each resulting (initial, final) pair is then either

    * a punctuation symbol (initial == final): emitted as itself with tone 0
      and a ``word2ph`` entry of 1, or
    * a syllable: its last character is the tone digit, the rest is respelled
      and looked up in ``pinyin_to_symbol_map``, whose space-separated value
      gives the phones.

    Args:
        segments: Iterable of strings (sentences).

    Returns:
        Tuple ``(phones_list, tones_list, word2ph)``: phone strings, one int
        tone per phone, and the number of phones produced by each
        (initial, final) pair.

    Raises:
        AssertionError: If a non-syllable token is not in ``punctuation``, a
            tone is not one of 1-5, or a respelled syllable is missing from
            ``pinyin_to_symbol_map``.
    """
    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        # Strip runs of ASCII letters from the segment.
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = tone_modifier.pre_merge_for_modify(psg.lcut(seg))
        initials = []
        finals = []
        for word, pos in seg_cut:
            if pos == "eng":
                # Tokens tagged "eng" are skipped. A leftover
                # pdb.set_trace() used to sit here and would freeze any
                # non-interactive run; it has been removed.
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            # Extend in place rather than collecting sub-lists and flattening
            # them with sum(..., []), which copies the list on every step.
            initials.extend(sub_initials)
            finals.extend(sub_finals)

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            if c == v:
                # Identical initial and final means the token was passed
                # through unchanged, which is only allowed for punctuation.
                assert c in punctuation
                phone = [c]
                tone = "0"
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]
                assert tone in "12345"

                pinyin = _to_symbol_table_pinyin(c, v_without_tone)

                assert pinyin in pinyin_to_symbol_map, (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(" ")
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph
| |
|
| |
|
def text_normalize(text):
    """Spell out numbers in Chinese, then normalise punctuation.

    Every match of ``\\d+(?:\\.?\\d+)?`` is replaced, left to right, by the
    result of ``cn2an.an2cn`` for that match; the outcome is passed through
    ``replace_punctuation``.

    Args:
        text: Raw input string.

    Returns:
        The normalised string.
    """
    spelled_out = re.sub(
        r"\d+(?:\.?\d+)?",
        lambda match: cn2an.an2cn(match.group()),
        text,
    )
    return replace_punctuation(spelled_out)
| |
|
| |
|
def get_bert_feature(text, word2ph, device=None):
    """Forward to ``text.chinese_bert.get_bert_feature``.

    The import happens inside the function, so ``text.chinese_bert`` is only
    loaded when this function is called rather than when this module is
    imported.

    Args:
        text: Passed through unchanged as the first positional argument.
        word2ph: Passed through unchanged as the second positional argument.
        device: Passed through as the ``device`` keyword argument.

    Returns:
        Whatever ``chinese_bert.get_bert_feature`` returns.
    """
    from text import chinese_bert

    return chinese_bert.get_bert_feature(text, word2ph, device=device)
| |
|
| |
|
if __name__ == "__main__":
    # Manual smoke test: normalise a sample sentence, convert it to phones and
    # compute its BERT features.
    #
    # NOTE: this import deliberately shadows the module-level
    # get_bert_feature() wrapper, so the call below reaches
    # text.chinese_bert.get_bert_feature directly with its own defaults.
    from text.chinese_bert import get_bert_feature

    # The backslash is written as "\\" because "\游" is not a valid escape
    # sequence; the runtime value of the string is unchanged.
    text = "啊!chemistry 但是《原神》是由,米哈\\游自主, [研发]的一款全.新开放世界.冒险游戏"
    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)
| |
|
| |
|
| | |
| | |
| | |
| |
|