|
|
|
|
| import sys
|
| import re
|
| import cn2an
|
|
|
| from pyjyutping import jyutping
|
| from text.symbols import punctuation
|
| from text.zh_normalization.text_normlization import TextNormalizer
|
|
|
def normalizer(x):
    """Return ``x`` after ``cn2an.transform`` has processed it in "an2cn" mode.

    Defined with ``def`` rather than as a lambda bound to a name so that the
    callable has a real ``__name__`` in tracebacks and can carry this
    docstring.

    NOTE(review): per the cn2an documentation the "an2cn" mode rewrites Arabic
    numerals found in the text as Chinese numerals -- confirm against the
    installed cn2an version.

    Args:
        x: Text to transform.

    Returns:
        Whatever ``cn2an.transform(x, "an2cn")`` returns.
    """
    return cn2an.transform(x, "an2cn")
|
|
|
# Prefixes tried, in this exact order, by jyuping_to_initials_finals_tones()
# when it splits a tone-stripped syllable: the first entry the syllable starts
# with wins. The order is therefore significant (for example "ts" is listed
# before "t", and "ng" before "n", so the longer prefix gets to match).
INITIALS = [
    "aa", "aai", "aak", "aap", "aat", "aau",
    "ai", "au", "ap", "at", "ak", "a",
    "p", "b", "e",
    "ts", "t", "dz", "d",
    "kw", "k", "gw", "g",
    "f", "h", "l", "m", "ng", "n", "s", "y", "w", "c", "z", "j",
    "ong", "on", "ou", "oi", "ok", "o",
    "uk", "ung",
]
# NOTE(review): these four look like pause / silence / noise markers rather
# than syllable prefixes -- confirm with the owner of the phone inventory.
INITIALS.extend(["sp", "spl", "spn", "sil"])
|
|
|
|
|
# Maps punctuation found in raw text onto the small set of marks the rest of
# this module works with. replace_punctuation() builds a single regex
# alternation from these keys and substitutes every match with its value.
#
# Full-width keys are spelled as \uXXXX escapes on purpose: a width-folding
# editor or pipeline cannot silently collapse them into their ASCII twins,
# which would leave duplicate keys in the literal and no mapping at all for
# the full-width forms. Every ASCII key that was already present is kept.
rep_map = {
    # Clause and sentence separators.
    ":": ",",
    "\uff1a": ",",  # full-width colon
    ";": ",",
    "\uff1b": ",",  # full-width semicolon
    ",": ",",
    "\uff0c": ",",  # full-width comma
    "。": ".",
    "!": "!",
    "\uff01": "!",  # full-width exclamation mark
    "?": "?",
    "\uff1f": "?",  # full-width question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    # Quotes and brackets all collapse to a plain apostrophe.
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # full-width left parenthesis
    "\uff09": "'",  # full-width right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    # Dashes and tildes collapse to a hyphen.
    "—": "-",
    "~": "-",
    "\uff5e": "-",  # full-width tilde
    "「": "'",
    "」": "'",
}
|
|
|
|
|
def replace_punctuation(text):
    """Map punctuation through ``rep_map`` and strip every other foreign character.

    Two passes are made over ``text``:

    1. Every occurrence of a ``rep_map`` key is replaced by its value.
    2. Every run of characters that is neither in the range U+4E00-U+9FA5 nor
       one of the characters of ``punctuation`` is deleted.

    Args:
        text: Input string.

    Returns:
        The filtered string.
    """
    key_pattern = re.compile("|".join(re.escape(key) for key in rep_map))
    mapped = key_pattern.sub(lambda match: rep_map[match.group()], text)

    # The punctuation marks are spliced into a character class, so they must
    # be escaped: an unescaped "-", "]", "^" or "\" among them would otherwise
    # create a range, close the class early, or start an escape sequence.
    kept_chars = r"\u4e00-\u9fa5" + re.escape("".join(punctuation))
    return re.sub("[^" + kept_chars + "]+", "", mapped)
|
|
|
|
|
def text_normalize(text):
    """Normalize ``text`` and reduce it to the characters this module keeps.

    A fresh ``TextNormalizer`` is created on every call; the items it yields
    from ``normalize(text)`` are each passed through ``replace_punctuation``
    and concatenated in order with no separator.

    Args:
        text: Raw input text.

    Returns:
        The concatenation of the filtered items (``""`` when the normalizer
        yields nothing).
    """
    sentences = TextNormalizer().normalize(text)
    # str.join builds the result in one pass instead of repeated "+=".
    return "".join(replace_punctuation(sentence) for sentence in sentences)
|
|
|
|
|
# Set form of the punctuation marks, used for the label check at the end of
# jyuping_to_initials_finals_tones().
punctuation_set = set(punctuation)


def jyuping_to_initials_finals_tones(jyuping_syllables):
    """Turn romanized syllables into phone labels and per-token phone counts.

    Each token of ``jyuping_syllables`` is handled as follows:

    * A token found in ``punctuation``, or the token ``"_"``, yields one unit
      with tone 0 and a ``word2ph`` entry of 1.
    * Any other token is treated as a syllable. If its last character parses
      as an integer, that is the tone and it is stripped; otherwise the tone
      is 0. The remainder is split at the first ``INITIALS`` entry it starts
      with into two units (leading part, rest), with tones ``-1`` and the
      syllable tone, and a ``word2ph`` entry of 2. When nothing follows the
      matched prefix, the prefix's last character is used as the second unit.
    * A syllable whose remainder starts with no ``INITIALS`` entry adds
      nothing to either output.

    Finally every unit becomes a label: the tone is appended unless it is
    ``-1`` or ``0``, and ``"Y"`` is prefixed unless the label is a
    punctuation mark.

    Args:
        jyuping_syllables: Iterable of non-empty string tokens.

    Returns:
        A ``(phones, word2ph)`` tuple of lists.
    """
    units = []
    unit_tones = []
    word2ph = []

    for syllable in jyuping_syllables:
        # Punctuation and the "_" placeholder pass through as one unit.
        if syllable in punctuation or syllable == "_":
            units.append(syllable)
            unit_tones.append(0)
            word2ph.append(1)
            continue

        # A trailing digit is the tone; anything else means "no tone".
        try:
            tone = int(syllable[-1])
        except ValueError:
            tone = 0
            base = syllable
        else:
            base = syllable[:-1]

        for initial in INITIALS:
            if not base.startswith(initial):
                continue
            if base.startswith("nga"):
                # Always split "nga..." after the first two characters,
                # whichever INITIALS entry happened to match.
                head = base[:2]
                tail = base[2:] or base[-1]
            else:
                head = initial
                tail = base[len(initial):] or initial[-1]
            units.extend([head, tail])
            unit_tones.extend([-1, tone])
            word2ph.append(2)
            break

    assert len(units) == len(unit_tones)

    phones = []
    for unit, unit_tone in zip(units, unit_tones):
        label = unit if unit_tone in (-1, 0) else unit + str(unit_tone)
        if label not in punctuation_set:
            label = "Y" + label
        phones.append(label)

    return phones, word2ph
|
|
|
|
|
def get_jyutping(text):
    """Romanize ``text`` and split the result into tokens.

    The string returned by ``jyutping.convert(text)`` has a space inserted on
    both sides of every occurrence of each ``punctuation`` mark, then it is
    split on whitespace, so each punctuation mark ends up as its own token.

    Args:
        text: Text to romanize.

    Returns:
        List of whitespace-separated tokens.
    """
    romanized = jyutping.convert(text)
    for mark in punctuation:
        spaced_mark = " " + mark + " "
        romanized = romanized.replace(mark, spaced_mark)
    return romanized.split()
|
|
|
|
|
def get_bert_feature(text, word2ph):
    """Delegate to ``text.chinese_bert.get_bert_feature(text, word2ph)``.

    Args:
        text: Forwarded unchanged as the first argument.
        word2ph: Forwarded unchanged as the second argument.

    Returns:
        Whatever the delegate returns.
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably so the BERT backend is only loaded when
    # features are actually requested -- confirm.
    from text import chinese_bert as bert_backend

    features = bert_backend.get_bert_feature(text, word2ph)
    return features
|
|
|
|
|
def g2p(text):
    """Convert ``text`` to phone labels plus per-token phone counts.

    Chains ``get_jyutping`` and ``jyuping_to_initials_finals_tones``.

    Args:
        text: Input text.

    Returns:
        A ``(phones, word2ph)`` tuple as produced by
        ``jyuping_to_initials_finals_tones``.
    """
    syllables = get_jyutping(text)
    phone_labels, phones_per_token = jyuping_to_initials_finals_tones(syllables)
    return phone_labels, phones_per_token
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: normalize one sample sentence, convert it, and print
    # the resulting phone labels and per-token phone counts.
    text = text_normalize("佢個鋤頭太短啦。")
    phones, word2ph = g2p(text)
    print(phones, word2ph)