import re import sys import nltk # Ensure NLTK data is available try: nltk.data.find('taggers/averaged_perceptron_tagger_eng') except LookupError: print("[g2p] Downloading missing NLTK resource: averaged_perceptron_tagger_eng", file=sys.stderr) nltk.download('averaged_perceptron_tagger_eng', quiet=True) try: nltk.data.find('corpora/cmudict') except LookupError: print("[g2p] Downloading missing NLTK resource: cmudict", file=sys.stderr) nltk.download('cmudict', quiet=True) import ToJyutping from g2pM import G2pM from g2p_en import G2p as G2pE _EN_WORD_RE = re.compile(r"^[A-Za-z]+(?:'[A-Za-z]+)*$") _ZH_WORD_RE = re.compile(r"[\u4e00-\u9fff]") EN_FLAG = "en_" YUE_FLAG = "yue_" ZH_FLAG = "zh_" g2p_zh = G2pM() g2p_en = G2pE() def is_chinese_char(word: str) -> bool: if len(word) != 1: return False return bool(_ZH_WORD_RE.fullmatch(word)) def is_english_word(word: str) -> bool: if not word: return False return bool(_EN_WORD_RE.fullmatch(word)) def g2p_cantonese(sent): return ToJyutping.get_jyutping_list(sent) # with tone def g2p_mandarin(sent): return g2p_zh(sent, tone=True, char_split=False) def g2p_english(word): return g2p_en(word) def g2p_transform(words, lang): zh_words = [] transformed_words = [0] * len(words) for idx, w in enumerate(words): if w == "": transformed_words[idx] = w continue w = w.replace("?", "").replace(".", "").replace("!", "").replace(",", "") if is_chinese_char(w): zh_words.append([idx, w]) else: if is_english_word(w): w = EN_FLAG + "-".join(g2p_english(w.lower())) else: w = "" transformed_words[idx] = w sent = "".join([k[1] for k in zh_words]) # zh (zh and yue) transformer to g2p if len(sent) > 0: if lang == "Cantonese": g2pm_rst = g2p_cantonese(sent) # with tone g2pm_rst = [YUE_FLAG + k[1] for k in g2pm_rst] else: g2pm_rst = g2p_mandarin(sent) g2pm_rst = [ZH_FLAG + k for k in g2pm_rst] for p, w in zip([k[0] for k in zh_words], g2pm_rst): transformed_words[p] = w return transformed_words