multimodalart's picture
Update preprocess/tools/g2p.py
0196c78 verified
import re
import sys
import nltk
# Make sure each NLTK resource looked up here is installed, fetching any that
# nltk.data.find cannot locate. Order matters only for the log output.
for _resource_path, _resource_name in (
    ("taggers/averaged_perceptron_tagger_eng", "averaged_perceptron_tagger_eng"),
    ("corpora/cmudict", "cmudict"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        print(f"[g2p] Downloading missing NLTK resource: {_resource_name}", file=sys.stderr)
        nltk.download(_resource_name, quiet=True)
# Drop the loop variables so the module namespace is left as before.
del _resource_path, _resource_name
import ToJyutping
from g2pM import G2pM
from g2p_en import G2p as G2pE
# A whole English word: ASCII letters, optionally followed by further letter
# groups joined by apostrophes (e.g. "don't").
_EN_WORD_RE = re.compile(r"^[A-Za-z]+(?:'[A-Za-z]+)*$")
# One character in the code point range U+4E00-U+9FFF.
_ZH_WORD_RE = re.compile(r"[\u4e00-\u9fff]")
# Prefixes g2p_transform puts in front of each converted token: EN_FLAG for
# English words, YUE_FLAG when lang is "Cantonese", ZH_FLAG for any other lang.
EN_FLAG = "en_"
YUE_FLAG = "yue_"
ZH_FLAG = "zh_"
# Converter instances are created once at import time and reused by
# g2p_mandarin and g2p_english.
g2p_zh = G2pM()
g2p_en = G2pE()
def is_chinese_char(word: str) -> bool:
    """Return True when ``word`` is exactly one character matched by ``_ZH_WORD_RE``."""
    return len(word) == 1 and _ZH_WORD_RE.fullmatch(word) is not None
def is_english_word(word: str) -> bool:
    """Return True when ``word`` is non-empty and fully matches ``_EN_WORD_RE``."""
    if word:
        return _EN_WORD_RE.fullmatch(word) is not None
    return False
def g2p_cantonese(sent):
    """Return ``ToJyutping.get_jyutping_list(sent)``; readings are with tone.

    g2p_transform reads index 1 of every returned entry.
    """
    jyutping_entries = ToJyutping.get_jyutping_list(sent)
    return jyutping_entries
def g2p_mandarin(sent):
    """Run the module-level ``g2p_zh`` converter on ``sent``.

    The converter is called with ``tone=True`` and ``char_split=False``.
    """
    options = {"tone": True, "char_split": False}
    return g2p_zh(sent, **options)
def g2p_english(word):
    """Return what the module-level ``g2p_en`` converter yields for ``word``.

    g2p_transform joins the returned items with ``-``.
    """
    phonemes = g2p_en(word)
    return phonemes
def g2p_transform(words, lang):
    """Convert word tokens into language-tagged phoneme tokens.

    The result has one slot per input token, in the same order:

    * ``"<SP>"`` is passed through unchanged.
    * Every other token first has ASCII ``?``, ``.``, ``!`` and ``,`` removed.
    * A single Chinese character (see ``is_chinese_char``) becomes
      ``YUE_FLAG`` + its jyutping when ``lang`` is ``"Cantonese"``, otherwise
      ``ZH_FLAG`` + its pinyin. All Chinese characters are joined into one
      string and converted in a single call.
    * A Chinese character for which the Cantonese converter reports no
      reading (``None``) becomes ``"<SP>"`` instead of raising ``TypeError``.
    * An English word (see ``is_english_word``) is lower-cased and becomes
      ``EN_FLAG`` followed by the ``g2p_english`` output joined with ``-``.
    * Anything else becomes ``"<SP>"``.

    Args:
        words: Sequence of string tokens.
        lang: Language name. Only the exact value ``"Cantonese"`` selects the
            Cantonese converter; any other value selects the Mandarin one.

    Returns:
        A list with ``len(words)`` entries.
    """
    # One pass over the token instead of four chained .replace() calls.
    strip_punct = str.maketrans("", "", "?.!,")

    transformed_words = [0] * len(words)
    zh_positions = []  # indices in `words` that hold a Chinese character
    zh_chars = []  # the characters themselves, in the same order

    for idx, raw in enumerate(words):
        if raw == "<SP>":
            transformed_words[idx] = raw
            continue
        token = raw.translate(strip_punct)
        if is_chinese_char(token):
            # Deferred: converted together below, in one converter call.
            zh_positions.append(idx)
            zh_chars.append(token)
        elif is_english_word(token):
            transformed_words[idx] = EN_FLAG + "-".join(g2p_english(token.lower()))
        else:
            transformed_words[idx] = "<SP>"

    sent = "".join(zh_chars)
    # zh (zh and yue) transformer to g2p
    if sent:
        if lang == "Cantonese":
            # Each entry carries the reading at index 1; ToJyutping gives None
            # there for a character it has no reading for, so fall back to
            # the same "<SP>" token used for other unconvertible input.
            g2pm_rst = []
            for entry in g2p_cantonese(sent):  # with tone
                reading = entry[1]
                g2pm_rst.append("<SP>" if reading is None else YUE_FLAG + reading)
        else:
            g2pm_rst = [ZH_FLAG + pinyin for pinyin in g2p_mandarin(sent)]
        # NOTE(review): zip stops at the shorter sequence, so if a converter
        # returns fewer entries than characters the remaining slots keep the
        # placeholder 0 - TODO confirm both converters return one entry per
        # character.
        for pos, tagged in zip(zh_positions, g2pm_rst):
            transformed_words[pos] = tagged
    return transformed_words