# Extracted from a Hugging Face file view (uploader: jhansss).
# Commit bc5d466: "Replace kanjiconv dependency with pykakasi for broader
# Python compatibility"
# Page links: raw | history | blame
# File size: 4.91 kB
import json
import re
import warnings
from pathlib import Path
import pykakasi
from pypinyin import lazy_pinyin
from .resources.pinyin_dict import PINYIN_DICT
# Shared pykakasi converter instance; to_kana calls its convert() method.
kks = pykakasi.kakasi()
# Maps each small kana to its full-size counterpart.  to_kana uses the keys to
# detect a small kana that must stay attached to the character before it.
# The literals are real hiragana: the previous mis-encoded text could never
# match the hiragana strings produced by the converter.
yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "や",
    "ゅ": "ゆ",
    "ょ": "よ",
    "ゎ": "わ",
}
# ACE_phonemes
# Load every phoneme plan shipped next to this module.  The encoding is given
# explicitly so the JSON is decoded the same way regardless of the platform's
# locale default.
with open(
    Path(__file__).parent / "resources" / "all_plans.json",
    "r",
    encoding="utf-8",
) as f:
    ace_phonemes_all_plans = json.load(f)

# Keep the first plan whose language is "zh"; pinyin_to_phonemes_opencpop
# reads its "dict" and "syllable_alias" tables.
# NOTE: if no plan has language "zh", ace_phonemes_zh_plan is never bound.
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break
def preprocess_text(text: str, language: str) -> list[str]:
    """Split ``text`` into the per-syllable units used for ``language``.

    Args:
        text: Raw input text.
        language: ``"mandarin"`` or ``"japanese"``.

    Returns:
        For ``"mandarin"``, the output of ``to_pinyin`` with every ``" "``
        item removed; for ``"japanese"``, the output of ``to_kana``.

    Raises:
        ValueError: If ``language`` is neither of the two supported values.
    """
    if language == "japanese":
        return to_kana(text)
    if language != "mandarin":
        raise ValueError(f"Other languages are not supported")
    return [syllable for syllable in to_pinyin(text) if syllable != " "]
def to_pinyin(text: str) -> list[str]:
    """Convert ``text`` to a list of pinyin tokens using ``lazy_pinyin``.

    A token whose first character is ``"S"``, ``"A"`` or ``"-"`` is replaced
    by the ``"-"``, ``"AP"`` and ``"SP"`` markers found inside it, in order of
    appearance; anything else in such a token is dropped.  All other tokens
    are kept unchanged.
    """
    tokens = []
    for syllable in lazy_pinyin(text):
        if syllable[0] in ("S", "A", "-"):
            # Split runs such as "SPAP" or "--" into individual markers.
            tokens.extend(re.findall(r"-|AP|SP", syllable))
        else:
            tokens.append(syllable)
    return tokens
# Kana grouped by the vowel that a following "ー" repeats.  Voiced and
# semi-voiced kana are included so that e.g. "でー" becomes "でえ" rather than
# "でで".  Small kana are listed directly (with the vowel of their full-size
# form), so the lookup does not need to go through yoon_map first.
_CHOUONPU_COLUMNS = (
    ("あ", "あかさたなはまやらわがざだばぱぁゃゎ"),
    ("い", "いきしちにひみりぎじぢびぴぃ"),
    ("う", "うくすつぬふむゆるぐずづぶぷゔぅゅ"),
    ("え", "えけせてねへめれげぜでべぺぇ"),
    # A long o-column vowel is written with "う" (e.g. "こー" -> "こう").
    ("う", "おこそとのほもよろをごぞどぼぽぉょ"),
)
# Built once at import time instead of on every call.
_CHOUONPU_VOWELS = {
    kana: vowel for vowel, column in _CHOUONPU_COLUMNS for kana in column
}


def replace_chouonpu(hiragana_text: str) -> str:
    """Replace each long-vowel mark 「ー」 with an explicit vowel kana.

    Every "ー" is replaced by the vowel of the character that precedes it in
    the already-converted output, so consecutive marks keep repeating the same
    vowel.  A "ー" at the very start of the text has nothing to lengthen and is
    kept as is.  If the preceding character has no entry in the vowel table
    (for example "ん"), that character itself is repeated.

    Args:
        hiragana_text: Text in hiragana, possibly containing "ー".

    Returns:
        The text with every non-leading "ー" replaced by a vowel kana.
    """
    new_text = []
    for char in hiragana_text:
        # new_text is non-empty exactly when this is not the first character.
        if char == "ー" and new_text:
            prev_char = new_text[-1]
            new_text.append(_CHOUONPU_VOWELS.get(prev_char, prev_char))
        else:
            new_text.append(char)
    return "".join(new_text)
def to_kana(text: str) -> list[str]:
    """Convert Japanese text into a list of hiragana units.

    Spaces are removed from ``text``, the remainder is converted to hiragana
    with the module-level ``kks`` converter, and long-vowel marks are expanded
    by ``replace_chouonpu``.  The result is then split into units: a unit is a
    single character, or a character together with the character after it
    when that following character is a key of ``yoon_map``.

    Args:
        text: Input text.

    Returns:
        The hiragana units in reading order; an empty list for empty input.
    """
    hiragana_text = "".join(
        item["hira"] for item in kks.convert(text.replace(" ", ""))
    )
    units = []
    for subword in replace_chouonpu(hiragana_text).split(" "):
        pos = 0
        # Every index is bounds-checked.  The previous version appended
        # subword[pos] unconditionally after its loop, which raised IndexError
        # for an empty subword and for a subword ending in a merged pair
        # (e.g. "きゃ").
        while pos < len(subword):
            nxt = pos + 1
            if nxt < len(subword) and subword[nxt] in yoon_map:
                units.append(subword[pos : nxt + 1])
                pos += 2
            else:
                units.append(subword[pos])
                pos += 1
    return units
def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    """Convert a kana string to a list of phonemes with ``pyopenjtalk.g2p``.

    The characters of ``kana`` are joined with single spaces before being
    passed to ``g2p``.  Warnings emitted during the call are recorded, and one
    whose message contains ``"No phoneme"`` is turned into an error.

    Raises:
        ValueError: If such a warning was recorded; the message shows the
            space-separated kana and the warning text.
    """
    import pyopenjtalk

    # Put a space between every pair of characters.
    kana = " ".join(kana)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # g2p returns the phonemes as one space-separated string.
        phones = pyopenjtalk.g2p(kana, kana=False)
    for warning in caught:
        if "No phoneme" in str(warning.message):
            raise ValueError(f"No phoneme found for {kana}. {warning.message}")
    return phones.split(" ")
def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    """Look up the phonemes of a pinyin syllable in the "zh" plan.

    The syllable is lower-cased, then searched in the plan's ``"dict"`` table;
    if absent there, it is resolved through ``"syllable_alias"`` and the alias
    target is looked up in ``"dict"``.

    Raises:
        ValueError: If the syllable is in neither table.
    """
    pinyin = pinyin.lower()
    phoneme_table = ace_phonemes_zh_plan["dict"]
    if pinyin in phoneme_table:
        return phoneme_table[pinyin]

    aliases = ace_phonemes_zh_plan["syllable_alias"]
    if pinyin in aliases:
        return phoneme_table[aliases[pinyin]]

    raise ValueError(f"{pinyin} not registered in Opencpop phoneme dict")
def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    """Look up the phonemes of a pinyin syllable in ``PINYIN_DICT``.

    The syllable is lower-cased before the lookup.

    Raises:
        ValueError: If the lower-cased syllable is not a key of
            ``PINYIN_DICT``.
    """
    pinyin = pinyin.lower()
    if pinyin not in PINYIN_DICT:
        raise ValueError(f"{pinyin} not registered in ACE phoneme dict")
    return PINYIN_DICT[pinyin]