Spaces:
Running
Running
| import os | |
| import re | |
| from pathlib import Path | |
| import jieba | |
| from omegaconf import OmegaConf | |
| from ipa.convert_digits import parse_num | |
| from ipa.proc_text import ( | |
| apply_v2f, | |
| normalize_text, | |
| prep_regex, | |
| run_jieba, | |
| update_jieba_dict, | |
| ) | |
# Load the IPA conversion settings once at import time, then drop every
# preserved entry from the variant-to-standard (v2f) mapping so those
# words are never rewritten.
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))
for preserved_key in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(preserved_key, None)

# Pre-compile the regexes shared by the parsing helpers below.
delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"],
    ipa_configs["replace_dict"],
    ipa_configs["v2f_dict"],
)
def get_ipa(
    raw_text: str, dialect: str
) -> tuple[str | list[str], str | list[str], str | list[str], list[str]]:
    r"""Convert a mixed hanzi/inline-pinyin string to words, IPA and pinyin.

    ``raw_text`` may interleave hanzi runs with inline pinyin tokens such as
    ``"ma1"``; tokens matching ``[a-z]+\d+`` are treated as pinyin and looked
    up directly, everything else goes through :func:`parse_hanzi_to_ipa`.

    Returns ``(words, ipa, pinyin, missing_words)``. On success the first
    three items are space-joined strings (with `` , `` collapsed to ``,``);
    when nothing could be parsed or any word is missing from the lexicon,
    the un-joined lists are returned instead so the caller can inspect
    ``missing_words``.
    """
    pinyin_split = re.split(r"([a-z]+\d+)", raw_text)
    final_words = []
    final_pinyin = []
    final_ipa = []
    final_missing_words = []
    for hanzi_or_pinyin in pinyin_split:
        if len(hanzi_or_pinyin.strip()) == 0:
            continue
        if re.search(r"[a-z]+\d+", hanzi_or_pinyin):
            # Inline pinyin token: split off the tone digits and convert.
            final_words.append(hanzi_or_pinyin)
            final_pinyin.append(hanzi_or_pinyin)
            pinyin, tone = re.match(r"([a-z]+)(\d+)?", hanzi_or_pinyin).groups()
            tone = f"_{tone}" if tone else ""
            ipa = parse_pinyin_to_ipa(pinyin)
            if ipa is None:
                final_missing_words.append(pinyin)
                continue
            final_ipa.append(ipa + tone)
        else:
            words, ipa, pinyin, missing_words = parse_hanzi_to_ipa(
                hanzi_or_pinyin, dialect
            )
            final_words.extend(words)
            final_ipa.extend(ipa)
            final_pinyin.extend(pinyin)
            final_missing_words.extend(missing_words)
    if len(final_ipa) == 0 or len(final_missing_words) > 0:
        # Failure path: return the raw lists (not joined strings) so the
        # caller can report which words were missing.
        return final_words, final_ipa, final_pinyin, final_missing_words
    final_words = " ".join(final_words).replace(" , ", ",")
    final_ipa = " ".join(final_ipa).replace(" , ", ",")
    final_pinyin = " ".join(final_pinyin).replace(" , ", ",")
    return final_words, final_ipa, final_pinyin, final_missing_words
def parse_ipa(
    ipa: str, delete_chars: str = r"\+\-\|\_", as_space: str = ""
) -> list[str]:
    r"""Flatten an IPA string into a list of single characters and tone numbers.

    The string is split at every digit/non-digit boundary so that a run of
    tone digits becomes one standalone token. Within each non-digit segment,
    characters in the ``as_space`` class are replaced by spaces, characters
    in the ``delete_chars`` class are removed, commas are padded to `` , ``,
    and the remaining segment is emitted character by character.

    Both ``delete_chars`` and ``as_space`` are inserted into a regex
    character class, so special characters must be escaped by the caller.
    """
    text: list[str] = []
    ipa_list = re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa)
    for word in ipa_list:
        if word.isdigit():
            # Tone numbers stay together as a single token.
            text.append(word)
        else:
            if len(as_space) > 0:
                word = re.sub(r"[{}]".format(as_space), " ", word)
            if len(delete_chars) > 0:
                word = re.sub(r"[{}]".format(delete_chars), "", word)
            word = word.replace(",", " , ")
            # extend() iterates the string, emitting individual characters.
            text.extend(word)
    return text
def parse_pinyin_to_ipa(pinyin: str) -> str | None:
    """Look up a toneless pinyin syllable in the pinyin-to-IPA table.

    Returns the table's candidate pronunciations joined with ``+``, with
    spaces inside each candidate turned into ``-``; ``None`` when the
    syllable is not in the table.
    """
    try:
        candidates = ipa_configs["pinyin_to_ipa_dict"][pinyin]
    except KeyError:
        return None
    return "+".join(candidates).replace(" ", "-")
def parse_hanzi_to_ipa(
    hanzi: str, dialect: str
) -> tuple[list[str], list[str], list[str], list[str]]:
    """Segment a hanzi string and look each word up in the dialect lexicon.

    Returns ``(words, ipa, pinyin, missing_words)``. ``words`` keeps every
    segmented token (including ``","`` separators and unknown words);
    ``ipa``/``pinyin`` only get entries for commas and lexicon hits, and
    words absent from the lexicon are collected in ``missing_words``.
    """
    lexicon = ipa_configs["lexicon"][dialect]
    # NOTE(review): this rewrites jieba's bundled dict.txt on every call —
    # consider doing it once per dialect; verify update_jieba_dict is cheap.
    update_jieba_dict(list(lexicon.keys()), Path(jieba.__file__).parent / "dict.txt")
    text = normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
    text = parse_num(text)
    # Collapse all delimiter characters into single comma separators.
    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
    text = ",".join(text_parts)
    word_list = run_jieba(text)
    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
    # Re-segment after the variant-to-standard (v2f) substitution.
    word_list = run_jieba("".join(word_list))

    final_words = []
    final_pinyin = []
    final_ipa = []
    missing_words = []
    for word in word_list:
        if not word.strip():
            continue
        if word == ",":
            final_words.append(",")
            final_pinyin.append(",")
            final_ipa.append(",")
        elif word not in lexicon:
            # Unknown word: keep it in the output but record it as missing.
            final_words.append(word)
            missing_words.append(word)
        else:
            final_words.append(word)
            final_pinyin.append(lexicon[word]["pinyin"][0])
            # NOTE: only the first IPA candidate in lexicon[word] is used.
            final_ipa.append(lexicon[word]["ipa"][0])
    return final_words, final_ipa, final_pinyin, missing_words