Spaces:
Running
Running
| # Copyright 2024 Hung-Shin Lee (hungshinlee@gmail.com) | |
| # Apache 2.0 | |
| import re | |
| from pathlib import Path | |
| from unicodedata import normalize | |
| import jieba | |
| import opencc | |
# Silence jieba's startup chatter (20 == logging.INFO, so only INFO+ passes).
jieba.setLogLevel(20)
# Monkey-patch jieba's default "han" token pattern with a much wider character
# class (U+2E80 through U+E01EF plus ASCII letters/digits and common in-word
# symbols) so more scripts/symbols are segmented as single tokens.
# NOTE(review): the exact range choice is inherited as-is — confirm against the
# project's lexicon before changing it.
jieba.re_han_default = re.compile("([\u2e80-\U000e01efa-zA-Z0-9+#&\._%\-']+)", re.U)
# OpenCC converter: Simplified Chinese -> Traditional Chinese (Taiwan standard).
s2tw_converter = opencc.OpenCC("s2tw.json")
def update_jieba_dict(
    lexicon: list,
    jieba_dict_path: Path,
    high_freq_words: list | None = None,
    high_freq_words_weight: int = 10,
) -> list:
    """Rewrite jieba's user dictionary from ``lexicon`` and reset jieba's state.

    Each word is written with weight ``len(word)``; words listed in
    ``high_freq_words`` get ``len(word) * high_freq_words_weight`` instead so
    jieba prefers keeping them whole during segmentation.

    Args:
        lexicon: Words to write; deduplicated and sorted here.
        jieba_dict_path: Target dictionary file; replaced if it exists.
        high_freq_words: Words whose weight is boosted. ``None`` (default)
            means no boosting. (Was a mutable default ``[]``.)
        high_freq_words_weight: Multiplier applied to boosted words.

    Returns:
        The deduplicated, sorted lexicon that was written.
    """
    # Set gives O(1) membership instead of O(n) list scans per word.
    boosted = set(high_freq_words) if high_freq_words else set()
    lexicon = sorted(set(lexicon))

    jieba_dict_path.unlink(missing_ok=True)
    # Drop jieba's on-disk cache so the rewritten dictionary is actually loaded.
    Path("/tmp/jieba.cache").unlink(missing_ok=True)

    with jieba_dict_path.open("w", encoding="utf-8") as file:
        for word in lexicon:
            weight = len(word) * high_freq_words_weight if word in boosted else len(word)
            file.write(f"{word} {weight}\n")

    # Force jieba to re-initialize on next use, picking up the new dictionary.
    jieba.dt.initialized = False

    return lexicon
def run_jieba(line: str) -> list:
    """Segment a single line with jieba (precise mode, HMM disabled).

    NOTE: jieba loses the original line structure when fed multi-line text,
    so callers should pass one line at a time.
    """
    return list(jieba.cut(line, cut_all=False, HMM=False))
def normalize_text(text: str, replace_dict: dict, replace_regex: str) -> str:
    """Clean and canonicalize one line of text.

    Pipeline: strip control/BOM characters, NFKC-normalize, substitute every
    ``replace_regex`` match via ``replace_dict``, collapse runs of whitespace
    to single spaces, and uppercase.

    Args:
        text: Raw input line.
        replace_dict: Maps each matched literal to its replacement; must cover
            every string ``replace_regex`` can match.
        replace_regex: Alternation of literals to replace; ``""`` disables
            replacement (this is what ``prep_regex`` returns for an empty dict).

    Returns:
        The normalized, uppercased string.
    """

    def replace_match(match: re.Match) -> str:
        return replace_dict[match.group(0)]

    # Remove backspace (\x08), BOM/ZWNBSP (\ufeff) and DLE (\u0010) in one pass.
    text = re.sub("[\x08\ufeff\u0010]", "", text)
    text = normalize("NFKC", text)
    # Bug fix: an empty pattern matches at every position, so the original
    # re.sub call raised KeyError('') whenever replace_regex was "".
    if replace_regex:
        text = re.sub(replace_regex, replace_match, text)
    return " ".join(text.split()).upper()
def apply_v2f(word_list: list, v2f_dict: dict, v2f_regex: str) -> list:
    """Replace variant characters with their standard forms in each word.

    Args:
        word_list: Segmented words to convert.
        v2f_dict: Variant -> standard-form mapping; must cover every string
            ``v2f_regex`` can match.
        v2f_regex: Alternation of variant literals; ``""`` disables conversion
            (what ``prep_regex`` returns for an empty ``v2f_dict``).

    Returns:
        A new list with each word converted; input list is not mutated.
    """
    # Bug fix: an empty pattern matches at every position and raised
    # KeyError('') in the substitution callback; treat "" as a no-op.
    if not v2f_regex:
        return list(word_list)
    return [re.sub(v2f_regex, lambda m: v2f_dict[m.group(0)], word) for word in word_list]
| def prep_regex( | |
| delimiter_list: list, replace_dict: dict = {}, v2f_dict: dict = {} | |
| ) -> tuple[str, str, str]: | |
| delimiter_regex = "|".join(map(re.escape, delimiter_list)) | |
| replace_regex = "" | |
| if len(replace_dict): | |
| sorted_keys = sorted(replace_dict.keys(), key=len, reverse=True) | |
| replace_regex = "|".join(map(re.escape, sorted_keys)) | |
| v2f_regex = "" | |
| if len(v2f_dict): | |
| v2f_regex = "|".join(map(re.escape, v2f_dict.keys())) | |
| return delimiter_regex, replace_regex, v2f_regex | |