Spaces:
Running
Running
File size: 9,029 Bytes
7b0a02f d85bd27 7b0a02f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 |
from . import utils
import jieba
import types
import yaml
import os
# Register the project's custom segmentation vocabulary with jieba as a side
# effect of importing this module.
# NOTE: the path is relative, so it is resolved against the process's current
# working directory, not against the location of this file.
jieba.load_userdict('./dict_data/word_dict/jieba_cut.txt')
class pyPengIm():
    """Convert Chinese text to pinyin using a set of lazily loaded dictionaries.

    Each dictionary listed in ``_dict_paths`` is exposed as an attribute of the
    same name (``self.vocab``, ``self.word_dict`` ...) and is read from disk by
    ``utils.load_dict`` the first time it is accessed.

    The common intermediate format ("pinyin list") is a list of items shaped
    ``[word, readings_of_char_0, readings_of_char_1, ...]`` where every
    ``readings_of_char_i`` is a list of candidate pinyin strings.
    """

    def __init__(self, history=False) -> None:
        """Set up dictionary paths and load the accent conversion tables.

        Args:
            history: when true, merge the Chinese-history dictionaries (era
                names, regimes, official titles, personal names, ethnic
                groups, ...) into ``word_dict``.
        """
        self._dict_paths = {
            "vocab": "./dict_data/vocab/origin_vocab.txt",
            "vocab_extension": "./dict_data/vocab/vocab_extension.txt",
            "word_dict": "./dict_data/word_dict/dict.txt",
            "teochew_word_dict": "./dict_data/word_dict/teochew_local_dict.txt",
            "translation_dict": "./dict_data/word_dict/madr_to_tch.txt",
            "surname_dict": "./dict_data/vocab/Surname.txt",
            "IPA_dict": "./dict_data/vocab/IPA_lexicon.txt",
            # "phoneme_dict": "./dict_data/vocab/phone.txt",
            "low_fre_dict": "./dict_data/vocab/low_fre.txt"
        }
        # Cache of dictionaries already read from disk, keyed like _dict_paths.
        self._loaded_dicts = {}
        self.accent_dict = self._load_accent()
        if history:
            self.word_dict.update(utils.load_dict("./dict_data/word_dict/history.txt"))
            self.word_dict.update(utils.load_dict("./dict_data/word_dict/reign_title.txt"))
        # Warm-up. jieba.cut() is a generator function, so calling it without
        # iterating executes nothing; initialise the tokenizer explicitly.
        jieba.initialize()

    def __getattr__(self, name):
        """Lazily load and return the dictionary called ``name``.

        Only invoked when normal attribute lookup fails.

        Raises:
            AttributeError: if ``name`` is not one of the known dictionaries.
        """
        # Go through __dict__ directly: on an instance whose __init__ has not
        # run (copy, pickle, __new__) a plain ``self._dict_paths`` would land
        # back in this method and recurse until RecursionError.
        state = self.__dict__
        paths = state.get("_dict_paths")
        if paths is not None and name in paths:
            cache = state.setdefault("_loaded_dicts", {})
            if name not in cache:
                cache[name] = utils.load_dict(paths[name])
            return cache[name]
        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")

    def _load_accent(self, accent_config_path="./dict_data/accent_convert/accent.yaml"):
        """Read the accent YAML config and load every conversion table it lists.

        Returns:
            dict mapping each config key to ``(conversion_table, display_name)``,
            built from the entry's ``path`` and ``name`` fields.
        """
        with open(accent_config_path, 'r', encoding='utf-8') as file:
            # safe_load returns None for an empty document.
            accent_config = yaml.safe_load(file) or {}
        return {
            key: (
                utils.load_dict(os.path.join("./dict_data/accent_convert", entry['path'])),
                entry['name'],
            )
            for key, entry in accent_config.items()
        }

    def pinyin(self, text, heteronym=False, accent='', auto_split=True):
        """Annotate ``text`` with pinyin.

        Args:
            text: input string; it is upper-cased before processing.
            heteronym: when true, list every known reading of each character
                instead of resolving words through the word dictionaries.
            accent: key of ``accent_dict``; when present the readings are
                converted to that accent, otherwise they are left unchanged.
            auto_split: when true, segment with jieba; otherwise ``text`` is
                split on single spaces.

        Returns:
            dict with ``result`` (pinyin list), ``pinyin_seq`` (space separated
            string) and ``surname_notice`` (surname readings found in text).
        """
        text = text.upper()
        if heteronym:
            pinyin_list = self._pinyin_heteronym(text)
        elif auto_split:
            pinyin_list = self.pinyin_optimize(utils.preprocess_generator(self.sentence_cut(text)))
        else:
            pinyin_list = self.pinyin_optimize(text.split(' '))
        if accent in self.accent_dict:
            pinyin_list = self.convert_accent(pinyin_list, accent)
        return {
            'result': pinyin_list,
            'pinyin_seq': self._to_pinyin_sequence(pinyin_list),
            'surname_notice': self._surname_notice(text)
        }

    def sentence_cut(self, text):
        """Segment ``text`` with jieba and return jieba's generator.

        Characters found in neither ``vocab`` nor ``vocab_extension`` are
        padded with spaces before the text is handed to jieba.
        """
        pieces = [
            ch if ch in self.vocab or ch in self.vocab_extension else ' {} '.format(ch)
            for ch in text
        ]
        return jieba.cut("".join(pieces))

    def _to_pinyin_sequence(self, pinyin_list):
        """Flatten a pinyin list into one space separated string.

        Alternative readings of a character are joined with ``|`` and any
        ``*`` marker is removed. A character whose readings are ``['None']``
        (no reading found) is emitted verbatim.
        """
        parts = []
        for item in pinyin_list:
            word = item[0]
            for idx, readings in enumerate(item[1:]):
                if readings != ['None']:
                    parts.append('|'.join(py.replace('*', '') for py in readings))
                else:
                    # Echo only the offending character, not the whole word;
                    # fall back to the word if positions do not line up.
                    parts.append(word[idx] if idx < len(word) else word)
        return ' '.join(parts)

    def _surname_notice(self, text):
        """Return ``[char, readings]`` for every character of ``text`` that is in ``surname_dict``."""
        surnames = self.surname_dict
        return [[ch, self._to_pinyin_list(surnames[ch])] for ch in text if ch in surnames]

    def _pinyin_heteronym(self, text):
        """Return ``[char, all_readings]`` for each character of ``text`` (``#`` removed).

        Readings are collected, in order, from ``vocab``, ``vocab_extension``
        and ``low_fre_dict``; a character found in none of them gets ``[]``.
        """
        sources = (self.vocab, self.vocab_extension, self.low_fre_dict)
        result = []
        for zh_char in text.replace('#', ''):
            readings = []
            for source in sources:
                if zh_char in source:
                    readings.extend(self._to_pinyin_list(source[zh_char]))
            result.append([zh_char, readings])
        return result

    def _to_pinyin_list(self, pinyin_item):
        """Split a ``|`` separated reading string into a list of readings."""
        return pinyin_item.split('|')

    def _word_to_pinyin(self, item_word):
        """Turn a dictionary entry into one list of readings per syllable.

        Syllables are separated by spaces and alternatives by ``|``; the
        alternatives are split for single-syllable entries as well, so the
        result has the same shape regardless of word length.
        """
        return [self._to_pinyin_list(syllable) for syllable in item_word.split(' ')]

    def _char_pinyin(self, zh_char):
        """Return the readings of one character, or ``['None']`` when unknown."""
        if zh_char in self.vocab:
            return self._to_pinyin_list(self.vocab[zh_char])
        if zh_char in self.vocab_extension:
            return self._to_pinyin_list(self.vocab_extension[zh_char])
        return self._to_pinyin_list('None')

    def pinyin_optimize(self, word_list):
        """Look up pinyin word by word, falling back to per-character lookup.

        A trailing ``#`` on a word marks it as translated (see ``to_oral``)
        and is stripped. Lookup order:

        * ``teochew_word_dict`` is consulted first.
        * ``word_dict`` replaces that result for non-translated words, and is
          used for translated words only when the Teochew dictionary had no
          entry.
        * Otherwise each character is looked up individually.

        Blank words are skipped.
        """
        result = []
        for raw_word in word_list:
            if not raw_word.strip():
                continue
            translated = raw_word.endswith('#')
            word = raw_word.rstrip('#')
            item = [word]
            found = word in self.teochew_word_dict
            if found:
                item.extend(self._word_to_pinyin(self.teochew_word_dict[word]))
            if word in self.word_dict and not (translated and found):
                # Outside translation mode any Teochew reading gathered above
                # is discarded so that only the word_dict reading remains.
                item = [word, *self._word_to_pinyin(self.word_dict[word])]
                found = True
            if not found:
                item.extend(self._char_pinyin(zh_char) for zh_char in word)
            result.append(item)
        return result

    def convert_accent(self, pinyin_list, accent):
        """Map every reading in ``pinyin_list`` to the given accent.

        The conversion table is queried with ``'<char>_<reading>'``; readings
        without an entry are kept as they are. Characters and reading lists
        are paired positionally, and a surplus on either side is ignored.

        Raises:
            KeyError: if ``accent`` is not a key of ``accent_dict``.
        """
        target_vocab = self.accent_dict[accent][0]
        result = []
        for one_pair in pinyin_list:
            word = one_pair[0]
            item = [word]
            for hanzi, readings in zip(word, one_pair[1:]):
                converted = []
                for pronunciation in readings:
                    target = target_vocab.get(f'{hanzi}_{pronunciation}', pronunciation)
                    converted.extend(self._to_pinyin_list(target))
                # dict.fromkeys de-duplicates while keeping first-seen order.
                item.append(list(dict.fromkeys(converted)))
            result.append(item)
        return result

    def to_IPA(self, pinyin_seq, blank=True):
        """Convert a space separated pinyin sequence to a list of IPA strings.

        Args:
            pinyin_seq: string as produced in ``pinyin()['pinyin_seq']``.
            blank: join the phonemes of one syllable with a space when true,
                with nothing when false.

        Phonemes without an entry in ``IPA_dict`` are passed through.
        """
        split_char = ' ' if blank else ''
        ipa = self.IPA_dict

        def convert(py):
            return split_char.join(ipa.get(ph, ph) for ph in utils.pinyin_to_phoneme_list(py))

        return ['|'.join(convert(py) for py in pinyin.split('|')) for pinyin in pinyin_seq.split(' ')]

    def to_phoneme(self, pinyin_seq):
        """Convert a space separated pinyin sequence to a list of phoneme strings."""
        return [
            '|'.join(utils.pinyin_to_phoneme(py) for py in pinyin.split('|'))
            for pinyin in pinyin_seq.split(' ')
        ]

    def to_oral(self, text, auto_split=True):
        """Replace words that have an entry in ``translation_dict``.

        Args:
            text: a string, or an already segmented list / generator of words.
            auto_split: for string input, segment with jieba when true,
                otherwise split on single spaces.

        Returns:
            The words joined by spaces, each replaced word suffixed with
            ``#``; ``None`` when ``text`` has an unsupported type.
        """
        if isinstance(text, (list, types.GeneratorType)):
            word_list = text
        elif isinstance(text, str):
            word_list = jieba.cut(text) if auto_split else text.split(' ')
        else:
            return None
        mapping = self.translation_dict
        return ' '.join(mapping[word] + '#' if word in mapping else word for word in word_list)

    def add_word_mapping(self, user_mapping: dict):
        """Merge ``user_mapping`` into ``translation_dict``."""
        self.translation_dict.update(user_mapping)

    def single_query(self, single_char):
        """Look up the readings of a single character in every accent.

        Returns:
            dict whose ``'府城'`` entry holds the unconverted ``[char, readings]``
            and which has one further entry per accent, keyed by the accent's
            display name; ``None`` if the character is in neither ``vocab``
            nor ``vocab_extension``, or if more than one character is given.
        """
        if single_char not in self.vocab and single_char not in self.vocab_extension:
            return None
        if len(single_char) > 1:
            return None
        pinyin_list = self._pinyin_heteronym(single_char)
        result_dict = {'府城': pinyin_list[0]}
        for key, (_, accent_name) in self.accent_dict.items():
            result_dict[accent_name] = self.convert_accent(pinyin_list, accent=key)[0]
        return result_dict