# teochew_pyPengIm / script / pyPengIm.py
# (file-page residue preserved as a comment: author panlr, commit d85bd27, "fix error")
from . import utils
import jieba
import types
import yaml
import os
# Extend jieba's segmentation dictionary with the project's custom word list
# so Teochew-specific multi-character words are kept together when cutting.
jieba.load_userdict('./dict_data/word_dict/jieba_cut.txt')
class pyPengIm():
    """Teochew dialect romanisation ("PengIm") helper.

    Converts Chinese text to Teochew pinyin, with support for heteronyms,
    regional-accent conversion, IPA/phoneme output and written-Mandarin →
    oral-Teochew word translation.

    Dictionaries are loaded lazily: each name registered in
    ``self._dict_paths`` (``vocab``, ``word_dict``, ...) becomes an
    attribute on first access via ``__getattr__`` and is cached in
    ``self._loaded_dicts``.
    """

    def __init__(self, history=False) -> None:
        """Set up dictionary paths and accent-conversion tables.

        :param history: when True, also merge the Chinese-history lexicons
            (ancient reign titles, regimes, official titles, person names,
            ethnic groups, ...) into the word dictionary.
        """
        self._dict_paths = {
            "vocab": "./dict_data/vocab/origin_vocab.txt",
            "vocab_extension": "./dict_data/vocab/vocab_extension.txt",
            "word_dict": "./dict_data/word_dict/dict.txt",
            "teochew_word_dict": "./dict_data/word_dict/teochew_local_dict.txt",
            "translation_dict": "./dict_data/word_dict/madr_to_tch.txt",
            "surname_dict": "./dict_data/vocab/Surname.txt",
            "IPA_dict": "./dict_data/vocab/IPA_lexicon.txt",
            # "phoneme_dict": "./dict_data/vocab/phone.txt",
            "low_fre_dict": "./dict_data/vocab/low_fre.txt"
        }
        self.accent_dict = self._load_accent()
        self._loaded_dicts = {}
        # Whether to enable the Chinese-history dictionaries, to support
        # ancient reign titles, regimes, official titles, person names,
        # ethnic groups, etc.
        if history:
            self.word_dict.update(utils.load_dict("./dict_data/word_dict/history.txt"))
            self.word_dict.update(utils.load_dict("./dict_data/word_dict/reign_title.txt"))
        # Warm-up: build jieba's prefix dictionary now instead of on the
        # first real cut.  BUGFIX: the previous `jieba.cut('')` was a no-op
        # because `cut` returns a lazy generator that was never consumed.
        jieba.initialize()

    def __getattr__(self, name):
        """Lazily load and cache the dictionary registered under *name*."""
        # Guard the bookkeeping attributes themselves: if they are missing
        # (e.g. attribute lookup during copy/unpickling before __init__ has
        # run) the lookups below would re-enter __getattr__ and recurse
        # without bound.
        if name in ('_dict_paths', '_loaded_dicts'):
            raise AttributeError(name)
        if name in self._dict_paths:
            if name not in self._loaded_dicts:
                self._loaded_dicts[name] = utils.load_dict(self._dict_paths[name])
            return self._loaded_dicts[name]
        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")

    def _load_accent(self, accent_config_path="./dict_data/accent_convert/accent.yaml"):
        """Read the accent config and return ``{key: (mapping_dict, display_name)}``."""
        with open(accent_config_path, 'r', encoding='utf-8') as file:
            accent_config = yaml.safe_load(file)
        accent_dict = {}
        for key, cfg in accent_config.items():
            accent_dict[key] = (
                utils.load_dict(os.path.join("./dict_data/accent_convert", cfg['path'])),
                cfg['name']
            )
        return accent_dict

    def pinyin(self, text, heteronym=False, accent='', auto_split=True):
        """Convert *text* to Teochew pinyin.

        :param text: input text; a trailing '#' on a word marks it as
            already translated to Teochew by :meth:`to_oral`.
        :param heteronym: return every known pronunciation per character.
        :param accent: key of a regional accent in ``self.accent_dict``;
            unknown keys silently keep the default accent.
        :param auto_split: segment with jieba; otherwise split on spaces.
        :return: dict with 'result' (per-word pinyin lists), 'pinyin_seq'
            (flat pinyin string) and 'surname_notice' (possible surname
            readings found in the text).
        """
        text = text.upper()
        if heteronym:
            pinyin_list = self._pinyin_heteronym(text)
        else:
            if auto_split:
                pinyin_list = self.pinyin_optimize(utils.preprocess_generator(self.sentence_cut(text)))
            else:
                pinyin_list = self.pinyin_optimize(text.split(' '))
        if accent in self.accent_dict:
            pinyin_list = self.convert_accent(pinyin_list, accent)
        surname_list = self._surname_notice(text)
        return {
            'result': pinyin_list,
            'pinyin_seq': self._to_pinyin_sequence(pinyin_list),
            'surname_notice': surname_list
        }

    def sentence_cut(self, text):
        """Segment *text* with jieba, padding characters absent from both
        pronunciation vocabularies with spaces so they come out as
        standalone tokens."""
        padded = "".join(
            ch if (ch in self.vocab or ch in self.vocab_extension) else ' {} '.format(ch)
            for ch in text
        )
        return jieba.cut(padded)

    def _to_pinyin_sequence(self, pinyin_list):
        """Flatten per-word pinyin lists into one space-separated string.

        Alternative pronunciations stay joined with '|', '*' markers are
        stripped, and entries with no pronunciation (['None']) echo the
        original text verbatim.
        """
        result = []
        for item in pinyin_list:
            for pinyin in item[1:]:
                if pinyin != ['None']:
                    result.append('|'.join(py.replace('*', '') for py in pinyin))
                else:
                    result.append(item[0])  # illegal character: output as-is
        return ' '.join(result)

    def _surname_notice(self, text):
        """Collect ``[char, pronunciations]`` pairs for characters that have
        a dedicated surname reading."""
        return [
            [ch, self._to_pinyin_list(self.surname_dict[ch])]
            for ch in text
            if ch in self.surname_dict
        ]

    def _pinyin_heteronym(self, text):
        """Return ``[char, [all known pronunciations]]`` for every character,
        pooling the main vocabulary, the extension and the low-frequency
        list.  '#' translation markers are ignored."""
        result = []
        text = text.replace('#', '')
        for zh_char in text:
            item = []
            # Pool pronunciations in fixed priority order.
            for lexicon in (self.vocab, self.vocab_extension, self.low_fre_dict):
                if zh_char in lexicon:
                    item.extend(self._to_pinyin_list(lexicon[zh_char]))
            result.append([zh_char, item])
        return result

    def _to_pinyin_list(self, pinyin_item):
        """Split a 'py1|py2' dictionary entry into a list of pronunciations."""
        return pinyin_item.split('|') if '|' in pinyin_item else [pinyin_item]

    def _word_to_pinyin(self, item_word):
        """Expand a dictionary value ('py1 py2|alt ...') into one
        pronunciation list per character."""
        if ' ' in item_word:
            return [self._to_pinyin_list(pinyin) for pinyin in item_word.split(' ')]
        return [[item_word]]

    def pinyin_optimize(self, word_list):
        """Look words up dictionary-first, falling back to per-character lookup.

        A trailing '#' marks a word already translated to Teochew by
        :meth:`to_oral`; such words keep their Teochew-dictionary reading
        instead of being overwritten by the general word-dict reading.
        """
        result = []
        for word in word_list:
            if not word.strip():
                continue
            word_translate_flag = word.endswith('#')
            word = word.rstrip('#')
            word_found_flag = False
            item = [word]
            if word in self.teochew_word_dict:
                item.extend(self._word_to_pinyin(self.teochew_word_dict[word]))
                word_found_flag = True
            if not word_translate_flag and word in self.word_dict:
                # Not in translation mode: discard any earlier pinyin and keep
                # only the word_dict result, i.e. the Mandarin-sense reading.
                item = [word]
                item.extend(self._word_to_pinyin(self.word_dict[word]))
                word_found_flag = True
            elif not word_translate_flag or not word_found_flag:
                if word in self.word_dict:
                    item.extend(self._word_to_pinyin(self.word_dict[word]))
                    word_found_flag = True
            if not word_found_flag:
                # No whole-word hit: character-by-character fallback;
                # unknown characters are marked 'None'.
                for zh_char in word:
                    if zh_char in self.vocab:
                        item.append(self._to_pinyin_list(self.vocab[zh_char]))
                    elif zh_char in self.vocab_extension:
                        item.append(self._to_pinyin_list(self.vocab_extension[zh_char]))
                    else:
                        item.append(self._to_pinyin_list('None'))
            result.append(item)
        return result

    def convert_accent(self, pinyin_list, accent):
        """Map default-accent pronunciations to the target regional accent.

        The accent table is keyed '<hanzi>_<pronunciation>'; pronunciations
        with no entry are kept unchanged.
        """
        target_vocab = self.accent_dict[accent][0]
        result = []
        for one_pair in pinyin_list:
            word, pinyins = one_pair[0], list(one_pair[1:])
            item = [word]
            for i, hanzi in enumerate(word):
                pronunciations = []
                for pronunciation in pinyins[i]:
                    query_item = f'{hanzi}_{pronunciation}'
                    target_accent = target_vocab.get(query_item, pronunciation)
                    pronunciations.extend(self._to_pinyin_list(target_accent))
                # de-duplicate while preserving order
                item.append(list(dict.fromkeys(pronunciations)))
            result.append(item)
        return result

    def to_IPA(self, pinyin_seq, blank=True):
        """Convert a pinyin sequence (as produced by :meth:`pinyin`) to IPA.

        :param blank: separate the phonemes of one syllable with spaces.
        :return: list with one IPA string per input token; alternative
            pronunciations stay joined with '|'.
        """
        split_char = ' ' if blank else ''

        def _syllable_to_ipa(py):
            # Phonemes without an IPA entry pass through unchanged.
            ph_list = utils.pinyin_to_phoneme_list(py)
            return split_char.join(self.IPA_dict.get(ph, ph) for ph in ph_list)

        result = []
        for pinyin in pinyin_seq.split(' '):
            if '|' in pinyin:
                result.append("|".join(_syllable_to_ipa(py) for py in pinyin.split('|')))
            else:
                result.append(_syllable_to_ipa(pinyin))
        return result

    def to_phoneme(self, pinyin_seq):
        """Convert a pinyin sequence to phoneme strings, keeping '|' between
        alternative pronunciations."""
        return [
            '|'.join(utils.pinyin_to_phoneme(py) for py in pinyin.split('|'))
            if '|' in pinyin else utils.pinyin_to_phoneme(pinyin)
            for pinyin in pinyin_seq.split(' ')
        ]

    def to_oral(self, text, auto_split=True):
        """Translate written-Mandarin words to their oral-Teochew equivalents.

        Translated words are suffixed with '#' so downstream pinyin lookup
        knows they already carry Teochew semantics.

        :param text: a string, or an already segmented list/generator of words.
        :return: space-joined translated text, or None for unsupported input.
        """
        if isinstance(text, (list, types.GeneratorType)):
            word_list = text
        elif isinstance(text, str):
            word_list = jieba.cut(text) if auto_split else text.split(' ')
        else:
            return None
        return ' '.join(
            self.translation_dict[word] + '#' if word in self.translation_dict else word
            for word in word_list
        )

    def add_word_mapping(self, user_mapping: dict):
        """Merge user-supplied Mandarin→Teochew word mappings into the
        translation dictionary."""
        self.translation_dict.update(user_mapping)

    # Query a single character's pronunciation in each regional accent.
    def single_query(self, single_char):
        """Return ``{region name: [char, pronunciations]}`` for one character,
        or None when the input is not a single known character."""
        # Length check first: a multi-character string can never be a valid
        # single-character query, regardless of dictionary contents.
        if len(single_char) > 1:
            return None
        if single_char not in self.vocab and single_char not in self.vocab_extension:
            return None
        result_dict = {}
        pinyin_list = self._pinyin_heteronym(single_char)
        result_dict['府城'] = pinyin_list[0]
        for key, (_, region_name) in self.accent_dict.items():
            result_dict[region_name] = self.convert_accent(pinyin_list, accent=key)[0]
        return result_dict