Spaces:
Running
Running
| from . import utils | |
| import jieba | |
| import types | |
| import yaml | |
| import os | |
# Register the project's custom segmentation vocabulary with jieba as soon as
# this module is imported. The path is relative to the current working
# directory, so the process must be started from the directory that contains
# ``dict_data``.
jieba.load_userdict('./dict_data/word_dict/jieba_cut.txt')
class pyPengIm():
    """Convert Chinese text to pinyin using dictionaries stored under ``./dict_data``.

    The dictionaries listed in ``_dict_paths`` are exposed as attributes
    (``self.vocab``, ``self.word_dict``, ...) and are loaded from disk with
    ``utils.load_dict`` the first time each one is accessed.
    """

    def __init__(self, history=False) -> None:
        """Set up dictionary paths and load the accent conversion tables.

        Args:
            history: When true, merge ``history.txt`` and ``reign_title.txt``
                into ``word_dict``. According to the original author's note
                these cover ancient era names, regimes, official titles,
                personal names and ethnic groups.
        """
        self._dict_paths = {
            "vocab": "./dict_data/vocab/origin_vocab.txt",
            "vocab_extension": "./dict_data/vocab/vocab_extension.txt",
            "word_dict": "./dict_data/word_dict/dict.txt",
            "teochew_word_dict": "./dict_data/word_dict/teochew_local_dict.txt",
            "translation_dict": "./dict_data/word_dict/madr_to_tch.txt",
            "surname_dict": "./dict_data/vocab/Surname.txt",
            "IPA_dict": "./dict_data/vocab/IPA_lexicon.txt",
            # "phoneme_dict": "./dict_data/vocab/phone.txt",
            "low_fre_dict": "./dict_data/vocab/low_fre.txt"
        }
        self.accent_dict = self._load_accent()
        self._loaded_dicts = {}
        # Optionally enable the Chinese-history dictionaries.
        if history:
            self.word_dict.update(utils.load_dict("./dict_data/word_dict/history.txt"))
            self.word_dict.update(utils.load_dict("./dict_data/word_dict/reign_title.txt"))
        # Warm up jieba. ``jieba.cut`` is a generator function, so merely
        # calling it (as the code used to) runs nothing; ``initialize`` does
        # the actual work and is a no-op when jieba is already initialised.
        jieba.initialize()

    def __getattr__(self, name):
        """Lazily load and cache the dictionary registered under ``name``.

        Only invoked when normal attribute lookup fails.

        Raises:
            AttributeError: If ``name`` is not a key of ``_dict_paths``.
        """
        # Go through ``__dict__`` directly: on an instance whose ``__init__``
        # has not run (``__new__``, ``copy``, ``pickle``) a plain
        # ``self._dict_paths`` would re-enter ``__getattr__`` and recurse
        # until RecursionError instead of raising AttributeError.
        state = self.__dict__
        dict_paths = state.get('_dict_paths', {})
        if name in dict_paths:
            cache = state.setdefault('_loaded_dicts', {})
            if name not in cache:
                cache[name] = utils.load_dict(dict_paths[name])
            return cache[name]
        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")

    def _load_accent(self, accent_config_path="./dict_data/accent_convert/accent.yaml"):
        """Load every accent table described by the YAML config.

        Each config entry must provide ``path`` (a file name inside
        ``./dict_data/accent_convert``) and ``name``.

        Returns:
            dict mapping each config key to ``(conversion_dict, name)``.
        """
        with open(accent_config_path, 'r', encoding='utf-8') as file:
            # ``safe_load`` returns None for an empty file; treat it as "no accents".
            accent_config = yaml.safe_load(file) or {}
        accent_dict = {}
        for key, spec in accent_config.items():
            accent_dict[key] = (
                utils.load_dict(os.path.join("./dict_data/accent_convert", spec['path'])),
                spec['name']
            )
        return accent_dict

    def pinyin(self, text, heteronym=False, accent='', auto_split=True):
        """Annotate ``text`` with pinyin.

        Args:
            text: Input string; it is upper-cased before processing.
            heteronym: When true, list every known reading of each character
                instead of resolving words through the word dictionaries.
            accent: Key of ``accent_dict``; when it matches, readings are
                converted to that accent. Unknown keys are ignored.
            auto_split: When true, segment with jieba; otherwise split the
                text on single spaces. Ignored when ``heteronym`` is true.

        Returns:
            dict with keys ``'result'`` (list of ``[word, readings, ...]``),
            ``'pinyin_seq'`` (space-joined string) and ``'surname_notice'``.
        """
        text = text.upper()
        if heteronym:
            pinyin_list = self._pinyin_heteronym(text)
        elif auto_split:
            pinyin_list = self.pinyin_optimize(utils.preprocess_generator(self.sentence_cut(text)))
        else:
            pinyin_list = self.pinyin_optimize(text.split(' '))
        if accent in self.accent_dict:
            pinyin_list = self.convert_accent(pinyin_list, accent)
        surname_list = self._surname_notice(text)
        return {
            'result': pinyin_list,
            'pinyin_seq': self._to_pinyin_sequence(pinyin_list),
            'surname_notice': surname_list
        }

    def sentence_cut(self, text):
        """Segment ``text`` with jieba.

        Characters found in neither ``vocab`` nor ``vocab_extension`` are
        surrounded by spaces before segmentation.

        Returns:
            The generator produced by ``jieba.cut``.
        """
        padded = [
            ch if ch in self.vocab or ch in self.vocab_extension else ' {} '.format(ch)
            for ch in text
        ]
        return jieba.cut("".join(padded))

    def _to_pinyin_sequence(self, pinyin_list):
        """Flatten ``[word, readings, ...]`` items into one space-separated string.

        Alternative readings of one syllable are joined with ``|`` and ``*``
        markers are removed. A syllable with no reading (``['None']`` or an
        empty list) is replaced by the character it belongs to.
        """
        result = []
        for item in pinyin_list:
            word = item[0]
            for idx, pinyin in enumerate(item[1:]):
                if pinyin and pinyin != ['None']:
                    result.append('|'.join(py.replace('*', '') for py in pinyin))
                elif idx < len(word):
                    # Unknown character: echo that character only, not the
                    # whole word (which used to be repeated per character).
                    result.append(word[idx])
                else:
                    result.append(word)
        return ' '.join(result)

    def _surname_notice(self, text):
        """Return ``[char, readings]`` for every character of ``text`` in ``surname_dict``."""
        return [
            [ch, self._to_pinyin_list(self.surname_dict[ch])]
            for ch in text
            if ch in self.surname_dict
        ]

    def _pinyin_heteronym(self, text):
        """List all readings of each character, ignoring ``#`` markers.

        Readings are collected, in order, from ``vocab``, ``vocab_extension``
        and ``low_fre_dict``; a character found in none of them gets an empty
        list.

        Returns:
            list of ``[char, readings]`` pairs.
        """
        text = text.replace('#', '')
        if not text:
            return []
        sources = (self.vocab, self.vocab_extension, self.low_fre_dict)
        result = []
        for zh_char in text:
            readings = []
            for source in sources:
                if zh_char in source:
                    readings.extend(self._to_pinyin_list(source[zh_char]))
            result.append([zh_char, readings])
        return result

    def _to_pinyin_list(self, pinyin_item):
        """Split a ``|``-separated string of alternative readings into a list."""
        return pinyin_item.split('|')

    def _word_to_pinyin(self, item_word):
        """Turn a dictionary entry into one list of readings per syllable.

        Syllables are separated by spaces and alternatives by ``|``. A
        single-syllable entry is split on ``|`` as well, so its shape matches
        that of multi-syllable entries.
        """
        return [self._to_pinyin_list(pinyin) for pinyin in item_word.split(' ')]

    def pinyin_optimize(self, word_list):
        """Look up the pinyin of each word in ``word_list``.

        Blank words are skipped. A trailing ``#`` marks a word as translated
        (``to_oral`` appends it) and is removed before lookup.

        Lookup order:
            * translated word: ``teochew_word_dict``, then ``word_dict``;
            * any other word: ``word_dict``, then ``teochew_word_dict``.
        When neither contains the word, each character is looked up in
        ``vocab`` then ``vocab_extension``; unknown characters get ``['None']``.

        Returns:
            list of ``[word, readings_of_syllable_1, readings_of_syllable_2, ...]``.
        """
        result = []
        for word in word_list:
            if not word.strip():
                continue
            translated = word.endswith('#')
            word = word.rstrip('#')
            if translated:
                lookup_order = (self.teochew_word_dict, self.word_dict)
            else:
                # Outside translation mode the word_dict entry wins over the
                # local dictionary.
                lookup_order = (self.word_dict, self.teochew_word_dict)
            item = [word]
            for source in lookup_order:
                if word in source:
                    item.extend(self._word_to_pinyin(source[word]))
                    break
            else:
                # Whole word unknown: fall back to character-level lookup.
                for zh_char in word:
                    if zh_char in self.vocab:
                        item.append(self._to_pinyin_list(self.vocab[zh_char]))
                    elif zh_char in self.vocab_extension:
                        item.append(self._to_pinyin_list(self.vocab_extension[zh_char]))
                    else:
                        item.append(self._to_pinyin_list('None'))
            result.append(item)
        return result

    def convert_accent(self, pinyin_list, accent):
        """Convert every reading in ``pinyin_list`` to the given accent.

        Each reading is looked up as ``"<char>_<reading>"`` in the accent
        table; readings without an entry are kept unchanged. Duplicates are
        removed while preserving order.

        Raises:
            KeyError: If ``accent`` is not a key of ``accent_dict``.
        """
        target_vocab = self.accent_dict[accent][0]
        result = []
        for one_pair in pinyin_list:
            word = one_pair[0]
            item = [word]
            # ``zip`` pairs characters with their readings and simply stops
            # when a word has more characters than readings, where indexing
            # by position used to raise IndexError.
            for hanzi, candidates in zip(word, one_pair[1:]):
                pronunciations = []
                for pronunciation in candidates:
                    query_item = f'{hanzi}_{pronunciation}'
                    target_accent = target_vocab.get(query_item, pronunciation)
                    pronunciations.extend(self._to_pinyin_list(target_accent))
                item.append(list(dict.fromkeys(pronunciations)))  # de-duplicate, keep order
            result.append(item)
        return result

    def to_IPA(self, pinyin_seq, blank=True):
        """Convert a space-separated pinyin sequence to IPA.

        Args:
            pinyin_seq: String such as the ``'pinyin_seq'`` value of ``pinyin``.
            blank: Join the phonemes of one syllable with a space when true,
                with nothing otherwise.

        Returns:
            list with one string per syllable; alternatives stay ``|``-joined.
            Phonemes missing from ``IPA_dict`` are passed through unchanged.
        """
        split_char = ' ' if blank else ''
        ipa_dict = self.IPA_dict
        result = []
        for pinyin in pinyin_seq.split(' '):
            ipa_item = [
                split_char.join(
                    ipa_dict[ph] if ph in ipa_dict else ph
                    for ph in utils.pinyin_to_phoneme_list(py)
                )
                for py in pinyin.split('|')
            ]
            result.append("|".join(ipa_item))
        return result

    def to_phoneme(self, pinyin_seq):
        """Convert a space-separated pinyin sequence to phonemes, one entry per syllable."""
        result = []
        for pinyin in pinyin_seq.split(' '):
            if '|' in pinyin:
                result.append('|'.join([utils.pinyin_to_phoneme(py) for py in pinyin.split('|')]))
            else:
                result.append(utils.pinyin_to_phoneme(pinyin))
        return result

    def to_oral(self, text, auto_split=True):
        """Replace words that have an entry in ``translation_dict``.

        Args:
            text: A string, a list of words, or a generator of words.
            auto_split: For string input, segment with jieba when true,
                otherwise split on single spaces.

        Returns:
            The words joined by spaces, each replaced word followed by ``#``;
            ``None`` when ``text`` has an unsupported type.
        """
        if isinstance(text, (list, types.GeneratorType)):
            word_list = text
        elif isinstance(text, str):
            word_list = jieba.cut(text) if auto_split else text.split(' ')
        else:
            return None
        translation_dict = self.translation_dict
        return ' '.join(
            translation_dict[word] + '#' if word in translation_dict else word
            for word in word_list
        )

    def add_word_mapping(self, user_mapping: dict):
        """Merge ``user_mapping`` into ``translation_dict`` (existing keys are overwritten)."""
        self.translation_dict.update(user_mapping)

    def single_query(self, single_char):
        """Look up the readings of one character in every configured accent.

        Returns:
            dict mapping ``'府城'`` to the unconverted ``[char, readings]`` pair
            and each accent's display name to the converted pair; ``None``
            when the input is not a single character present in ``vocab`` or
            ``vocab_extension``.
        """
        if single_char not in self.vocab and single_char not in self.vocab_extension:
            return None
        if len(single_char) > 1:
            return None
        pinyin_list = self._pinyin_heteronym(single_char)
        result_dict = {'府城': pinyin_list[0]}
        for accent_key, (_, accent_name) in self.accent_dict.items():
            result_dict[accent_name] = self.convert_accent(pinyin_list, accent=accent_key)[0]
        return result_dict