| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| from typing import List |
| from typing import Tuple |
|
|
| import jieba_fast as jieba |
| from pypinyin import lazy_pinyin |
| from pypinyin import Style |
|
|
|
|
class ToneSandhi:
    """Rule-based tone adjustment for segmented Chinese words.

    Every ``finals`` argument is a list holding one string per character of
    the word; the last character of each string is the tone digit (the merge
    helpers obtain such lists from ``lazy_pinyin`` with
    ``Style.FINALS_TONE3``).  The ``*_sandhi`` methods rewrite that digit,
    while the ``_merge_*`` methods regroup ``(word, pos)`` segments first so
    that the rules see the units they expect.
    """

    def __init__(self):
        # Words whose last syllable is forced to tone "5" by _neural_sandhi.
        # NOTE: the "neural" spelling in the attribute names is kept as-is so
        # that existing callers reading these attributes keep working.
        self.must_neural_tone_words = {
            "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨",
            "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠",
            "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么",
            "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主",
            "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳",
            "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄",
            "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在",
            "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同",
            "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太",
            "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神",
            "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
            "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗",
            "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛",
            "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意",
            "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务",
            "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心",
            "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头",
            "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃",
            "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友",
            "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防",
            "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举",
            "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
            "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么",
            "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通",
            "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌",
            "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头",
            "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆",
            "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精",
            "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么",
            "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕",
            "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚",
            "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤",
            "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
            "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析",
            "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟",
            "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么",
            "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气",
            "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司",
            "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", "咕噜", "邋遢", "费用",
            "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾",
            "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜",
            "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记",
        }
        # Words exempt from the reduplication rule and the 们/子 suffix rule
        # in _neural_sandhi.
        self.must_not_neural_tone_words = {
            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人",
            "虎虎", "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈",
            "花花草草", "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳",
            "佼佼", "吵吵", "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青",
        }
        # A character from this string directly after "一" stops _yi_sandhi
        # from switching "一" to tone 4.
        self.punc = ":,;。?!“”‘’':,;.?!"

    @staticmethod
    def _with_tone(final: str, tone: str) -> str:
        """Return ``final`` with its last character replaced by ``tone``."""
        return final[:-1] + tone

    def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
        """Apply the tone-"5" rules to ``finals`` and return the result.

        Args:
            word: The segmented word.
            pos: Part-of-speech tag of ``word``.
            finals: One tone-suffixed final per character; mutated in place
                by the first stage.

        Returns:
            A new list built from the two halves produced by ``_split_word``.
        """
        # Repeated characters in noun/verb/adjective words get tone 5 on the
        # repeat.  ``pos[:1]`` (rather than ``pos[0]``) tolerates an empty tag.
        redup_allowed = (
            pos[:1] in {"n", "v", "a"}
            and word not in self.must_not_neural_tone_words
        )
        if redup_allowed:
            for j in range(1, len(word)):
                if word[j] == word[j - 1]:
                    finals[j] = self._with_tone(finals[j], "5")

        ge_idx = word.find("个")
        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
            # Word ends with one of the listed particles.
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) >= 1 and word[-1] in "的地得":
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
            # Single-character 了/着/过 carrying one of the listed tags.
            finals[-1] = self._with_tone(finals[-1], "5")
        elif (
            len(word) > 1
            and word[-1] in "们子"
            and pos in {"r", "n"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
            finals[-1] = self._with_tone(finals[-1], "5")
        elif (
            ge_idx >= 1
            and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
        ) or word == "个":
            # "个" after a numeral or one of the listed characters, or alone.
            finals[ge_idx] = self._with_tone(finals[ge_idx], "5")
        elif (
            word in self.must_neural_tone_words
            or word[-2:] in self.must_neural_tone_words
        ):
            finals[-1] = self._with_tone(finals[-1], "5")

        # Second stage: apply the lexicon to each half of the word separately.
        word_list = self._split_word(word)
        head_len = len(word_list[0])
        finals_list = [finals[:head_len], finals[head_len:]]
        for i, sub_word in enumerate(word_list):
            if (
                sub_word in self.must_neural_tone_words
                or sub_word[-2:] in self.must_neural_tone_words
            ):
                finals_list[i][-1] = self._with_tone(finals_list[i][-1], "5")
        return finals_list[0] + finals_list[1]

    def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Adjust the tone of "不" inside ``word``; mutates and returns ``finals``."""
        if len(word) == 3 and word[1] == "不":
            # "不" in the middle of a three-character word becomes tone 5.
            finals[1] = self._with_tone(finals[1], "5")
            return finals
        for i, char in enumerate(word):
            # "不" directly before a tone-4 syllable becomes tone 2.
            if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
                finals[i] = self._with_tone(finals[i], "2")
        return finals

    def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Adjust the tone of "一" inside ``word``; mutates and returns ``finals``."""
        if "一" in word and all(ch.isnumeric() for ch in word if ch != "一"):
            # Every other character is numeric: leave the tones untouched.
            return finals
        if len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
            # "一" between two identical characters becomes tone 5.
            finals[1] = self._with_tone(finals[1], "5")
        elif word.startswith("第一"):
            finals[1] = self._with_tone(finals[1], "1")
        else:
            for i, char in enumerate(word):
                if char != "一" or i + 1 >= len(word):
                    continue
                if finals[i + 1][-1] == "4":
                    # Before a tone-4 syllable "一" becomes tone 2.
                    finals[i] = self._with_tone(finals[i], "2")
                elif word[i + 1] not in self.punc:
                    # Otherwise tone 4, unless punctuation follows.
                    finals[i] = self._with_tone(finals[i], "4")
        return finals

    def _split_word(self, word: str) -> List[str]:
        """Split ``word`` in two around the shortest jieba search-mode token.

        Returns a two-element list ``[head, tail]``.
        """
        candidates = sorted(jieba.cut_for_search(word), key=len)
        shortest = candidates[0]
        if word.find(shortest) == 0:
            return [shortest, word[len(shortest):]]
        # NOTE(review): this slice treats the shortest token as a suffix; if
        # it first occurs in the middle of the word the two parts overlap.
        # Behavior is kept as in the original implementation.
        return [word[: -len(shortest)], shortest]

    def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Apply third-tone sandhi to words of two, three or four characters.

        Two-character and all-tone-3 three-character words are modified in
        place; the remaining branches return a newly built list.
        """
        if len(word) == 2 and self._all_tone_three(finals):
            finals[0] = self._with_tone(finals[0], "2")
        elif len(word) == 3:
            word_list = self._split_word(word)
            head_len = len(word_list[0])
            if self._all_tone_three(finals):
                if head_len == 2:
                    # two + one characters: the first two become tone 2
                    finals[0] = self._with_tone(finals[0], "2")
                    finals[1] = self._with_tone(finals[1], "2")
                elif head_len == 1:
                    # one + two characters: only the middle one becomes tone 2
                    finals[1] = self._with_tone(finals[1], "2")
            else:
                finals_list = [finals[:head_len], finals[head_len:]]
                for i, sub in enumerate(finals_list):
                    if self._all_tone_three(sub) and len(sub) == 2:
                        # a two-syllable half that is entirely tone 3
                        sub[0] = self._with_tone(sub[0], "2")
                    elif (
                        i == 1
                        and not self._all_tone_three(sub)
                        and sub[0].endswith("3")
                        and finals_list[0][-1].endswith("3")
                    ):
                        # tone 3 on both sides of the split boundary
                        finals_list[0][-1] = self._with_tone(finals_list[0][-1], "2")
                finals = finals_list[0] + finals_list[1]
        elif len(word) == 4:
            # Four-character words are handled as two independent pairs.
            pairs = [finals[:2], finals[2:]]
            finals = []
            for sub in pairs:
                if self._all_tone_three(sub):
                    sub[0] = self._with_tone(sub[0], "2")
                finals += sub
        return finals

    def _all_tone_three(self, finals: List[str]) -> bool:
        """Return True when every final ends with "3" (True for an empty list)."""
        # ``endswith`` instead of ``x[-1]`` so that an empty final string
        # counts as "not tone 3" rather than raising IndexError.
        return all(x.endswith("3") for x in finals)

    def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Glue a standalone "不" onto the segment that follows it.

        The merged segment keeps the following word's tag; a trailing "不"
        with nothing after it is emitted with tag "d".
        """
        new_seg = []
        last_word = ""
        for word, pos in seg:
            if last_word == "不":
                word = last_word + word
            if word != "不":
                new_seg.append((word, pos))
            last_word = word
        if last_word == "不":
            new_seg.append((last_word, "d"))
        return new_seg

    def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Merge "一" with its neighbours.

        Pass 1 joins ``X 一 X`` into one segment when both ``X`` segments are
        tagged "v".  Pass 2 glues every remaining standalone "一" onto the
        segment after it.  The returned items are two-element ``[word, pos]``
        lists.
        """
        # Pass 1.  The previous implementation indexed the output list with
        # the *input* position, which raised IndexError once any earlier
        # segment had been merged away, and it could drop a word that merely
        # followed "一".  Scanning forward and always extending the last
        # emitted segment avoids both problems.
        merged = []
        total = len(seg)
        i = 0
        while i < total:
            word, pos = seg[i]
            if (
                word == "一"
                and 0 < i < total - 1
                and seg[i - 1][0] == seg[i + 1][0]
                and seg[i - 1][1] == "v"
                and seg[i + 1][1] == "v"
            ):
                merged[-1][0] = merged[-1][0] + "一" + seg[i + 1][0]
                i += 2  # the repeated verb has been consumed
            else:
                merged.append([word, pos])
                i += 1

        # Pass 2.
        new_seg = []
        for word, pos in merged:
            if new_seg and new_seg[-1][0] == "一":
                new_seg[-1][0] = new_seg[-1][0] + word
            else:
                new_seg.append([word, pos])
        return new_seg

    def _seg_finals(self, seg: List[Tuple[str, str]]) -> List[List[str]]:
        """Return the tone-suffixed finals of every word in ``seg``."""
        return [
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            for word, _ in seg
        ]

    def _merge_tone_neighbors(
        self, seg: List[Tuple[str, str]], tone_match: List[bool]
    ) -> List[List[str]]:
        """Append ``seg[i]`` to the previous segment where ``tone_match[i]`` holds.

        A merge additionally requires that the previous segment was not itself
        merged, is not a two-character reduplication, and that the two
        original words together have at most three characters.
        """
        new_seg = []
        merged_prev = False
        for i, (word, pos) in enumerate(seg):
            can_merge = (
                i >= 1
                and tone_match[i]
                and not merged_prev
                and not self._is_reduplication(seg[i - 1][0])
                and len(seg[i - 1][0]) + len(word) <= 3
            )
            if can_merge:
                new_seg[-1][0] = new_seg[-1][0] + word
            else:
                new_seg.append([word, pos])
            merged_prev = can_merge
        return new_seg

    def _merge_continuous_three_tones(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        """Merge adjacent segments whose syllables are all tone 3.

        The returned items are two-element ``[word, pos]`` lists.
        """
        finals = self._seg_finals(seg)
        tone_match = [False] + [
            self._all_tone_three(prev) and self._all_tone_three(cur)
            for prev, cur in zip(finals, finals[1:])
        ]
        return self._merge_tone_neighbors(seg, tone_match)

    def _is_reduplication(self, word: str) -> bool:
        """Return True for a two-character word made of one repeated character."""
        return len(word) == 2 and word[0] == word[1]

    def _merge_continuous_three_tones_2(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        """Merge adjacent segments that meet at a tone-3 / tone-3 boundary.

        Only the last syllable of the previous segment and the first syllable
        of the current one are inspected.  The returned items are two-element
        ``[word, pos]`` lists.
        """
        finals = self._seg_finals(seg)
        # Emptiness guards keep a word without finals from aborting the pass.
        tone_match = [False] + [
            bool(prev)
            and bool(cur)
            and prev[-1].endswith("3")
            and cur[0].endswith("3")
            for prev, cur in zip(finals, finals[1:])
        ]
        return self._merge_tone_neighbors(seg, tone_match)

    def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Glue "儿" onto the preceding segment unless that segment is "#".

        The returned items are two-element ``[word, pos]`` lists.
        """
        new_seg = []
        for i, (word, pos) in enumerate(seg):
            if i >= 1 and word == "儿" and seg[i - 1][0] != "#":
                new_seg[-1][0] = new_seg[-1][0] + word
            else:
                new_seg.append([word, pos])
        return new_seg

    def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Join a segment with the previous one when the two words are identical.

        The returned items are two-element ``[word, pos]`` lists.
        """
        new_seg = []
        for word, pos in seg:
            if new_seg and word == new_seg[-1][0]:
                new_seg[-1][0] = new_seg[-1][0] + word
            else:
                new_seg.append([word, pos])
        return new_seg

    def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Run every merge pass over ``seg`` in a fixed order.

        The 一 pass and both third-tone passes are best-effort: when one of
        them raises, a message is printed and ``seg`` continues unchanged
        into the next pass.
        """
        seg = self._merge_bu(seg)
        # ``except Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt and SystemExit still propagate.
        try:
            seg = self._merge_yi(seg)
        except Exception:
            print("_merge_yi failed")
        seg = self._merge_reduplication(seg)
        try:
            seg = self._merge_continuous_three_tones(seg)
        except Exception:
            print("_merge_continuous_three_tones failed")
        try:
            seg = self._merge_continuous_three_tones_2(seg)
        except Exception:
            print("_merge_continuous_three_tones_2 failed")

        seg = self._merge_er(seg)
        return seg

    def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
        """Apply the 不, 一, tone-5 and third-tone rules, in that order."""
        finals = self._bu_sandhi(word, finals)
        finals = self._yi_sandhi(word, finals)
        finals = self._neural_sandhi(word, pos, finals)
        finals = self._three_sandhi(word, finals)
        return finals
|
|