File size: 2,754 Bytes
7b0a02f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

import re
import cn2an
from .syllable import *

num_map = {  
    '0': '零', '1': '【一 幺】', '2': '二', '3': '三', '4': '四', '5': '五', '6': '六', '7': '七', '8': '八', '9': '九'  
}



# 将数字直接转为汉字,适用于年份、电话号、身份证号等情况,15->一五、幺五 ; 110->一一零、幺幺零
def num_to_chinese(num):
    chinese_num = ""
    for ch in num:
        if ch in num_map.keys():
            chinese_num+=num_map[ch]
        else:
            chinese_num+=ch
    # chinese_num = HanziConv.toSimplified(str(num))
    return chinese_num

# 将数字智能转换,适用于表示数值的情况, 15->十五, 110->一百一十
def num_to_chinese_smart(num):
    return cn2an.transform(num,'an2cn')

# 在
def add_spaces_around_punctuation(text):
    punctuation = r"[,。?!、]"
    pattern = rf'(?<!\s)({punctuation})(?!\s)'

    replaced_text = re.sub(pattern, r' \1 ', text)

    return replaced_text

def load_dict(dict_path='./dict_data/vocab/origin_vocab.txt'):
    vocab = dict()
    with open(dict_path,'r',encoding='utf-8')as fr:
        for line in fr.readlines():
            if len(line.strip())==0:
                continue
            if line.strip().startswith('%'):
                continue
            left_item,right_item=line.strip().split('#',maxsplit=1)
            vocab[left_item]=right_item
    return vocab


# 整合jieba库分割的词汇,因为遇到#会分割。
def preprocess_generator(input_gen):
    result = []

    for item in input_gen:
        if item == '#':
            # 将#与前一个词合并
            if result:
                result[-1] += '#'
        elif item == ' ':
            # 忽略空字符串
            continue
        else:
            # 直接添加到结果中
            result.append(item)

    return result


def pinyin_to_phoneme(pinyin):
    tone = pinyin[-1]
    initial_final = pinyin[:-1]

    if initial_final in NASALS:
        return pinyin
    
    if initial_final[0] not in INITIALS:
        return pinyin

    split_idx=1
    if initial_final[:2] in ['bh','gh','ng']:
        split_idx=2

    initial = initial_final[:split_idx]
    final = initial_final[split_idx:]

    return "{} {}{}".format(initial,final,tone)

def pinyin_to_phoneme_list(pinyin):
    if len(pinyin)<=1:
        return pinyin
    
    tone = pinyin[-1]
    initial_final = pinyin[:-1]

    if initial_final in NASALS:
        return [initial_final, tone]
    
    if initial_final[0] not in INITIALS:
        return [initial_final, tone]

    split_idx=1
    if initial_final[:2] in ['bh','gh','ng']:
        split_idx=2

    initial = initial_final[:split_idx]
    final = initial_final[split_idx:]

    return [initial,final,tone]