File size: 10,547 Bytes
0c354cf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 |
import os
import re
import time
start = time.time()
import cn2an
print(f"import cn2an take {time.time() - start}s")
start = time.time()
from pypinyin import lazy_pinyin, Style
print(f"import pypinyin take {time.time() - start}s")
# from text.symbols import punctuation
start = time.time()
from .symbols import language_tone_start_map
print(f"import symbols take {time.time() - start}s")
start = time.time()
from .tone_sandhi import ToneSandhi
print(f"import tone_sandhi take {time.time() - start}s")
start = time.time()
from .english import g2p as g2p_en
print(f"import english take {time.time() - start}s")
start = time.time()
# from transformers import AutoTokenizer
from .fast_tokenizer import FastTokenizer
print(f"import AutoTokenizer take {time.time() - start}s")
# Punctuation marks that replace_punctuation() keeps in the text and that
# g2p() uses as sentence delimiters.
punctuation = ["!", "?", "…", ",", ".", "'", "-"]

# Directory containing this module; data files are resolved relative to it.
current_file_path = os.path.dirname(__file__)

# Build the pinyin -> symbol-string table from the tab-separated file
# "opencpop-strict.txt" (first column: pinyin, second column: symbols).
# The file is opened in a `with` block so the handle is closed promptly, and
# with an explicit encoding so the result does not depend on the locale.
# NOTE(review): assumes the table is ASCII/UTF-8 text -- confirm.
start = time.time()
with open(
    os.path.join(current_file_path, "opencpop-strict.txt"), encoding="utf-8"
) as _pinyin_table_file:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1]
        for line in _pinyin_table_file
    }
print(f"pinyin_to_symbol_map take {time.time() - start}s")
# Replacement table applied by replace_punctuation(): every key found in the
# text is substituted with its value, collapsing many punctuation marks,
# quotes and brackets onto the small set listed in `punctuation`.
# NOTE(review): several entries map a character to itself (",", "!", "?") and
# some keys appear twice ("(", ")", "~"); presumably one of each pair was
# meant to be the full-width variant and was normalized away -- verify
# against the upstream file before relying on this table.
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "~": "-",
    "「": "'",
    "」": "'",
}
# Module-wide ToneSandhi instance used by _g2p(); it is built once at import
# time and the construction time is printed.
start = time.time()
tone_modifier = ToneSandhi()
print(f"tone_modifier take {time.time() - start}s")
def replace_punctuation(text):
    """Normalize punctuation and drop characters outside the supported set.

    Steps, in order:
      1. Replace two specific characters with substitutes
         (NOTE(review): presumably because their pinyin is not covered by
         the symbol table -- confirm).
      2. Replace every key of ``rep_map`` with its mapped value.
      3. Remove every run of characters that is not a CJK ideograph in
         U+4E00..U+9FA5, an underscore, an ASCII letter, whitespace, or one
         of the marks in ``punctuation``.
      4. Collapse each whitespace run into a single space.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")

    # Longest keys first so a multi-character key (e.g. "...") can never be
    # pre-empted by a shorter key that is a prefix of it.
    keys = sorted(rep_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    replaced_text = pattern.sub(lambda m: rep_map[m.group()], text)

    # Escape the punctuation so that characters such as "-" stay literal
    # inside the character class regardless of their position in the list.
    allowed_marks = "".join(re.escape(mark) for mark in punctuation)
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5_a-zA-Z\s" + allowed_marks + r"]+", "", replaced_text
    )
    replaced_text = re.sub(r"\s+", " ", replaced_text)
    return replaced_text
def g2p(text, impl='v2'):
    """Convert normalized text into phones, tones and a word-to-phone count.

    The text is split after every mark in ``punctuation`` (together with any
    whitespace that follows it), blank pieces are dropped, and the remaining
    sentences are handed to the selected backend.

    Args:
        text: Input text.
        impl: ``'v1'`` selects ``_g2p``; ``'v2'`` selects ``_g2p_v2``.

    Returns:
        ``(phones, tones, word2ph)`` where phones is padded with ``"_"``,
        tones with ``0`` and word2ph with ``1`` at both ends.

    Raises:
        NotImplementedError: If ``impl`` is neither ``'v1'`` nor ``'v2'``.
        AssertionError: If the backend's word2ph does not sum to the number
            of phones.
    """
    boundary = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = []
    for piece in re.split(boundary, text):
        if piece.strip() != "":
            sentences.append(piece)

    if impl not in ('v1', 'v2'):
        raise NotImplementedError()
    backend = _g2p if impl == 'v1' else _g2p_v2

    phones, tones, word2ph = backend(sentences)
    assert sum(word2ph) == len(phones)
    # A len(word2ph) == len(text) check is deliberately not enforced here;
    # the original author noted that it sometimes fails.
    return ["_"] + phones + ["_"], [0] + tones + [0], [1] + word2ph + [1]
def _get_initials_finals(word):
    """Return the initials and the finals of ``word`` as two parallel lists.

    Both lists come from ``lazy_pinyin`` with ``neutral_tone_with_five=True``:
    initials use ``Style.INITIALS`` and finals use ``Style.FINALS_TONE3``.
    The two results are paired with ``zip``, so the output lists always have
    the same length.

    Args:
        word: The word to convert.

    Returns:
        ``(initials, finals)``.
    """
    paired = list(
        zip(
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS),
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3),
        )
    )
    initials = [initial for initial, _ in paired]
    finals = [final for _, final in paired]
    return initials, finals
# Load the tokenizer used for English text from a "tokenizer.json" stored in
# a directory named after the model, next to this file. The load time is
# printed.
start = time.time()
model_id = 'bert-base-multilingual-uncased'
model_cache_path = os.path.join(current_file_path, model_id)
tokenizer = FastTokenizer(f"{model_cache_path}/tokenizer.json")
print(f"Load tokenizer take {time.time() - start}s")
# Earlier AutoTokenizer-based loading, kept for reference:
# if not os.path.exists(model_cache_path):
#     print(f"{model_id} not exist, will download...")
#     tokenizer = AutoTokenizer.from_pretrained(model_id,
#         use_fast=True,  # enable the fast (Rust-based) implementation
#         device_map="auto"  # allow loading parts of the data on demand
#     ).save_pretrained(model_cache_path)
# else:
#     print("Load tokenizer Hit cache")
#     start = time.time()
#     tokenizer = AutoTokenizer.from_pretrained(model_cache_path,  # path given after a manual download
#         local_files_only=True,
#         use_fast=True,  # enable the fast (Rust-based) implementation
#         device_map="auto"  # allow loading parts of the data on demand
#     )
#     print(f"Load tokenizer take {time.time() - start}s")
def _g2p(segments):
    """Convert sentence segments to phones and tones using jieba word cuts.

    Each segment is cut into (word, pos) pairs, merged by
    ``tone_modifier.pre_merge_for_modify`` and processed item by item:

    * words tagged ``"eng"`` are tokenized with ``tokenizer`` and converted
      by ``g2p_en``; their tones are shifted by
      ``language_tone_start_map['EN']``;
    * punctuation (initial equal to final) becomes a single phone, tone 0;
    * every other syllable is rewritten to a key of ``pinyin_to_symbol_map``
      and expanded to that entry's space-separated symbols.

    Args:
        segments: Iterable of sentence strings.

    Returns:
        ``(phones_list, tones_list, word2ph)``; ``word2ph`` holds the number
        of phones produced per item.

    Raises:
        AssertionError: If a non-pinyin item is not in ``punctuation``, a
            tone is not one of 1-5, or the rewritten pinyin is missing from
            ``pinyin_to_symbol_map``.
    """
    # Imported inside the function so jieba is only loaded when this code
    # path is actually called.
    import jieba.posseg as psg
    from itertools import chain

    # Constant rewrite tables, built once per call instead of per syllable.
    # Syllable has an initial: contract these finals.
    finals_after_initial = {
        "uei": "ui",
        "iou": "iu",
        "uen": "un",
    }
    # Syllable has no initial: whole-syllable rewrites ...
    zero_initial_syllables = {
        "ing": "ying",
        "i": "yi",
        "in": "yin",
        "u": "wu",
    }
    # ... otherwise rewrite only the leading letter.
    zero_initial_leading = {
        "v": "yu",
        "e": "e",
        "i": "y",
        "u": "w",
    }

    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        seg_cut = tone_modifier.pre_merge_for_modify(psg.lcut(seg))
        initials = []
        finals = []
        for word, pos in seg_cut:
            if pos == "eng":
                # Marker initial; the final carries the English word itself.
                initials.append(['EN_WORD'])
                finals.append([word])
            else:
                sub_initials, sub_finals = _get_initials_finals(word)
                sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
                initials.append(sub_initials)
                finals.append(sub_finals)

        # Flatten the per-word lists lazily (linear, unlike sum(lists, [])).
        flat_pairs = zip(
            chain.from_iterable(initials), chain.from_iterable(finals)
        )
        for c, v in flat_pairs:
            if c == 'EN_WORD':
                tokenized_en = tokenizer.tokenize(v)
                phones_en, tones_en, word2ph_en = g2p_en(
                    text=None, pad_start_end=False, tokenized=tokenized_en
                )
                # Apply the English tone offset.
                tones_en = [t + language_tone_start_map['EN'] for t in tones_en]
                phones_list += phones_en
                tones_list += tones_en
                word2ph += word2ph_en
                continue

            raw_pinyin = c + v
            # NOTE: post-processing of pypinyin output.
            if c == v:
                # Non-pinyin item: must be one of the kept punctuation marks.
                assert c in punctuation
                phone = [c]
                tone = "0"
                word2ph.append(1)
            else:
                # The last character of the final is the tone digit.
                v_without_tone = v[:-1]
                tone = v[-1]
                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    if v_without_tone in finals_after_initial:
                        pinyin = c + finals_after_initial[v_without_tone]
                elif pinyin in zero_initial_syllables:
                    pinyin = zero_initial_syllables[pinyin]
                elif pinyin[0] in zero_initial_leading:
                    pinyin = zero_initial_leading[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map, (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(" ")
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph
def text_normalize(text):
    """Spell out numbers in Chinese and normalize punctuation.

    Every match of ``\\d+(?:\\.?\\d+)?`` is replaced, in place, with the
    result of ``cn2an.an2cn`` for that match; the text is then passed
    through ``replace_punctuation``.

    Args:
        text: Raw input text.

    Returns:
        The normalized text.
    """
    # A single-pass substitution replaces each number exactly where it was
    # matched, instead of searching for its first occurrence again.
    text = re.sub(
        r"\d+(?:\.?\d+)?",
        lambda match: cn2an.an2cn(match.group()),
        text,
    )
    return replace_punctuation(text)
def get_bert_feature(text, word2ph, device):
    """Delegate to ``chinese_bert.get_bert_feature`` with the multilingual model.

    The sibling module is imported on first use rather than at module import.

    Args:
        text: Text passed through to the underlying function.
        word2ph: Word-to-phone counts passed through unchanged.
        device: Device specifier passed through unchanged.

    Returns:
        Whatever ``chinese_bert.get_bert_feature`` returns.
    """
    from .chinese_bert import get_bert_feature as _bert_feature_impl

    return _bert_feature_impl(
        text,
        word2ph,
        model_id='bert-base-multilingual-uncased',
        device=device,
    )
start = time.time()
from .chinese import _g2p as _chinese_g2p
print(f"import chinese g2p take {time.time() - start}s")
def _g2p_v2(segments):
    """Convert segments by routing English runs and the rest separately.

    Each segment is split into alternating pieces: runs made only of ASCII
    letters and whitespace, and everything else. Letter/whitespace runs are
    tokenized with ``tokenizer`` and converted by ``g2p_en`` (tones shifted
    by ``language_tone_start_map['EN']``); all other pieces are converted
    by ``_chinese_g2p``.

    Args:
        segments: Iterable of sentence strings.

    Returns:
        ``(phones_list, tones_list, word2ph)`` accumulated over all pieces.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    # The capturing group makes re.split keep the matched runs, so no
    # in-band sentinel string is needed to mark the boundaries.
    english_run = re.compile(r'([a-zA-Z\s]+)')

    phones_list = []
    tones_list = []
    word2ph = []
    for segment in segments:
        pieces = [piece for piece in english_run.split(segment) if piece]
        for piece in pieces:
            if english_run.match(piece):
                # English (or whitespace-only) run.
                tokenized_en = tokenizer.tokenize(piece)
                phones_en, tones_en, word2ph_en = g2p_en(
                    text=None, pad_start_end=False, tokenized=tokenized_en
                )
                # Apply the English tone offset.
                tones_en = [t + language_tone_start_map['EN'] for t in tones_en]
                phones_list += phones_en
                tones_list += tones_en
                word2ph += word2ph_en
            else:
                phones_zh, tones_zh, word2ph_zh = _chinese_g2p([piece])
                phones_list += phones_zh
                tones_list += tones_zh
                word2ph += word2ph_zh
    return phones_list, tones_list, word2ph
if __name__ == "__main__":
    # Manual smoke test. Only the last sample is run; the others are kept as
    # alternative inputs.
    samples = [
        # The backslash is written as "\\" because "\游" is an invalid
        # escape sequence; the string value is unchanged.
        "NFT啊!chemistry 但是《原神》是由,米哈\\游自主, [研发]的一款全.新开放世界.冒险游戏",
        '我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。',
        '今天下午,我们准备去shopping mall购物,然后晚上去看一场movie。',
        '我们现在 also 能够 help 很多公司 use some machine learning 的 algorithms 啊!',
    ]
    text = text_normalize(samples[-1])
    print(text)
    phones, tones, word2ph = g2p(text, impl='v2')
    # NOTE(review): the device is hard-coded to the first CUDA GPU.
    bert = get_bert_feature(text, word2ph, device='cuda:0')
    print(phones)
|