biosn2 committed on
Commit
5a7300e
·
verified ·
1 Parent(s): 8c2f991

Upload indextts/utils/front.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. indextts/utils/front.py +537 -0
indextts/utils/front.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ import traceback
4
+ import re
5
+ from typing import List, Union, overload
6
+ import warnings
7
+ from indextts.utils.common import tokenize_by_CJK_char, de_tokenized_by_CJK_char
8
+ from sentencepiece import SentencePieceProcessor
9
+
10
+
11
class TextNormalizer:
    """Normalize mixed Chinese/English text before tokenization.

    The class wraps a Chinese and an English normalizer backend (created by
    ``load``). While a backend runs, pinyin-with-tone tokens and dotted person
    names are hidden behind placeholders so the backend cannot rewrite them.
    Finally punctuation is mapped onto a small unified set via ``char_rep_map``.
    """

    # Matches "pinyin + tone digit" (tones 1-5, where 5 is the neutral tone),
    # e.g. xuan4, jve2, ying1, zhong4, shang5.
    # It must not match ordinary words followed by a digit such as beta1 or voice2.
    PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"

    # Matches person names written as Chinese·Chinese or Chinese·Chinese-Chinese,
    # e.g. 克里斯托弗·诺兰, 约瑟夫·高登-莱维特.
    NAME_PATTERN = r"[\u4e00-\u9fff]+(?:[-·—][\u4e00-\u9fff]+){1,2}"

    # Matches common English "'s" contractions; it is only used to expand them
    # to " is" and intentionally does not match every "'s".
    ENGLISH_CONTRACTION_PATTERN = r"(what|where|who|which|how|t?here|it|s?he|that|this)'s"

    def __init__(self):
        # Backends are created by load(); normalize() refuses to run without them.
        self.zh_normalizer = None
        self.en_normalizer = None
        # Punctuation unification table. Full-width forms are spelled with
        # explicit \uXXXX escapes so they cannot be silently folded into their
        # ASCII look-alikes (which would produce duplicate / identity entries).
        self.char_rep_map = {
            "\uff1a": ",",  # full-width colon
            ":": ",",
            "\uff1b": ",",  # full-width semicolon
            ";": ",",
            "\uff0c": ",",  # full-width comma
            "。": ".",
            "\uff01": "!",  # full-width exclamation mark
            "\uff1f": "?",  # full-width question mark
            "\n": " ",
            "·": "-",
            "、": ",",
            "...": "…",
            ",,,": "…",
            "\uff0c\uff0c\uff0c": "…",  # three full-width commas
            "……": "…",
            "“": "'",
            "”": "'",
            '"': "'",
            "‘": "'",
            "’": "'",
            "(": "'",
            ")": "'",
            "\uff08": "'",  # full-width left parenthesis
            "\uff09": "'",  # full-width right parenthesis
            "《": "'",
            "》": "'",
            "【": "'",
            "】": "'",
            "[": "'",
            "]": "'",
            "—": "-",
            "~": "-",
            "\uff5e": "-",  # full-width tilde
            "「": "'",
            "」": "'",
        }
        # The Chinese branch additionally maps "$" to ".".
        self.zh_char_rep_map = {
            "$": ".",
            **self.char_rep_map,
        }

    def match_email(self, email):
        """Return True if ``email`` looks like ``alnum@alnum.letters``."""
        pattern = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
        return re.match(pattern, email) is not None

    def use_chinese(self, s):
        """Decide whether ``s`` should go through the Chinese normalizer.

        True when the text contains a CJK ideograph, contains no ASCII letter,
        is an e-mail address, or contains a pinyin-with-tone token.
        """
        has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
        has_alpha = bool(re.search(r"[a-zA-Z]", s))
        is_email = self.match_email(s)
        if has_chinese or not has_alpha or is_email:
            return True

        return bool(re.search(TextNormalizer.PINYIN_TONE_PATTERN, s, re.IGNORECASE))

    def load(self):
        """Create the Chinese and English normalizer backends if not yet loaded.

        On macOS (``platform.system() == "Darwin"``) the ``wetext`` package is
        used; elsewhere the ``tn`` package is used with a dedicated cache
        directory next to this file.
        """
        import platform

        if self.zh_normalizer is not None and self.en_normalizer is not None:
            return
        if platform.system() == "Darwin":
            from wetext import Normalizer

            self.zh_normalizer = Normalizer(remove_erhua=False, lang="zh", operator="tn")
            self.en_normalizer = Normalizer(lang="en", operator="tn")
        else:
            from tn.chinese.normalizer import Normalizer as NormalizerZh
            from tn.english.normalizer import Normalizer as NormalizerEn

            # Use a separate cache dir because the tagger rules are built with
            # remove_interjections and remove_erhua disabled.
            cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tagger_cache")
            if not os.path.exists(cache_dir):
                # exist_ok guards against another process creating it meanwhile.
                os.makedirs(cache_dir, exist_ok=True)
                with open(os.path.join(cache_dir, ".gitignore"), "w") as f:
                    f.write("*\n")
            self.zh_normalizer = NormalizerZh(
                cache_dir=cache_dir, remove_interjections=False, remove_erhua=False, overwrite_cache=False
            )
            self.en_normalizer = NormalizerEn(overwrite_cache=False)

    def normalize(self, text: str) -> str:
        """Normalize ``text`` and unify its punctuation.

        Returns an empty string when the backends have not been loaded, or when
        the Chinese backend raises. When the English backend raises, the text
        (with contractions expanded) is used without backend normalization.
        """
        text = text.replace("嗯", "恩").replace("呣", "母")
        if not self.zh_normalizer or not self.en_normalizer:
            print("Error, text normalizer is not initialized !!!")
            return ""
        # The language decision is taken on the text before contractions are expanded.
        chinese = self.use_chinese(text)
        text = re.sub(TextNormalizer.ENGLISH_CONTRACTION_PATTERN, r"\1 is", text, flags=re.IGNORECASE)
        if chinese:
            replaced_text, pinyin_list = self.save_pinyin_tones(text.rstrip())
            replaced_text, original_name_list = self.save_names(replaced_text)
            try:
                result = self.zh_normalizer.normalize(replaced_text)
            except Exception:
                # NOTE(review): the text is dropped here while the English branch
                # falls back to the input — confirm this asymmetry is intended.
                result = ""
                print(traceback.format_exc())
            # Restore names first, then pinyin (reverse order of masking).
            result = self.restore_names(result, original_name_list)
            result = self.restore_pinyin_tones(result, pinyin_list)
            return self._replace_chars(result, self.zh_char_rep_map)

        try:
            result = self.en_normalizer.normalize(text)
        except Exception:
            result = text
            print(traceback.format_exc())
        return self._replace_chars(result, self.char_rep_map)

    @staticmethod
    def _replace_chars(text, rep_map):
        """Replace every key of ``rep_map`` found in ``text`` by its value."""
        if not rep_map:
            return text
        # Regex alternation is ordered: try longer keys first so that e.g. a
        # three-comma key is not shadowed by a single-comma key.
        keys = sorted(rep_map, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(k) for k in keys))
        return pattern.sub(lambda m: rep_map[m.group()], text)

    @staticmethod
    def _placeholder_suffix(index):
        """Return a letters-only suffix for ``index``: a..z, then aa, ab, ..."""
        letters = []
        index += 1
        while index > 0:
            index, rem = divmod(index - 1, 26)
            letters.append(chr(ord("a") + rem))
        return "".join(reversed(letters))

    @staticmethod
    def _mask_matches(pattern, text, prefix):
        """Replace each regex match in ``text`` with ``<prefix_x>``.

        Identical matches share one placeholder. Only real regex matches are
        replaced (in a single left-to-right pass), so a short item can never
        corrupt a longer one that contains it. Returns ``(text, None)`` when
        nothing matched, else ``(masked_text, originals)`` in first-seen order.
        """
        originals = []
        slots = {}

        def _mask(match):
            found = match.group(0)
            if found not in slots:
                slots[found] = len(originals)
                originals.append(found)
            return f"<{prefix}_{TextNormalizer._placeholder_suffix(slots[found])}>"

        masked = pattern.sub(_mask, text)
        if not originals:
            return (text, None)
        return masked, originals

    def correct_pinyin(self, pinyin: str):
        """Rewrite u/ü after j, q, x as v and upper-case the result.

        Examples: ju -> jv, que -> qve, xün -> xvn. Pinyin that does not start
        with j/q/x (and the empty string) is returned unchanged.
        """
        if not pinyin or pinyin[0] not in "jqxJQX":
            return pinyin
        pattern = r"([jqx])[uü](n|e|an)*(\d)"
        repl = r"\g<1>v\g<2>\g<3>"
        pinyin = re.sub(pattern, repl, pinyin, flags=re.IGNORECASE)
        return pinyin.upper()

    def save_names(self, original_text):
        """Replace person names with placeholders ``<n_a>``, ``<n_b>``, ...

        Example: 克里斯托弗·诺兰 -> <n_a>. Returns ``(text, name_list)``;
        ``name_list`` is None when no name was found.
        """
        name_pattern = re.compile(TextNormalizer.NAME_PATTERN, re.IGNORECASE)
        return self._mask_matches(name_pattern, original_text, "n")

    def restore_names(self, normalized_text, original_name_list):
        """Put the original names back: ``<n_a>`` -> ``original_name_list[0]``, ..."""
        if not original_name_list:
            return normalized_text

        transformed_text = normalized_text
        for i, name in enumerate(original_name_list):
            placeholder = f"<n_{self._placeholder_suffix(i)}>"
            transformed_text = transformed_text.replace(placeholder, name)
        return transformed_text

    def save_pinyin_tones(self, original_text):
        """Replace pinyin-with-tone tokens with ``<pinyin_a>``, ``<pinyin_b>``, ...

        Example: xuan4 -> <pinyin_a>. Returns ``(text, pinyin_list)``;
        ``pinyin_list`` is None when no pinyin was found.
        """
        pinyin_pattern = re.compile(TextNormalizer.PINYIN_TONE_PATTERN, re.IGNORECASE)
        return self._mask_matches(pinyin_pattern, original_text, "pinyin")

    def restore_pinyin_tones(self, normalized_text, original_pinyin_list):
        """Put the pinyin back, passing each item through ``correct_pinyin``.

        ``<pinyin_a>`` -> ``correct_pinyin(original_pinyin_list[0])``, ...
        """
        if not original_pinyin_list:
            return normalized_text

        transformed_text = normalized_text
        for i, pinyin in enumerate(original_pinyin_list):
            placeholder = f"<pinyin_{self._placeholder_suffix(i)}>"
            transformed_text = transformed_text.replace(placeholder, self.correct_pinyin(pinyin))
        return transformed_text
230
+
231
+
232
class TextTokenizer:
    """SentencePiece tokenizer with optional normalization and CJK pre-tokenization."""

    # Tokens after which split_sentences() may end a sentence.
    punctuation_marks_tokens = [
        ".",
        "!",
        "?",
        "▁.",
        # "▁!", # unk
        "▁?",
        "▁...",  # ellipsis
    ]

    def __init__(self, vocab_file: str, normalizer: Union["TextNormalizer", None] = None):
        """Load the SentencePiece model and, if given, the normalizer backends.

        Raises:
            ValueError: if ``vocab_file`` is None or does not exist.
        """
        self.vocab_file = vocab_file
        self.normalizer = normalizer

        if self.vocab_file is None:
            raise ValueError("vocab_file is None")
        if not os.path.exists(self.vocab_file):
            raise ValueError(f"vocab_file {self.vocab_file} does not exist")
        if self.normalizer:
            self.normalizer.load()
        # Load the vocabulary / model.
        self.sp_model = SentencePieceProcessor(model_file=self.vocab_file)

        # Callables applied, in order, to the text before SentencePiece encoding.
        self.pre_tokenizers = [
            tokenize_by_CJK_char,
        ]

    @property
    def vocab_size(self):
        return self.sp_model.GetPieceSize()

    @property
    def unk_token(self):
        return "<unk>"

    @property
    def pad_token(self):
        return None

    @property
    def bos_token(self):
        return "<s>"

    @property
    def eos_token(self):
        return "</s>"

    @property
    def pad_token_id(self):
        return -1

    @property
    def bos_token_id(self):
        return 0

    @property
    def eos_token_id(self):
        return 1

    @property
    def unk_token_id(self):
        return self.sp_model.unk_id()

    @property
    def special_tokens_map(self):
        return {
            "unk_token": self.unk_token,
            "pad_token": self.pad_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
        }

    def get_vocab(self):
        """Return a ``{piece: id}`` mapping for every id below ``vocab_size``."""
        return {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}

    @overload
    def convert_ids_to_tokens(self, ids: int) -> str: ...

    @overload
    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]: ...

    def convert_ids_to_tokens(self, ids: Union[List[int], int]):
        """Map one id or a list of ids to piece string(s)."""
        return self.sp_model.IdToPiece(ids)

    def convert_tokens_to_ids(self, tokens: Union[List[str], str]) -> List[int]:
        """Map one piece or a list of pieces to a list of ids."""
        if isinstance(tokens, str):
            tokens = [tokens]
        return [self.sp_model.PieceToId(token) for token in tokens]

    def tokenize(self, text: str) -> List[str]:
        """Encode ``text`` and return the pieces as strings."""
        return self.encode(text, out_type=str)

    def encode(self, text: str, **kwargs):
        """Normalize, pre-tokenize and encode ``text``.

        Empty text yields ``[]``. Text that is a single character after
        stripping is encoded directly, skipping normalization and
        pre-tokenization. ``out_type`` defaults to ``int``; remaining keyword
        arguments are forwarded to ``SentencePieceProcessor.Encode``.
        """
        if len(text) == 0:
            return []
        out_type = kwargs.pop("out_type", int)
        if len(text.strip()) == 1:
            return self.sp_model.Encode(text, out_type=out_type, **kwargs)
        if self.normalizer:
            text = self.normalizer.normalize(text)
        for pre_tokenizer in self.pre_tokenizers:
            text = pre_tokenizer(text)
        return self.sp_model.Encode(text, out_type=out_type, **kwargs)

    def batch_encode(self, texts: List[str], **kwargs):
        """Normalize, pre-tokenize and encode every text in ``texts``."""
        out_type = kwargs.pop("out_type", int)
        if self.normalizer:
            texts = [self.normalizer.normalize(text) for text in texts]
        for pre_tokenizer in self.pre_tokenizers:
            texts = [pre_tokenizer(text) for text in texts]
        return self.sp_model.Encode(texts, out_type=out_type, **kwargs)

    def decode(self, ids: Union[List[int], int], do_lower_case=False, **kwargs):
        """Decode id(s) to text and undo the CJK pre-tokenization."""
        if isinstance(ids, int):
            ids = [ids]
        out_type = kwargs.pop("out_type", str)
        decoded = self.sp_model.Decode(ids, out_type=out_type, **kwargs)
        return de_tokenized_by_CJK_char(decoded, do_lower_case=do_lower_case)

    @staticmethod
    def split_sentences_by_token(
        tokenized_str: List[str], split_tokens: List[str], max_tokens_per_sentence: int
    ) -> List[List[str]]:
        """Split a token list into sentences of at most ``max_tokens_per_sentence``.

        A sentence ends after a token from ``split_tokens`` once it holds more
        than two tokens; a directly following quote token is kept with that
        sentence when it still fits. A run exceeding the limit is split on
        commas, then on "-", and finally by length (with a RuntimeWarning).
        Adjacent sentences are merged again while they fit within the limit.

        Raises:
            ValueError: if the input is non-empty and the limit is below 1.
        """
        if len(tokenized_str) == 0:
            return []
        if max_tokens_per_sentence < 1:
            raise ValueError("max_tokens_per_sentence must be at least 1")
        return TextTokenizer._split_with_fallbacks(
            tokenized_str, split_tokens, max_tokens_per_sentence, frozenset()
        )

    @staticmethod
    def _split_with_fallbacks(tokenized_str, split_tokens, max_tokens_per_sentence, tried):
        """Do the splitting; ``tried`` names the fallbacks already used in this chain."""
        quote_tokens = ("'", "▁'")
        comma_tokens = [",", "▁,"]
        sentences: List[List[str]] = []
        current_sentence: List[str] = []
        total = len(tokenized_str)
        # An explicit index is needed: a consumed look-ahead token must really
        # be skipped (assigning to a for-range loop variable has no effect).
        pos = 0
        while pos < total:
            token = tokenized_str[pos]
            current_sentence.append(token)
            pos += 1
            if len(current_sentence) <= max_tokens_per_sentence:
                if token in split_tokens and len(current_sentence) > 2:
                    if (
                        pos < total
                        and tokenized_str[pos] in quote_tokens
                        and len(current_sentence) < max_tokens_per_sentence
                    ):
                        # Keep a closing quote with its sentence, exactly once
                        # and only while the limit is respected.
                        current_sentence.append(tokenized_str[pos])
                        pos += 1
                    sentences.append(current_sentence)
                    current_sentence = []
                continue

            # The current run exceeds the limit.
            splitting_on_comma = any(t in split_tokens for t in comma_tokens)
            has_comma = any(t in current_sentence for t in comma_tokens)
            if not splitting_on_comma and "comma" not in tried and has_comma:
                sub_sentences = TextTokenizer._split_with_fallbacks(
                    current_sentence, comma_tokens, max_tokens_per_sentence, tried | {"comma"}
                )
            elif "-" not in split_tokens and "dash" not in tried and "-" in current_sentence:
                sub_sentences = TextTokenizer._split_with_fallbacks(
                    current_sentence, ["-"], max_tokens_per_sentence, tried | {"dash"}
                )
            else:
                # Each fallback is used at most once per chain, so comma and
                # dash splitting cannot recurse into each other forever.
                sub_sentences = [
                    current_sentence[j : j + max_tokens_per_sentence]
                    for j in range(0, len(current_sentence), max_tokens_per_sentence)
                ]
                warnings.warn(
                    f"The tokens length of sentence exceeds limit: {max_tokens_per_sentence}, "
                    f"Tokens in sentence: {current_sentence}."
                    "Maybe unexpected behavior",
                    RuntimeWarning,
                )
            sentences.extend(sub_sentences)
            current_sentence = []
        if current_sentence:
            sentences.append(current_sentence)

        # Merge neighbours whose combined length still fits the limit.
        merged_sentences: List[List[str]] = []
        for sentence in sentences:
            if len(sentence) == 0:
                continue
            if merged_sentences and len(merged_sentences[-1]) + len(sentence) <= max_tokens_per_sentence:
                merged_sentences[-1] = merged_sentences[-1] + sentence
            else:
                merged_sentences.append(sentence)
        return merged_sentences

    def split_sentences(self, tokenized: List[str], max_tokens_per_sentence=120) -> List[List[str]]:
        """Split ``tokenized`` on ``punctuation_marks_tokens``."""
        return TextTokenizer.split_sentences_by_token(
            tokenized, self.punctuation_marks_tokens, max_tokens_per_sentence=max_tokens_per_sentence
        )
430
+
431
+
432
if __name__ == "__main__":
    # Manual smoke test for the normalizer and tokenizer.
    # It needs the normalizer backends and the model file "checkpoints/bpe.model".

    text_normalizer = TextNormalizer()

    cases = [
        "IndexTTS 正式发布1.0版本了,效果666",
        "晕XUAN4是一种GAN3觉",
        "我爱你!",
        "I love you!",
        "“我爱你”的英语是“I love you”",
        "2.5平方电线",
        "共465篇,约315万字",
        "2002年的第一场雪,下在了2003年",
        "速度是10km/h",
        "现在是北京时间2025年01月11日 20:00",
        "他这条裤子是2012年买的,花了200块钱",
        "电话:135-4567-8900",
        "1键3连",
        "他这条视频点赞3000+,评论1000+,收藏500+",
        "这是1024元的手机,你要吗?",
        "受不liao3你了",
        "“衣裳”不读衣chang2,而是读衣shang5",
        "最zhong4要的是:不要chong2蹈覆辙",
        "不zuo1死就不会死",
        "See you at 8:00 AM",
        "8:00 AM 开会",
        "Couting down 3, 2, 1, go!",
        "数到3就开始:1、2、3",
        "This sales for 2.5% off, only $12.5.",
        "5G网络是4G网络的升级版,2G网络是3G网络的前身",
        "苹果于2030/1/2发布新 iPhone 2X 系列手机,最低售价仅 ¥12999",
        "这酒...里...有毒...",
        # Unusual cases (expected output in the trailing comment)
        "只有,,,才是最好的",
        "babala2是什么?",  # babala二是什么?
        "用beta1测试",  # 用beta一测试
        "have you ever been to beta2?",  # have you ever been to beta two?
        "such as XTTS, CosyVoice2, Fish-Speech, and F5-TTS",  # such as xtts,cosyvoice two,fish-speech,and f five-tts
        "where's the money?",  # where is the money?
        "who's there?",  # who is there?
        "which's the best?",  # which is the best?
        "how's it going?",  # how is it going?
        "今天是个好日子 it's a good day",  # 今天是个好日子 it is a good day
        # Person names
        "约瑟夫·高登-莱维特(Joseph Gordon-Levitt is an American actor)",
        "蒂莫西·唐纳德·库克(英文名:Timothy Donald Cook),通称蒂姆·库克(Tim Cook),美国商业经理、工业工程师和工业开发商,现任苹果公司首席执行官。",
        # Long sentences
        "《盗梦空间》是由美国华纳兄弟影片公司出品的电影,由克里斯托弗·诺兰执导并编剧,莱昂纳多·迪卡普里奥、玛丽昂·歌迪亚、约瑟夫·高登-莱维特、艾利奥特·佩吉、汤姆·哈迪等联袂主演,2010年7月16日在美国上映,2010年9月1日在中国内地上映,2020年8月28日在中国内地重映。影片剧情游走于梦境与现实之间,被定义为“发生在意识结构内的当代动作科幻片”,讲述了由莱昂纳多·迪卡普里奥扮演的造梦师,带领特工团队进入他人梦境,从他人的潜意识中盗取机密,并重塑他人梦境的故事。",
        "清晨拉开窗帘,阳光洒在窗台的Bloomixy花艺礼盒上——薰衣草香薰蜡烛唤醒嗅觉,永生花束折射出晨露般光泽。设计师将“自然绽放美学”融入每个细节:手工陶瓷花瓶可作首饰收纳,香薰精油含依兰依兰舒缓配方。限量款附赠《365天插花灵感手册》,让每个平凡日子都有花开仪式感。\n宴会厅灯光暗下的刹那,Glimmeria星月系列耳坠开始发光——瑞士冷珐琅工艺让蓝宝石如银河流动,钛合金骨架仅3.2g无负重感。设计师秘密:内置微型重力感应器,随步伐产生0.01mm振幅,打造“行走的星光”。七夕限定礼盒含星座定制铭牌,让爱意如星辰永恒闪耀。",
        "电影1:“黑暗骑士”(演员:克里斯蒂安·贝尔、希斯·莱杰;导演:克里斯托弗·诺兰);电影2:“盗梦空间”(演员:莱昂纳多·迪卡普里奥;导演:克里斯托弗·诺兰);电影3:“钢琴家”(演员:艾德里安·布洛迪;导演:罗曼·波兰斯基);电影4:“泰坦尼克号”(演员:莱昂纳多·迪卡普里奥;导演:詹姆斯·卡梅隆);电影5:“阿凡达”(演员:萨姆·沃辛顿;导演:詹姆斯·卡梅隆);电影6:“南方公园:大电影”(演员:马特·斯通、托马斯·艾恩格瑞;导演:特雷·帕克)",
    ]
    # Build the tokenizer under test.
    tokenizer = TextTokenizer(
        vocab_file="checkpoints/bpe.model",
        normalizer=text_normalizer,
    )

    codes = tokenizer.batch_encode(
        cases,
        out_type=int,
    )

    print(f"vocab_size: {tokenizer.vocab_size}")
    # print(f"pad_token: {tokenizer.pad_token}, pad_token_id: {tokenizer.pad_token_id}")
    print(f"bos_token: {tokenizer.bos_token}, bos_token_id: {tokenizer.bos_token_id}")
    print(f"eos_token: {tokenizer.eos_token}, eos_token_id: {tokenizer.eos_token_id}")
    print(f"unk_token: {tokenizer.unk_token}, unk_token_id: {tokenizer.unk_token_id}")
    # Pieces with ids 8474..10200 are checked against the pinyin pattern.
    for token_id in range(8474, 10201):
        pinyin = tokenizer.convert_ids_to_tokens(token_id)
        if re.match(TextNormalizer.PINYIN_TONE_PATTERN, pinyin, re.IGNORECASE) is None:
            print(f"{pinyin} should be matched")
    for badcase in [
        "beta1", "better1", "voice2", "bala2", "babala2", "hunger2"
    ]:
        if re.match(TextNormalizer.PINYIN_TONE_PATTERN, badcase, re.IGNORECASE) is not None:
            print(f"{badcase} should not be matched!")
    # None of the split tokens should map to the unknown id.
    for t in {*TextTokenizer.punctuation_marks_tokens, ",", "▁,", "-", "▁..."}:
        tokens = tokenizer.convert_tokens_to_ids(t)
        if tokenizer.unk_token_id in tokens:
            print(f"Warning: {t} is unknown token")
        print(f"`{t}`", "->", tokens, "->", tokenizer.convert_ids_to_tokens(tokens))
    for ch in set(tokenizer.normalizer.zh_char_rep_map.values()):
        # Show how the tokenizer encodes each character produced by normalization.
        print(f"`{ch}`", "->", tokenizer.sp_model.Encode(ch, out_type=str))
        print(f"` {ch}`", "->", tokenizer.sp_model.Encode(f" {ch}", out_type=str))
    max_tokens_per_sentence = 120
    for case, case_codes in zip(cases, codes):
        print(f"原始文本: {case}")
        print(f"Normalized: {text_normalizer.normalize(case)}")
        tokens = tokenizer.tokenize(case)
        print("Tokenzied: ", ", ".join([f"`{t}`" for t in tokens]))
        sentences = tokenizer.split_sentences(tokens, max_tokens_per_sentence=max_tokens_per_sentence)
        print("Splitted sentences count:", len(sentences))
        if len(sentences) > 1:
            for j, sentence in enumerate(sentences):
                print(f"  {j}, count:", len(sentence), ", tokens:", "".join(sentence))
                if len(sentence) > max_tokens_per_sentence:
                    print(f"Warning: sentence {j} is too long, length: {len(sentence)}")
        # print(f"Token IDs (first 10): {case_codes[:10]}")
        # The codes are integer ids, so compare against the unknown *id*;
        # comparing against the "<unk>" string could never be true.
        if tokenizer.unk_token_id in case_codes:
            print(f"Warning: `{case}` contains UNKNOWN token")
        print(f"Decoded: {tokenizer.decode(case_codes, do_lower_case=True)}")
        print("-" * 50)