tokensandcharms commited on
Commit
6aa8f74
·
verified ·
1 Parent(s): 190b210

Create tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +869 -0
tokenizer.py ADDED
@@ -0,0 +1,869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import textwrap
4
+ from functools import cached_property
5
+
6
+ import pypinyin
7
+ import torch
8
+ from hangul_romanize import Transliter
9
+ from hangul_romanize.rule import academic
10
+ from num2words import num2words
11
+ from spacy.lang.ar import Arabic
12
+ from spacy.lang.en import English
13
+ from spacy.lang.es import Spanish
14
+ from spacy.lang.ja import Japanese
15
+ from spacy.lang.zh import Chinese
16
+ from tokenizers import Tokenizer
17
+
18
+ from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
19
+
20
+
21
def get_spacy_lang(lang):
    """Return a blank spaCy pipeline suited to ``lang``.

    Only zh/ja/ar/es get dedicated pipelines; for every other language the
    English pipeline is a good-enough default for sentence splitting.
    """
    pipelines = {"zh": Chinese, "ja": Japanese, "ar": Arabic, "es": Spanish}
    return pipelines.get(lang, English)()
33
+
34
+
35
def split_sentence(text, lang, text_split_length=250):
    """Split ``text`` into chunks of at most roughly ``text_split_length`` characters.

    Sentences are detected with spaCy's sentencizer and packed greedily into
    chunks while they fit; a single sentence longer than the limit is
    hard-wrapped with ``textwrap``. Inputs shorter than the limit (or a
    ``None`` limit) are returned unsplit as a one-element list.
    """
    text_splits = []
    if text_split_length is not None and len(text) >= text_split_length:
        # Seed with an empty chunk so the packing loop can always extend
        # text_splits[-1]; the unused seed is removed at the end.
        text_splits.append("")
        nlp = get_spacy_lang(lang)
        nlp.add_pipe("sentencizer")
        doc = nlp(text)
        for sentence in doc.sents:
            if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
                # if the last sentence + the current sentence is less than the text_split_length
                # then add the current sentence to the last sentence
                text_splits[-1] += " " + str(sentence)
                text_splits[-1] = text_splits[-1].lstrip()
            elif len(str(sentence)) > text_split_length:
                # if the current sentence is greater than the text_split_length
                for line in textwrap.wrap(
                    str(sentence),
                    width=text_split_length,
                    drop_whitespace=True,
                    break_on_hyphens=False,
                    tabsize=1,
                ):
                    text_splits.append(str(line))
            else:
                text_splits.append(str(sentence))

        if len(text_splits) > 1:
            # Drop the empty seed chunk if nothing was merged into it.
            if text_splits[0] == "":
                del text_splits[0]
    else:
        text_splits = [text.lstrip()]

    return text_splits
69
+
70
+
71
+ _whitespace_re = re.compile(r"\s+")
72
+
73
+ # List of (regular expression, replacement) pairs for abbreviations:
74
+ _abbreviations = {
75
+ "en": [
76
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
77
+ for x in [
78
+ ("mrs", "misess"),
79
+ ("mr", "mister"),
80
+ ("dr", "doctor"),
81
+ ("st", "saint"),
82
+ ("co", "company"),
83
+ ("jr", "junior"),
84
+ ("maj", "major"),
85
+ ("gen", "general"),
86
+ ("drs", "doctors"),
87
+ ("rev", "reverend"),
88
+ ("lt", "lieutenant"),
89
+ ("hon", "honorable"),
90
+ ("sgt", "sergeant"),
91
+ ("capt", "captain"),
92
+ ("esq", "esquire"),
93
+ ("ltd", "limited"),
94
+ ("col", "colonel"),
95
+ ("ft", "fort"),
96
+ ]
97
+ ],
98
+ "es": [
99
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
100
+ for x in [
101
+ ("sra", "señora"),
102
+ ("sr", "señor"),
103
+ ("dr", "doctor"),
104
+ ("dra", "doctora"),
105
+ ("st", "santo"),
106
+ ("co", "compañía"),
107
+ ("jr", "junior"),
108
+ ("ltd", "limitada"),
109
+ ]
110
+ ],
111
+ "fr": [
112
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
113
+ for x in [
114
+ ("mme", "madame"),
115
+ ("mr", "monsieur"),
116
+ ("dr", "docteur"),
117
+ ("st", "saint"),
118
+ ("co", "compagnie"),
119
+ ("jr", "junior"),
120
+ ("ltd", "limitée"),
121
+ ]
122
+ ],
123
+ "de": [
124
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
125
+ for x in [
126
+ ("fr", "frau"),
127
+ ("dr", "doktor"),
128
+ ("st", "sankt"),
129
+ ("co", "firma"),
130
+ ("jr", "junior"),
131
+ ]
132
+ ],
133
+ "pt": [
134
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
135
+ for x in [
136
+ ("sra", "senhora"),
137
+ ("sr", "senhor"),
138
+ ("dr", "doutor"),
139
+ ("dra", "doutora"),
140
+ ("st", "santo"),
141
+ ("co", "companhia"),
142
+ ("jr", "júnior"),
143
+ ("ltd", "limitada"),
144
+ ]
145
+ ],
146
+ "it": [
147
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
148
+ for x in [
149
+ # ("sig.ra", "signora"),
150
+ ("sig", "signore"),
151
+ ("dr", "dottore"),
152
+ ("st", "santo"),
153
+ ("co", "compagnia"),
154
+ ("jr", "junior"),
155
+ ("ltd", "limitata"),
156
+ ]
157
+ ],
158
+ "pl": [
159
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
160
+ for x in [
161
+ ("p", "pani"),
162
+ ("m", "pan"),
163
+ ("dr", "doktor"),
164
+ ("sw", "święty"),
165
+ ("jr", "junior"),
166
+ ]
167
+ ],
168
+ "ar": [
169
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
170
+ for x in [
171
+ # There are not many common abbreviations in Arabic as in English.
172
+ ]
173
+ ],
174
+ "zh": [
175
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
176
+ for x in [
177
+ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
178
+ ]
179
+ ],
180
+ "cs": [
181
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
182
+ for x in [
183
+ ("dr", "doktor"), # doctor
184
+ ("ing", "inženýr"), # engineer
185
+ ("p", "pan"), # Could also map to pani for woman but no easy way to do it
186
+ # Other abbreviations would be specialized and not as common.
187
+ ]
188
+ ],
189
+ "ru": [
190
+ (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
191
+ for x in [
192
+ ("г-жа", "госпожа"), # Mrs.
193
+ ("г-н", "господин"), # Mr.
194
+ ("д-р", "доктор"), # doctor
195
+ # Other abbreviations are less common or specialized.
196
+ ]
197
+ ],
198
+ "nl": [
199
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
200
+ for x in [
201
+ ("dhr", "de heer"), # Mr.
202
+ ("mevr", "mevrouw"), # Mrs.
203
+ ("dr", "dokter"), # doctor
204
+ ("jhr", "jonkheer"), # young lord or nobleman
205
+ # Dutch uses more abbreviations, but these are the most common ones.
206
+ ]
207
+ ],
208
+ "tr": [
209
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
210
+ for x in [
211
+ ("b", "bay"), # Mr.
212
+ ("byk", "büyük"), # büyük
213
+ ("dr", "doktor"), # doctor
214
+ # Add other Turkish abbreviations here if needed.
215
+ ]
216
+ ],
217
+ "hu": [
218
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
219
+ for x in [
220
+ ("dr", "doktor"), # doctor
221
+ ("b", "bácsi"), # Mr.
222
+ ("nőv", "nővér"), # nurse
223
+ # Add other Hungarian abbreviations here if needed.
224
+ ]
225
+ ],
226
+ "ko": [
227
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
228
+ for x in [
229
+ # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
230
+ ]
231
+ ],
232
+ "ja": [
233
+ (re.compile("\\b%s\\b" % x[0]), x[1])
234
+ for x in [
235
+ ("氏", "さん"), # Mr.
236
+ ("夫人", "おんなのひと"), # Mrs.
237
+ ("博士", "はかせ"), # Doctor or PhD
238
+ ("株", "株式会社"), # Corporation
239
+ ("有", "有限会社"), # Limited company
240
+ ("大学", "だいがく"), # University
241
+ ("先生", "せんせい"), # Teacher/Professor/Master
242
+ ("君", "くん") # Used at the end of boys' names to express familiarity or affection.
243
+ ]
244
+ ],
245
+ }
246
+
247
+
248
def expand_abbreviations_multilingual(text, lang="en"):
    """Expand the known abbreviations of ``lang`` in ``text`` (e.g. "Mr." -> "mister")."""
    for pattern, expansion in _abbreviations[lang]:
        text = pattern.sub(expansion, text)
    return text
252
+
253
+
254
+ _symbols_multilingual = {
255
+ "en": [
256
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
257
+ for x in [
258
+ ("&", " and "),
259
+ ("@", " at "),
260
+ ("%", " percent "),
261
+ ("#", " hash "),
262
+ ("$", " dollar "),
263
+ ("£", " pound "),
264
+ ("°", " degree "),
265
+ ]
266
+ ],
267
+ "es": [
268
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
269
+ for x in [
270
+ ("&", " y "),
271
+ ("@", " arroba "),
272
+ ("%", " por ciento "),
273
+ ("#", " numeral "),
274
+ ("$", " dolar "),
275
+ ("£", " libra "),
276
+ ("°", " grados "),
277
+ ]
278
+ ],
279
+ "fr": [
280
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
281
+ for x in [
282
+ ("&", " et "),
283
+ ("@", " arobase "),
284
+ ("%", " pour cent "),
285
+ ("#", " dièse "),
286
+ ("$", " dollar "),
287
+ ("£", " livre "),
288
+ ("°", " degrés "),
289
+ ]
290
+ ],
291
+ "de": [
292
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
293
+ for x in [
294
+ ("&", " und "),
295
+ ("@", " at "),
296
+ ("%", " prozent "),
297
+ ("#", " raute "),
298
+ ("$", " dollar "),
299
+ ("£", " pfund "),
300
+ ("°", " grad "),
301
+ ]
302
+ ],
303
+ "pt": [
304
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
305
+ for x in [
306
+ ("&", " e "),
307
+ ("@", " arroba "),
308
+ ("%", " por cento "),
309
+ ("#", " cardinal "),
310
+ ("$", " dólar "),
311
+ ("£", " libra "),
312
+ ("°", " graus "),
313
+ ]
314
+ ],
315
+ "it": [
316
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
317
+ for x in [
318
+ ("&", " e "),
319
+ ("@", " chiocciola "),
320
+ ("%", " per cento "),
321
+ ("#", " cancelletto "),
322
+ ("$", " dollaro "),
323
+ ("£", " sterlina "),
324
+ ("°", " gradi "),
325
+ ]
326
+ ],
327
+ "pl": [
328
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
329
+ for x in [
330
+ ("&", " i "),
331
+ ("@", " małpa "),
332
+ ("%", " procent "),
333
+ ("#", " krzyżyk "),
334
+ ("$", " dolar "),
335
+ ("£", " funt "),
336
+ ("°", " stopnie "),
337
+ ]
338
+ ],
339
+ "ar": [
340
+ # Arabic
341
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
342
+ for x in [
343
+ ("&", " و "),
344
+ ("@", " على "),
345
+ ("%", " في المئة "),
346
+ ("#", " رقم "),
347
+ ("$", " دولار "),
348
+ ("£", " جنيه "),
349
+ ("°", " درجة "),
350
+ ]
351
+ ],
352
+ "zh": [
353
+ # Chinese
354
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
355
+ for x in [
356
+ ("&", " 和 "),
357
+ ("@", " 在 "),
358
+ ("%", " 百分之 "),
359
+ ("#", " 号 "),
360
+ ("$", " 美元 "),
361
+ ("£", " 英镑 "),
362
+ ("°", " 度 "),
363
+ ]
364
+ ],
365
+ "cs": [
366
+ # Czech
367
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
368
+ for x in [
369
+ ("&", " a "),
370
+ ("@", " na "),
371
+ ("%", " procento "),
372
+ ("#", " křížek "),
373
+ ("$", " dolar "),
374
+ ("£", " libra "),
375
+ ("°", " stupně "),
376
+ ]
377
+ ],
378
+ "ru": [
379
+ # Russian
380
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
381
+ for x in [
382
+ ("&", " и "),
383
+ ("@", " собака "),
384
+ ("%", " процентов "),
385
+ ("#", " номер "),
386
+ ("$", " доллар "),
387
+ ("£", " фунт "),
388
+ ("°", " градус "),
389
+ ]
390
+ ],
391
+ "nl": [
392
+ # Dutch
393
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
394
+ for x in [
395
+ ("&", " en "),
396
+ ("@", " bij "),
397
+ ("%", " procent "),
398
+ ("#", " hekje "),
399
+ ("$", " dollar "),
400
+ ("£", " pond "),
401
+ ("°", " graden "),
402
+ ]
403
+ ],
404
+ "tr": [
405
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
406
+ for x in [
407
+ ("&", " ve "),
408
+ ("@", " at "),
409
+ ("%", " yüzde "),
410
+ ("#", " diyez "),
411
+ ("$", " dolar "),
412
+ ("£", " sterlin "),
413
+ ("°", " derece "),
414
+ ]
415
+ ],
416
+ "hu": [
417
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
418
+ for x in [
419
+ ("&", " és "),
420
+ ("@", " kukac "),
421
+ ("%", " százalék "),
422
+ ("#", " kettőskereszt "),
423
+ ("$", " dollár "),
424
+ ("£", " font "),
425
+ ("°", " fok "),
426
+ ]
427
+ ],
428
+ "ko": [
429
+ # Korean
430
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
431
+ for x in [
432
+ ("&", " 그리고 "),
433
+ ("@", " 에 "),
434
+ ("%", " 퍼센트 "),
435
+ ("#", " 번호 "),
436
+ ("$", " 달러 "),
437
+ ("£", " 파운드 "),
438
+ ("°", " 도 "),
439
+ ]
440
+ ],
441
+ "ja": [
442
+ (re.compile(r"%s" % re.escape(x[0])), x[1])
443
+ for x in [
444
+ ("&", " と "),
445
+ ("@", " アットマーク "),
446
+ ("%", " パーセント "),
447
+ ("#", " ナンバー "),
448
+ ("$", " ドル "),
449
+ ("£", " ポンド "),
450
+ ("°", " 度"),
451
+ ]
452
+ ],
453
+ }
454
+
455
+
456
def expand_symbols_multilingual(text, lang="en"):
    """Replace symbols (&, @, %, #, $, £, °) in ``text`` with their spoken form.

    The replacements are space-padded, so runs of spaces are collapsed
    afterwards and the result is stripped.
    """
    for pattern, spoken in _symbols_multilingual[lang]:
        text = pattern.sub(spoken, text)
    # BUG FIX: this was `text.replace(" ", " ")` (single space -> single
    # space), a no-op that left double spaces behind; collapse them properly.
    text = re.sub(r" +", " ", text)
    return text.strip()
461
+
462
+
463
+ _ordinal_re = {
464
+ "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
465
+ "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
466
+ "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
467
+ "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
468
+ "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
469
+ "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
470
+ "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
471
+ "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
472
+ "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals.
473
+ "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
474
+ "nl": re.compile(r"([0-9]+)(de|ste|e)"),
475
+ "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
476
+ "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
477
+ "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
478
+ "ja": re.compile(r"([0-9]+)(番|回|つ|目|等|位)")
479
+ }
480
+ _number_re = re.compile(r"[0-9]+")
481
+ _currency_re = {
482
+ "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
483
+ "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
484
+ "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
485
+ }
486
+
487
+ _comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
488
+ _dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b")
489
+ _decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
490
+
491
+
492
+ def _remove_commas(m):
493
+ text = m.group(0)
494
+ if "," in text:
495
+ text = text.replace(",", "")
496
+ return text
497
+
498
+
499
+ def _remove_dots(m):
500
+ text = m.group(0)
501
+ if "." in text:
502
+ text = text.replace(".", "")
503
+ return text
504
+
505
+
506
def _expand_decimal_point(m, lang="en"):
    """Spell out a matched decimal number ("12.5" -> "twelve point five")."""
    normalized = m.group(1).replace(",", ".")
    # num2words uses the legacy "cz" code for Czech.
    target_lang = "cz" if lang == "cs" else lang
    return num2words(float(normalized), lang=target_lang)
509
+
510
+
511
def _expand_currency(m, lang="en", currency="USD"):
    """Spell out a matched currency amount (e.g. "$20" -> "twenty dollars").

    For whole amounts the trailing "zero cents" clause that num2words emits is
    trimmed off, when the language's integer/cents separator is known.
    """
    amount = float(re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))
    # num2words uses the legacy "cz" code for Czech.
    full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz")

    # Separator num2words places between the integer part and the cents part.
    and_equivalents = {
        "en": ", ",
        "es": " con ",
        "fr": " et ",
        "de": " und ",
        "pt": " e ",
        "it": " e ",
        "pl": ", ",
        "cs": ", ",
        "ru": ", ",
        "nl": ", ",
        "ar": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
    }

    # BUG FIX: previously `and_equivalents[lang]` raised KeyError for languages
    # missing from the table (e.g. "ja"), which the caller's broad except then
    # turned into "no currency expansion at all". Unknown languages now simply
    # keep the full phrase.
    separator = and_equivalents.get(lang)
    if amount.is_integer() and separator is not None:
        last_and = full_amount.rfind(separator)
        if last_and != -1:
            full_amount = full_amount[:last_and]

    return full_amount
538
+
539
+
540
def _expand_ordinal(m, lang="en"):
    """Spell out a matched ordinal ("1st" -> "first")."""
    target_lang = "cz" if lang == "cs" else lang  # num2words legacy code for Czech
    return num2words(int(m.group(1)), ordinal=True, lang=target_lang)
542
+
543
+
544
def _expand_number(m, lang="en"):
    """Spell out a matched cardinal number ("50" -> "fifty")."""
    target_lang = "cz" if lang == "cs" else lang  # num2words legacy code for Czech
    return num2words(int(m.group(0)), lang=target_lang)
546
+
547
+
548
def expand_numbers_multilingual(text, lang="en"):
    """Spell out digits, ordinals and currency amounts in ``text`` for ``lang``."""
    if lang == "zh":
        text = zh_num2words()(text)
    else:
        # Strip thousands separators first so plain-number expansion sees "1234".
        if lang in ["en", "ru"]:
            text = re.sub(_comma_number_re, _remove_commas, text)
        else:
            text = re.sub(_dot_number_re, _remove_dots, text)
        try:
            text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
            text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
            text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
        except Exception:
            # Best-effort: num2words lacks currency support for some languages;
            # leave the raw amount in place rather than failing the whole text.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pass
        if lang != "tr":
            # Decimal expansion is skipped for Turkish (see the module self-tests).
            text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
        text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
        text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text
567
+
568
+
569
def lowercase(text):
    """Return ``text`` with all cased characters lowercased."""
    lowered = text.lower()
    return lowered
571
+
572
+
573
def collapse_whitespace(text):
    """Replace every run of whitespace with a single space."""
    # Same pattern as the module-level _whitespace_re, inlined here.
    return re.sub(r"\s+", " ", text)
575
+
576
+
577
def multilingual_cleaners(text, lang):
    """Run the full cleaning pipeline used before tokenization.

    Strips double quotes, lowercases, then expands numbers, abbreviations and
    symbols for ``lang``, and finally collapses whitespace.
    """
    text = text.replace('"', "")
    if lang == "tr":
        # Normalize Turkish capitals explicitly before the generic lowercasing.
        for upper, lower in (("İ", "i"), ("Ö", "ö"), ("Ü", "ü")):
            text = text.replace(upper, lower)
    text = lowercase(text)
    text = expand_numbers_multilingual(text, lang)
    text = expand_abbreviations_multilingual(text, lang)
    text = expand_symbols_multilingual(text, lang=lang)
    return collapse_whitespace(text)
589
+
590
+
591
def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    return collapse_whitespace(lowercase(text))
596
+
597
+
598
def chinese_transliterate(text):
    """Convert Chinese text to pinyin with numeric tone marks (TONE3 style)."""
    per_char = pypinyin.pinyin(
        text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True
    )
    # pinyin() returns one candidate list per character; keep the first reading.
    return "".join(candidates[0] for candidates in per_char)
602
+
603
+
604
def japanese_cleaners(text, katsu):
    """Romanize Japanese ``text`` with the supplied cutlet instance, then lowercase."""
    romanized = katsu.romaji(text)
    # `lowercase` helper inlined: it is a plain str.lower().
    return romanized.lower()
608
+
609
+
610
def korean_transliterate(text):
    """Romanize Hangul text using the academic transliteration rules."""
    transliterator = Transliter(academic)
    return transliterator.translit(text)
613
+
614
+
615
# Default BPE vocab file, resolved relative to this module's location.
DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json")
616
+
617
+
618
class VoiceBpeTokenizer:
    """BPE tokenizer wrapper with per-language text cleaning for XTTS."""

    def __init__(self, vocab_file=None):
        # Tokenizer stays None until a vocab file is supplied.
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        # Per-language input-length limits (in characters); inputs beyond the
        # limit only trigger a warning, not an error (see check_input_length).
        self.char_limits = {
            "en": 250,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
        }

    @cached_property
    def katsu(self):
        # Lazily built Japanese romanizer; the cutlet import is deferred
        # because it is only needed for Japanese input.
        import cutlet

        return cutlet.Cutlet()

    def check_input_length(self, txt, lang):
        """Print a warning when ``txt`` exceeds the character limit for ``lang``."""
        lang = lang.split("-")[0]  # remove the region
        limit = self.char_limits.get(lang, 250)
        if len(txt) > limit:
            print(
                f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
            )

    def preprocess_text(self, txt, lang):
        """Apply language-specific cleaning/transliteration before encoding.

        Raises NotImplementedError for languages outside the supported set.
        """
        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
            txt = multilingual_cleaners(txt, lang)
            if lang == "zh":
                txt = chinese_transliterate(txt)
            if lang == "ko":
                txt = korean_transliterate(txt)
        elif lang == "ja":
            txt = japanese_cleaners(txt, self.katsu)
        elif lang == "hi":
            # @manmay will implement this
            txt = basic_cleaners(txt)
        else:
            raise NotImplementedError(f"Language '{lang}' is not supported.")
        return txt

    def encode(self, txt, lang):
        """Clean ``txt`` and return its BPE token ids.

        The text is prefixed with a "[lang]" tag and spaces are mapped to the
        literal "[SPACE]" token before encoding.
        """
        lang = lang.split("-")[0]  # remove the region
        self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        lang = "zh-cn" if lang == "zh" else lang
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def decode(self, seq):
        """Invert ``encode``: token ids -> text, restoring spaces and dropping
        the [STOP]/[UNK] markers."""
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
        txt = txt.replace("[SPACE]", " ")
        txt = txt.replace("[STOP]", "")
        txt = txt.replace("[UNK]", "")
        return txt

    def __len__(self):
        # Vocabulary size of the underlying tokenizer.
        return self.tokenizer.get_vocab_size()

    def get_number_tokens(self):
        # Highest token id + 1.
        return max(self.tokenizer.get_vocab().values()) + 1
695
+
696
+
697
def test_expand_numbers_multilingual():
    """Self-test: number/ordinal/currency expansion across all supported languages."""
    # (input, expected output, language)
    test_cases = [
        # English
        ("In 12.5 seconds.", "In twelve point five seconds.", "en"),
        ("There were 50 soldiers.", "There were fifty soldiers.", "en"),
        ("This is a 1st test", "This is a first test", "en"),
        ("That will be $20 sir.", "That will be twenty dollars sir.", "en"),
        ("That will be 20€ sir.", "That will be twenty euro sir.", "en"),
        ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"),
        ("That's 100,000.5.", "That's one hundred thousand point five.", "en"),
        # French
        ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"),
        ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"),
        ("Ceci est un 1er test", "Ceci est un premier test", "fr"),
        ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"),
        ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"),
        ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"),
        ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"),
        # German
        ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"),
        ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"),
        ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"),  # Issue with gender
        ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"),
        ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"),
        ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"),
        # Spanish
        ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"),
        ("Había 50 soldados.", "Había cincuenta soldados.", "es"),
        ("Este es un 1er test", "Este es un primero test", "es"),
        ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"),
        ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"),
        ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"),
        # Italian
        ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"),
        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"),
        ("Questo è un 1° test", "Questo è un primo test", "it"),
        ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"),
        ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"),
        ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"),
        # Portuguese
        ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"),
        ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"),
        ("Este é um 1º teste", "Este é um primeiro teste", "pt"),
        ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"),
        ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"),
        (
            "Isso custará 20,15€ senhor.",
            "Isso custará vinte euros e quinze cêntimos senhor.",
            "pt",
        ),  # "cêntimos" should be "centavos" num2words issue
        # Polish
        ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"),
        ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"),
        ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"),
        ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", "pl"),
        # Arabic
        ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"),
        ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"),
        # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words
        # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
        # Czech
        ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"),
        ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"),
        ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"),
        ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"),
        # Russian
        ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"),
        ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"),
        ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"),
        ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"),
        # Dutch
        ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"),
        ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"),
        ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
        ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
        # Chinese (Simplified)
        ("在12.5秒内", "在十二点五秒内", "zh"),
        ("有50名士兵", "有五十名士兵", "zh"),
        # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
        # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
        # Turkish
        # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
        ("50 asker vardı.", "elli asker vardı.", "tr"),
        ("Bu 1. test", "Bu birinci test", "tr"),
        # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
        # Hungarian
        ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"),
        ("50 katona volt.", "ötven katona volt.", "hu"),
        ("Ez az 1. teszt", "Ez az első teszt", "hu"),
        # Korean
        ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
        ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
        ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
    ]
    for a, b, lang in test_cases:
        out = expand_numbers_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
794
+
795
+
796
def test_abbreviations_multilingual():
    """Self-test: abbreviation expansion across languages."""
    # (input, expected output, language)
    test_cases = [
        # English
        ("Hello Mr. Smith.", "Hello mister Smith.", "en"),
        ("Dr. Jones is here.", "doctor Jones is here.", "en"),
        # Spanish
        ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"),
        ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"),
        # French
        ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"),
        ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"),
        # German
        ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"),
        # Portuguese
        ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"),
        ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"),
        # Italian
        ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"),
        # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
        # Polish
        ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"),
        ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"),
        # Czech
        ("P. Novák", "pan Novák", "cs"),
        ("Dr. Vojtěch", "doktor Vojtěch", "cs"),
        # Dutch
        ("Dhr. Jansen", "de heer Jansen", "nl"),
        ("Mevr. de Vries", "mevrouw de Vries", "nl"),
        # Russian
        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"),
        ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"),
        # Turkish
        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"),
        ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"),
        # Hungarian
        ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"),
    ]

    for a, b, lang in test_cases:
        out = expand_abbreviations_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
837
+
838
+
839
def test_symbols_multilingual():
    """Self-test: spoken-form symbol expansion across languages."""
    # (input, expected output, language)
    test_cases = [
        ("I have 14% battery", "I have 14 percent battery", "en"),
        ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
        ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
        ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
        ("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"),
        ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"),
        ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
        ("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
        ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
        ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
        ("Я буду @ дома", "Я буду собака дома", "ru"),
        ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
        ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
        ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
        ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
        ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
    ]

    for a, b, lang in test_cases:
        out = expand_symbols_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"
864
+
865
+
866
+ if __name__ == "__main__":
867
+ test_expand_numbers_multilingual()
868
+ test_abbreviations_multilingual()
869
+ test_symbols_multilingual()