stephenhoang committed on
Commit
734a3e5
·
verified ·
1 Parent(s): eefa4f2

Upload src\tts_patches\tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src//tts_patches//tokenizer.py +230 -0
src//tts_patches//tokenizer.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Union
2
+
3
+ from TTS.tts.utils.text import cleaners
4
+ from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
5
+ from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
6
+ from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
7
+ from TTS.utils.generic_utils import get_import_path, import_class
8
+
9
+
10
class TTSTokenizer:
    """🐸TTS tokenizer to convert input characters to token IDs and back.

    Tokens that are missing from the vocabulary are dropped from the output, but they are
    recorded in `self.not_found_characters` so they can be reported later.

    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.

        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.

        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.

        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.

        add_blank (bool):
            Whether `text_to_ids` intersperses the blank token ID between tokens. Defaults to False.

        use_eos_bos (bool):
            Whether `text_to_ids` wraps the sequence with the BOS and EOS token IDs. Defaults to False.

    Example:

        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    """

    def __init__(
        self,
        use_phonemes: bool = False,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
        phonemizer: Union["Phonemizer", Dict] = None,
        add_blank: bool = False,
        use_eos_bos: bool = False,
    ):
        self.text_cleaner = text_cleaner
        self.use_phonemes = use_phonemes
        self.add_blank = add_blank
        self.use_eos_bos = use_eos_bos
        # Goes through the property setter below, which also derives `pad_id` / `blank_id`.
        self.characters = characters
        # Tokens seen by `encode` that had no entry in the vocabulary (first-seen order, no duplicates).
        self.not_found_characters = []
        self.phonemizer = phonemizer

    @property
    def characters(self):
        """The characters object used for token <-> ID lookups."""
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Store the characters object and refresh the cached `pad_id` and `blank_id`.

        Each ID is looked up through `char_to_id` when the matching character is defined
        (truthy) on the characters object, and is None otherwise.
        """
        self._characters = new_characters
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encode a string of text as a sequence of token IDs.

        If the characters object exposes a callable `tokenize` method, it is used to split the
        text into tokens, which may span several characters (e.g. 'maː³' -> ['m', 'aː', '³']).
        Otherwise the text is split into individual characters.

        Tokens for which `char_to_id` raises `KeyError` are discarded. Each such token is added
        once to `self.not_found_characters` and a warning is printed the first time it is seen.

        Args:
            text (str): The (already cleaned / phonemized) text to encode.

        Returns:
            List[int]: IDs of the tokens that were found in the vocabulary, in input order.
        """
        # Prefer the characters class' own tokenizer when it provides one.
        tokenize = getattr(self.characters, "tokenize", None)
        tokens = tokenize(text) if callable(tokenize) else list(text)

        token_ids = []
        for token in tokens:
            try:
                token_ids.append(self.characters.char_to_id(token))
            except KeyError:
                # Discard the unknown token, but remember it and warn only once per token.
                if token not in self.not_found_characters:
                    self.not_found_characters.append(token)
                    print(f" [!] Token '{token}' not found in the vocabulary. Discarding it.")
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Decode a sequence of token IDs into a string by concatenating `id_to_char` results."""
        return "".join(self.characters.id_to_char(token_id) for token_id in token_ids)

    def text_to_ids(self, text: str, language: str = None) -> List[int]:
        """Converts a string of text to a sequence of token IDs.

        Args:
            text(str):
                The text to convert to token IDs.

            language(str):
                The language code of the text; forwarded to the phonemizer. Defaults to None.

        TODO:
            - Add support for language-specific processing.

        Steps, in the order they are applied:
            1. Text normalization (if a text cleaner is set)
            2. Phonemization (if use_phonemes is True)
            3. Text to token IDs
            4. Add the blank token ID between tokens (if add_blank is True)
            5. Add BOS and EOS token IDs (if use_eos_bos is True)
        """
        # TODO: text cleaner should pick the right routine based on the language
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        if self.use_phonemes:
            text = self.phonemizer.phonemize(text, separator="", language=language)
        token_ids = self.encode(text)
        if self.add_blank:
            token_ids = self.intersperse_blank_char(token_ids, True)
        if self.use_eos_bos:
            token_ids = self.pad_with_bos_eos(token_ids)
        return token_ids

    def ids_to_text(self, id_sequence: List[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
        return self.decode(id_sequence)

    def pad_with_bos_eos(self, char_sequence: List[int]):
        """Return a new list: the BOS ID, then the given token IDs, then the EOS ID."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[int], use_blank_char: bool = False):
        """Intersperses a filler token ID between (and around) the IDs in a sequence.

        Uses the ```blank``` ID when ``use_blank_char`` is True, otherwise the ```pad``` ID.
        The result has ``2 * len(char_sequence) + 1`` items, with the original IDs at the
        odd positions.
        """
        # The sequence holds token IDs, so the pad fallback must be the pad *ID* (as cached by
        # the `characters` setter), not the pad character string.
        filler_id = self.characters.blank_id if use_blank_char else self.pad_id
        result = [filler_id] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    def print_logs(self, level: int = 0):
        """Print the tokenizer settings and any not-found tokens, indented by `level` tabs."""
        indent = "\t" * level
        print(f"{indent}| > add_blank: {self.add_blank}")
        print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
        print(f"{indent}| > use_phonemes: {self.use_phonemes}")
        if self.use_phonemes:
            print(f"{indent}| > phonemizer:")
            self.phonemizer.print_logs(level + 1)
        if self.not_found_characters:
            print(f"{indent}| > {len(self.not_found_characters)} not found characters:")
            for char in self.not_found_characters:
                print(f"{indent}| > {char}")

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Init Tokenizer object from config

        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.

        Returns:
            A ``(tokenizer, new_config)`` tuple, where ``new_config`` is the config returned by the
            characters' ``init_from_config`` with ``characters.characters_class`` (and, when a default
            phonemizer was picked, ``phonemizer``) filled in.

        Raises:
            ValueError: If the multi phonemizer is requested and a dataset has an empty language, or if
                no default phonemizer is registered for ``config.phoneme_language``.
        """
        # init cleaners
        text_cleaner = None
        # NOTE(review): a list passes this check, but `getattr` needs a string attribute name --
        # confirm whether list-valued `text_cleaner` configs are actually expected here.
        if isinstance(config.text_cleaner, (str, list)):
            text_cleaner = getattr(cleaners, config.text_cleaner)

        # init characters
        if characters is None:
            # set characters based on defined characters class
            if config.characters and config.characters.characters_class:
                CharactersClass = import_class(config.characters.characters_class)
                characters, new_config = CharactersClass.init_from_config(config)
            # set characters based on config
            else:
                if config.use_phonemes:
                    # init phoneme set
                    characters, new_config = IPAPhonemes().init_from_config(config)
                else:
                    # init character set
                    characters, new_config = Graphemes().init_from_config(config)

        else:
            characters, new_config = characters.init_from_config(config)

        # set characters class
        new_config.characters.characters_class = get_import_path(characters)

        # init phonemizer
        phonemizer = None
        if config.use_phonemes:
            if "phonemizer" in config and config.phonemizer == "multi_phonemizer":
                # One phonemizer per dataset language.
                lang_to_phonemizer_name = {}
                for dataset in config.datasets:
                    if dataset.language != "":
                        lang_to_phonemizer_name[dataset.language] = dataset.phonemizer
                    else:
                        raise ValueError("Multi phonemizer requires language to be set for each dataset.")
                phonemizer = MultiPhonemizer(lang_to_phonemizer_name)
            else:
                phonemizer_kwargs = {"language": config.phoneme_language}
                if "phonemizer" in config and config.phonemizer:
                    phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
                else:
                    # No phonemizer named in the config: fall back to the default for the language.
                    try:
                        phonemizer = get_phonemizer_by_name(
                            DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
                        )
                        new_config.phonemizer = phonemizer.name()
                    except KeyError as e:
                        raise ValueError(
                            f"""No phonemizer found for language {config.phoneme_language}.
                            You may need to install a third party library for this language."""
                        ) from e

        return (
            TTSTokenizer(
                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
            ),
            new_config,
        )