Upload src\tts_patches\tokenizer.py with huggingface_hub
Browse files- src//tts_patches//tokenizer.py +230 -0
src//tts_patches//tokenizer.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Callable, Dict, List, Union
|
| 2 |
+
|
| 3 |
+
from TTS.tts.utils.text import cleaners
|
| 4 |
+
from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
|
| 5 |
+
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
|
| 6 |
+
from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
|
| 7 |
+
from TTS.utils.generic_utils import get_import_path, import_class
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TTSTokenizer:
    """🐸TTS tokenizer to convert input characters to token IDs and back.

    Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.

    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.

        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.

        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.
            May be None at construction time and assigned later through the ``characters`` property.

        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.

        add_blank (bool):
            Whether ``text_to_ids`` intersperses the blank ID between token IDs. Defaults to False.

        use_eos_bos (bool):
            Whether ``text_to_ids`` wraps the ID sequence with the BOS and EOS IDs. Defaults to False.

    Example:

        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    """

    def __init__(
        self,
        use_phonemes=False,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
        phonemizer: Union["Phonemizer", Dict] = None,
        add_blank: bool = False,
        use_eos_bos=False,
    ):
        self.text_cleaner = text_cleaner
        self.use_phonemes = use_phonemes
        self.add_blank = add_blank
        self.use_eos_bos = use_eos_bos
        # Goes through the property setter below, which also caches pad_id / blank_id.
        self.characters = characters
        self.not_found_characters = []
        self.phonemizer = phonemizer

    @property
    def characters(self):
        """The character set object used for token <-> ID lookups (may be None)."""
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Store the character set and refresh the cached ``pad_id`` / ``blank_id``.

        Both cached IDs are None when no character set is given or when the
        character set defines no pad / blank symbol.
        """
        self._characters = new_characters
        if new_characters is None:
            # `characters=None` is the constructor default; without this guard the
            # attribute lookups below raise AttributeError on a bare TTSTokenizer().
            self.pad_id = None
            self.blank_id = None
            return
        self.pad_id = new_characters.char_to_id(new_characters.pad) if new_characters.pad else None
        self.blank_id = new_characters.char_to_id(new_characters.blank) if new_characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs.

        MODIFIED: when the characters object exposes a callable ``tokenize`` method it is
        used to split the text, so a single token may span several code points
        (e.g. 'maː³' -> ['m', 'aː', '³']). Otherwise the text is split per character.

        Tokens missing from the vocabulary are dropped from the output; each distinct
        missing token is appended to ``self.not_found_characters`` and reported once.
        """
        custom_tokenize = getattr(self.characters, "tokenize", None)
        if callable(custom_tokenize):
            # Custom tokenizer: returns a list of (possibly multi-character) tokens.
            tokens = custom_tokenize(text)
        else:
            # Legacy behaviour: one token per character.
            tokens = list(text)

        token_ids = []
        for token in tokens:
            try:
                idx = self.characters.char_to_id(token)
            except KeyError:
                # Discard the unknown token but remember it for later inspection.
                if token not in self.not_found_characters:
                    self.not_found_characters.append(token)
                    print(f" [!] Token '{token}' not found in the vocabulary. Discarding it.")
            else:
                token_ids.append(idx)
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Decodes a sequence of IDs to a string of text."""
        return "".join(self.characters.id_to_char(token_id) for token_id in token_ids)

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Converts a string of text to a sequence of token IDs.

        Args:
            text(str):
                The text to convert to token IDs.

            language(str):
                The language code of the text. Defaults to None.

        TODO:
            - Add support for language-specific processing.

        1. Text normalization
        2. Phonemization (if use_phonemes is True)
        3. Text to token IDs
        4. Add blank ID between token IDs (if add_blank is True)
        5. Add BOS and EOS IDs (if use_eos_bos is True)
        """
        # TODO: text cleaner should pick the right routine based on the language
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        if self.use_phonemes:
            text = self.phonemizer.phonemize(text, separator="", language=language)
        token_ids = self.encode(text)
        if self.add_blank:
            token_ids = self.intersperse_blank_char(token_ids, True)
        if self.use_eos_bos:
            token_ids = self.pad_with_bos_eos(token_ids)
        return token_ids

    def ids_to_text(self, id_sequence: List[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
        return self.decode(id_sequence)

    def pad_with_bos_eos(self, char_sequence: List[int]):
        """Pads a sequence with the special BOS and EOS IDs.

        Returns a new list ``[bos_id, *char_sequence, eos_id]``; the input is not modified.
        """
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[int], use_blank_char: bool = False):
        """Intersperses a filler ID between (and around) the items of a sequence.

        Uses the ```blank``` ID when ``use_blank_char`` is True, else the ```pad``` ID.
        For an input of length ``n`` the result has length ``2 * n + 1``, with the
        filler at every even index and the original items at the odd indices.
        """
        # The pad branch previously inserted the pad *character*, which produced a
        # mixed str/int list for the ID sequences this method is called with.
        filler = self.characters.blank_id if use_blank_char else self.pad_id
        result = [filler] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    def print_logs(self, level: int = 0):
        """Prints the tokenizer settings, indented by ``level`` tab characters."""
        indent = "\t" * level
        print(f"{indent}| > add_blank: {self.add_blank}")
        print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
        print(f"{indent}| > use_phonemes: {self.use_phonemes}")
        if self.use_phonemes:
            print(f"{indent}| > phonemizer:")
            self.phonemizer.print_logs(level + 1)
        if self.not_found_characters:
            print(f"{indent}| > {len(self.not_found_characters)} not found characters:")
            for char in self.not_found_characters:
                print(f"{indent}| > {char}")

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Init Tokenizer object from config

        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.

        Returns:
            A ``(tokenizer, new_config)`` tuple, where ``new_config`` is the config object
            returned by the character set's ``init_from_config``.

        Raises:
            ValueError: If the multi phonemizer is requested and a dataset has an empty language,
                or if no default phonemizer is registered for ``config.phoneme_language``.
        """
        # init cleaners
        text_cleaner = None
        # NOTE(review): a list value passes this check but getattr() needs a string
        # attribute name -- confirm whether list-valued cleaners are meant to be supported.
        if isinstance(config.text_cleaner, (str, list)):
            text_cleaner = getattr(cleaners, config.text_cleaner)

        # init characters
        if characters is None:
            if config.characters and config.characters.characters_class:
                # set characters based on defined characters class
                CharactersClass = import_class(config.characters.characters_class)
                characters, new_config = CharactersClass.init_from_config(config)
            elif config.use_phonemes:
                # no explicit class: init phoneme set
                characters, new_config = IPAPhonemes().init_from_config(config)
            else:
                # no explicit class: init character set
                characters, new_config = Graphemes().init_from_config(config)
        else:
            characters, new_config = characters.init_from_config(config)

        # set characters class
        new_config.characters.characters_class = get_import_path(characters)

        # init phonemizer
        phonemizer = None
        if config.use_phonemes:
            if "phonemizer" in config and config.phonemizer == "multi_phonemizer":
                lang_to_phonemizer_name = {}
                for dataset in config.datasets:
                    if dataset.language != "":
                        lang_to_phonemizer_name[dataset.language] = dataset.phonemizer
                    else:
                        raise ValueError("Multi phonemizer requires language to be set for each dataset.")
                phonemizer = MultiPhonemizer(lang_to_phonemizer_name)
            else:
                phonemizer_kwargs = {"language": config.phoneme_language}
                if "phonemizer" in config and config.phonemizer:
                    phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
                else:
                    # Fall back to the default phonemizer for the language and record
                    # its name in the returned config.
                    try:
                        phonemizer = get_phonemizer_by_name(
                            DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
                        )
                        new_config.phonemizer = phonemizer.name()
                    except KeyError as e:
                        raise ValueError(
                            f"""No phonemizer found for language {config.phoneme_language}.
You may need to install a third party library for this language."""
                        ) from e

        return (
            TTSTokenizer(
                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
            ),
            new_config,
        )
|