| import logging |
| from collections.abc import Callable |
| from typing import Union |
|
|
| from TTS.tts.utils.text import cleaners |
| from TTS.tts.utils.text.characters import BaseCharacters, Graphemes, IPAPhonemes |
| from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name |
| from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer |
| from TTS.utils.generic_utils import get_import_path, import_class |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class TTSTokenizer:
    """🐸TTS tokenizer to convert input characters to token IDs and back.

    Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.

    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.

        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.

        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.

        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.

        add_blank (bool):
            Whether to intersperse a blank token ID between every pair of token IDs. Defaults to False.

        use_eos_bos (bool):
            Whether to wrap the token ID sequence with the BOS and EOS IDs. Defaults to False.

    Example:

        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    """

    def __init__(
        self,
        use_phonemes: bool = False,
        text_cleaner: Callable[[str], str] | None = None,
        characters: BaseCharacters | None = None,
        phonemizer: Union["Phonemizer", dict] | None = None,
        add_blank: bool = False,
        use_eos_bos: bool = False,
    ):
        self.text_cleaner = text_cleaner
        self.use_phonemes = use_phonemes
        self.add_blank = add_blank
        self.use_eos_bos = use_eos_bos
        # Goes through the `characters` property setter below, which also
        # caches `self.pad_id` / `self.blank_id`.
        self.characters = characters
        self.not_found_characters = []
        self.phonemizer = phonemizer

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        self._characters = new_characters
        # Cache the special-token IDs so encoding paths don't re-resolve them.
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> list[int]:
        """Encodes a string of text as a sequence of IDs.

        OOV characters are discarded; each distinct OOV character is recorded
        in `self.not_found_characters` and warned about only once.
        """
        token_ids = []
        for char in text:
            try:
                token_ids.append(self.characters.char_to_id(char))
            except KeyError:
                # Discard the OOV char but remember it and warn once per char.
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    logger.warning(text)
                    logger.warning("Character %s not found in the vocabulary. Discarding it.", repr(char))
        return token_ids

    def decode(self, token_ids: list[int]) -> str:
        """Decodes a sequence of IDs to a string of text."""
        # "".join avoids the quadratic cost of repeated string concatenation.
        return "".join(self.characters.id_to_char(token_id) for token_id in token_ids)

    def text_to_ids(self, text: str, language: str | None = None) -> list[int]:
        """Converts a string of text to a sequence of token IDs.

        Args:
            text(str):
                The text to convert to token IDs.

            language(str):
                The language code of the text. Defaults to None.

        TODO:
            - Add support for language-specific processing.

        1. Text normalization
        2. Phonemization (if use_phonemes is True)
        3. Add blank char between characters
        4. Add BOS and EOS characters
        5. Text to token IDs
        """
        logger.debug("Tokenizer input text: %s", text)
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
            logger.debug("Cleaned text: %s", text)
        if self.use_phonemes:
            text = self.phonemizer.phonemize(text, separator="", language=language)
            logger.debug("Phonemes: %s", text)
        token_ids = self.encode(text)
        if self.add_blank:
            token_ids = self.intersperse_blank_char(token_ids, True)
        if self.use_eos_bos:
            token_ids = self.pad_with_bos_eos(token_ids)
        return token_ids

    def ids_to_text(self, id_sequence: list[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
        return self.decode(id_sequence)

    def pad_with_bos_eos(self, char_sequence: list[int]) -> list[int]:
        """Pads a sequence with the special BOS and EOS token IDs."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: list[int], use_blank_char: bool = False) -> list[int]:
        """Intersperses the blank token ID between the IDs in a sequence.

        Use the ```blank``` token if defined else use the ```pad``` token.
        """
        # Fix: use the cached pad *ID* (not `self.characters.pad`, which is the
        # raw pad character) so both branches produce a homogeneous ID list.
        char_to_use = self.blank_id if use_blank_char else self.pad_id
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        # Fill the odd positions with the original sequence:
        # [blank, c0, blank, c1, ..., blank].
        result[1::2] = char_sequence
        return result

    def print_logs(self, level: int = 0):
        """Logs the tokenizer configuration, indented by `level` tab stops."""
        indent = "\t" * level
        logger.info("%s| add_blank: %s", indent, self.add_blank)
        logger.info("%s| use_eos_bos: %s", indent, self.use_eos_bos)
        logger.info("%s| use_phonemes: %s", indent, self.use_phonemes)
        if self.use_phonemes:
            logger.info("%s| phonemizer:", indent)
            self.phonemizer.print_logs(level + 1)
        if len(self.not_found_characters) > 0:
            logger.info("%s| %d characters not found:", indent, len(self.not_found_characters))
            for char in self.not_found_characters:
                logger.info("%s| %s", indent, char)

    @staticmethod
    def init_from_config(config: "Coqpit", characters: BaseCharacters | None = None):
        """Init Tokenizer object from config

        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.

        Returns:
            Tuple[TTSTokenizer, Coqpit]: The initialized tokenizer and the (possibly updated) config.
        """
        # Resolve the text cleaner function by name from the cleaners module.
        text_cleaner = None
        if isinstance(config.text_cleaner, str | list):
            text_cleaner = getattr(cleaners, config.text_cleaner)

        # Init characters: explicit argument > config-declared class > defaults
        # (IPAPhonemes when phonemes are enabled, else Graphemes).
        if characters is None:
            if config.characters and config.characters.characters_class:
                CharactersClass = import_class(config.characters.characters_class)
                if not issubclass(CharactersClass, BaseCharacters):
                    msg = f"{config.characters.characters_class} is not a subclass of BaseCharacters."
                    raise TypeError(msg)
                characters, new_config = CharactersClass.init_from_config(config)
            else:
                if config.use_phonemes:
                    characters, new_config = IPAPhonemes().init_from_config(config)
                else:
                    characters, new_config = Graphemes().init_from_config(config)
        else:
            characters, new_config = characters.init_from_config(config)

        # Record the concrete characters class back into the config so the
        # exact character set can be restored at inference time.
        new_config.characters.characters_class = get_import_path(characters)

        # Init phonemizer: either one phonemizer per dataset language
        # ("multi_phonemizer") or a single phonemizer resolved by name or by
        # the default language-to-phonemizer mapping.
        phonemizer = None
        if config.use_phonemes:
            if "phonemizer" in config and config.phonemizer == "multi_phonemizer":
                lang_to_phonemizer_name = {}
                for dataset in config.datasets:
                    if dataset.language != "":
                        lang_to_phonemizer_name[dataset.language] = dataset.phonemizer
                    else:
                        raise ValueError("Multi phonemizer requires language to be set for each dataset.")
                phonemizer = MultiPhonemizer(lang_to_phonemizer_name)
            else:
                phonemizer_kwargs = {"language": config.phoneme_language}
                if "phonemizer" in config and config.phonemizer:
                    phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
                else:
                    try:
                        phonemizer = get_phonemizer_by_name(
                            DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
                        )
                        new_config.phonemizer = phonemizer.name()
                    except KeyError as e:
                        raise ValueError(
                            f"""No phonemizer found for language {config.phoneme_language}.
                            You may need to install a third party library for this language."""
                        ) from e

        return (
            TTSTokenizer(
                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
            ),
            new_config,
        )
|
|