text_utils.py · Seemanth/chiluka-tts at main

File size: 914 Bytes

f28049f

"""Text processing utilities for phoneme tokenization."""

# Symbol inventory used for phoneme tokenization. The order of these strings
# fixes the integer assigned to each symbol, so they must not be reordered.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»"" '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Flat list of single-character symbols: padding first, then punctuation,
# ASCII letters, and finally the IPA characters.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Character -> position lookup. If a character appears more than once in
# ``symbols``, the later position overwrites the earlier one.
_symbol_to_id = dict(zip(symbols, range(len(symbols))))


class TextCleaner:
    """Callable that maps each character of a phoneme string to its token ID."""

    def __init__(self):
        # Reuse the module-level lookup table; it is referenced, not copied.
        self.word_index_dictionary = _symbol_to_id

    def __call__(self, text):
        """Return the IDs of the characters in ``text``, in order.

        Characters that are not keys of ``word_index_dictionary`` are skipped
        without any error, so the result can be shorter than ``text``.
        """
        table = self.word_index_dictionary
        return [table[ch] for ch in text if ch in table]