# NOTE(review): the three lines below were page chrome scraped from the
# Hugging Face Spaces UI ("Spaces: / Runtime error / Runtime error") and
# are not part of the program; kept here as a comment so the module parses.
# Regex
import re

# Phonemizer
from phonemizer.backend import EspeakBackend

# Module-level espeak phonemizer shared by NixTokenizerEN.__call__.
# NOTE(review): constructed at import time — importing this module requires
# the espeak-ng binary to be installed; presumably that is the cause of the
# Space's "Runtime error" banner — verify in the Space logs.
phonemizer_backend = EspeakBackend(
    language = 'en-us',
    preserve_punctuation = True,
    with_stress = True
)
class NixTokenizerEN:
    """English tokenizer for Nix-TTS.

    Pipeline per input text: lowercase -> expand abbreviations ->
    phonemize with the module-level espeak backend -> collapse whitespace
    -> map each phoneme character to a vocabulary id, interleaving a 0
    ("blank") id -> right-pad the batch to a common length with 0.
    """

    def __init__(
        self,
        tokenizer_state,
    ):
        """Restore tokenizer assets from a saved state dict.

        tokenizer_state must provide the keys "vocab_dict",
        "abbreviations_dict", "whitespace_regex" and "abbreviations_regex".
        """
        # Vocab and abbreviations dictionary
        self.vocab_dict = tokenizer_state["vocab_dict"]
        self.abbreviations_dict = tokenizer_state["abbreviations_dict"]

        # Regex recipe: a whitespace pattern and (pattern, replacement) pairs
        self.whitespace_regex = tokenizer_state["whitespace_regex"]
        self.abbreviations_regex = tokenizer_state["abbreviations_regex"]

    def __call__(
        self,
        texts,
    ):
        """Tokenize a batch of texts.

        Returns (tokens, tokens_lengths, phonemes): padded id lists,
        the pre-padding length of each sequence, and the phoneme strings.
        """
        # 1. Phonemize input texts
        phonemes = [
            self._collapse_whitespace(
                phonemizer_backend.phonemize(
                    self._expand_abbreviations(text.lower()),
                    strip = True,
                )
            )
            for text in texts
        ]

        # 2. Tokenize phonemes — every phoneme char must be in vocab_dict
        # (a KeyError here means an out-of-vocabulary symbol)
        tokens = [
            self._intersperse([self.vocab_dict[p] for p in phoneme], 0)
            for phoneme in phonemes
        ]

        # 3. Pad tokens to the longest sequence in the batch
        tokens, tokens_lengths = self._pad_tokens(tokens)

        return tokens, tokens_lengths, phonemes

    def _expand_abbreviations(
        self,
        text
    ):
        """Apply each (pattern, replacement) pair to text, in order."""
        for regex, replacement in self.abbreviations_regex:
            text = re.sub(regex, replacement, text)
        return text

    def _collapse_whitespace(
        self,
        text
    ):
        """Replace every whitespace run matched by whitespace_regex with ' '."""
        return re.sub(self.whitespace_regex, ' ', text)

    def _intersperse(
        self,
        lst,
        item,
    ):
        """Interleave item around every element: [a, b] -> [item, a, item, b, item]."""
        result = [item] * (len(lst) * 2 + 1)
        result[1::2] = lst  # odd slots take the original elements
        return result

    def _pad_tokens(
        self,
        tokens,
    ):
        """Right-pad every id list with 0 to the batch max length.

        Returns (padded_tokens, original_lengths).
        """
        tokens_lengths = [len(token) for token in tokens]
        # BUGFIX: default=0 — max() of an empty sequence raises ValueError,
        # so an empty batch previously crashed here.
        max_len = max(tokens_lengths, default=0)
        tokens = [token + [0] * (max_len - len(token)) for token in tokens]
        return tokens, tokens_lengths