| import os |
|
|
| import torch |
| from tokenizers import Tokenizer |
|
|
| from TTS.tts.utils.text.cleaners import english_cleaners |
|
|
# Default BPE vocabulary shipped with the repo: resolved relative to this
# module's real location so it works regardless of the current working dir.
# NOTE(review): the "../../utils/assets/..." hop assumes the package layout
# places this file two levels below the TTS/tts root — confirm on relocation.
DEFAULT_VOCAB_FILE = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
)
|
|
|
|
class VoiceBpeTokenizer:
    """Thin wrapper around a HuggingFace `tokenizers` BPE model for Tortoise.

    Text is normalized with `english_cleaners` and spaces are mapped to the
    literal "[SPACE]" token before encoding; `decode` reverses that mapping
    and strips the "[STOP]" / "[UNK]" special tokens.
    """

    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None):
        """Load the tokenizer from a JSON file and/or a JSON string.

        When both are supplied, `vocab_str` takes precedence (it is applied
        last, matching the original behavior of loading the file first and
        then overwriting the result).
        """
        loaded = None
        if vocab_file is not None:
            loaded = Tokenizer.from_file(vocab_file)
        if vocab_str is not None:
            loaded = Tokenizer.from_str(vocab_str)
        self.tokenizer = loaded

    def preprocess_text(self, txt):
        """Normalize raw text with the English cleaner pipeline."""
        return english_cleaners(txt)

    def encode(self, txt):
        """Return the list of BPE token ids for `txt`."""
        cleaned = self.preprocess_text(txt)
        # Spaces are represented by an explicit "[SPACE]" token in the vocab.
        spaced = cleaned.replace(" ", "[SPACE]")
        return self.tokenizer.encode(spaced).ids

    def decode(self, seq):
        """Map a sequence of token ids back to a plain string."""
        if isinstance(seq, torch.Tensor):
            # Move to host memory; the rust binding accepts the numpy ints.
            seq = seq.cpu().numpy()
        raw = self.tokenizer.decode(seq, skip_special_tokens=False)
        # Drop the joiner spaces the decoder inserts, then restore real
        # spaces from "[SPACE]" and strip the remaining special tokens.
        txt = raw.replace(" ", "")
        txt = txt.replace("[SPACE]", " ")
        txt = txt.replace("[STOP]", "")
        return txt.replace("[UNK]", "")
|
|