| | """ |
| | V6 Tokenizer β char-level for Bulgarian TTS with MioCodec |
| | ========================================================== |
| | Same character set as V5, but adapted for: |
| | - MioCodec single codebook (no interleaving) |
| | - Speaker embedding (no speaker tokens in encoder input) |
| | """ |
| |
|
| | import re |
| | import torch |
| | from typing import Optional |
| |
|
| | from config import ( |
| | TEXT_CHARS, TEXT_OFFSET, AUDIO_OFFSET, |
| | SPECIAL_TOKENS, NUM_SPECIAL_TOKENS, CODEC_CODEBOOK_SIZE, |
| | TOTAL_VOCAB_SIZE, |
| | PAD_TOKEN_ID, START_OF_TEXT_TOKEN_ID, END_OF_TEXT_TOKEN_ID, |
| | START_OF_SPEECH_TOKEN_ID, END_OF_SPEECH_TOKEN_ID, |
| | is_audio_token, is_special_token, is_text_token, |
| | ) |
| |
|
| |
|
class TTSTokenizer:
    """Char-level tokenizer mapping text characters and MioCodec audio
    codes into one shared token space (single codebook; speaker identity
    is supplied via an embedding, not via tokens)."""

    def __init__(self):
        # Bidirectional char <-> token-id tables over the fixed TEXT_CHARS set,
        # shifted by TEXT_OFFSET into the global vocabulary.
        self.char2id: dict[str, int] = {
            ch: TEXT_OFFSET + idx for idx, ch in enumerate(TEXT_CHARS)
        }
        self.id2char: dict[int, str] = {
            tid: ch for ch, tid in self.char2id.items()
        }
        # Reverse lookup used only for pretty-printing in describe().
        self._special_id_to_name = {
            tok_id: name for name, tok_id in SPECIAL_TOKENS.items()
        }
        self.vocab_size = TOTAL_VOCAB_SIZE
        self.text_vocab_size = len(TEXT_CHARS)

    def normalize_text(self, text: str) -> str:
        """Collapse runs of whitespace and canonicalize dash/quote variants."""
        out = re.sub(r'\s+', ' ', text).strip()
        out = re.sub(r'[ββ]', '-', out)
        return re.sub(r'[«»β""]', '"', out)

    def encode_text(self, text: str) -> list[int]:
        """Normalize *text*, then map each known character to its token id.

        Characters outside the vocabulary are silently dropped.
        """
        table = self.char2id
        return [table[c] for c in self.normalize_text(text) if c in table]

    def decode_text(self, ids: list[int]) -> str:
        """Inverse of encode_text; ids that are not text tokens are skipped."""
        chars = (self.id2char.get(tid, "") for tid in ids if is_text_token(tid))
        return "".join(chars)

    def build_encoder_input(self, text: str) -> torch.Tensor:
        """Encoder input: <sot> text_chars <eot>.

        No speaker token β speaker info comes from the embedding.
        """
        tokens = [START_OF_TEXT_TOKEN_ID, *self.encode_text(text), END_OF_TEXT_TOKEN_ID]
        return torch.tensor(tokens, dtype=torch.long)

    def build_decoder_input(self, audio_codes: torch.Tensor) -> torch.Tensor:
        """Decoder input: <sos> [audio_codes + AUDIO_OFFSET] <eos>.

        audio_codes: raw MioCodec codes (shifted into the global vocab here).
        """
        shifted = (audio_codes + AUDIO_OFFSET).tolist()
        return torch.tensor(
            [START_OF_SPEECH_TOKEN_ID, *shifted, END_OF_SPEECH_TOKEN_ID],
            dtype=torch.long,
        )

    def build_decoder_prefix(self) -> torch.Tensor:
        """For inference: just <sos> to start generation."""
        return torch.tensor([START_OF_SPEECH_TOKEN_ID], dtype=torch.long)

    def extract_audio_codes(self, sequence: torch.Tensor) -> Optional[torch.Tensor]:
        """Return the raw MioCodec codes (offset removed) found in *sequence*,
        or None when it contains no audio tokens."""
        keep = torch.tensor([is_audio_token(int(t)) for t in sequence])
        if keep.any():
            return sequence[keep] - AUDIO_OFFSET
        return None

    def describe(self, seq: torch.Tensor, max_tok: int = 30) -> str:
        """Human-readable rendering of the first *max_tok* tokens of *seq*,
        with a trailing count of any tokens omitted."""
        rendered = []
        for tok in seq[:max_tok]:
            tid = int(tok)
            if is_special_token(tid):
                rendered.append(self._special_id_to_name.get(tid, f"<sp_{tid}>"))
            elif is_text_token(tid):
                ch = self.id2char.get(tid, "?")
                rendered.append("Β·" if ch == " " else ch)
            elif is_audio_token(tid):
                rendered.append(f"βͺ{tid - AUDIO_OFFSET}")
            else:
                rendered.append(f"?{tid}")
        text = " ".join(rendered)
        if len(seq) > max_tok:
            text += f" ... [{len(seq) - max_tok} more]"
        return text
| |
|