| |
| """ |
| ============================================================= |
| Sinhala Grapheme Tokenizer for TTS |
| ============================================================= |
| Tokenizes Sinhala text into orthographic syllables (aksharas) |
| for FastPitch character-level TTS training. |
| |
| Sinhala is a Brahmic abugida (U+0D80–U+0DFF): |
| - Consonants carry an inherent /a/ vowel |
| - Al-lakuna (virama) ් suppresses the inherent vowel |
| - ZWJ (U+200D) forms conjunct consonants: ක් + ZWJ + ෂ → ක්ෂ |
| - Vowel diacritics modify the inherent vowel |
| |
| An akshara (grapheme cluster) is the minimal unit: |
| - Independent vowel: අ, ආ, ඉ, ඊ, ... |
| - Consonant + optional vowel sign: ක, කා, කි, ... |
| - Conjunct: consonant + virama + ZWJ + consonant + vowel sign |
| |
| Why aksharas (not phonemes): |
| IndicTTS (arxiv:2211.09536) proved character-level input works |
| for all 13 Indic languages tested. No phoneme dictionary needed. |
| |
| Usage: |
| from sinhala_tokenizer import SinhalaTokenizer |
| tok = SinhalaTokenizer() |
| tokens = tok.tokenize("ශ්රී ලංකාව") |
| ids = tok.encode("ශ්රී ලංකාව") |
| text = tok.decode(ids) |
| |
| ============================================================= |
| """ |
|
|
| import json |
| import re |
| import unicodedata |
| from pathlib import Path |
| from typing import Dict, List, Optional, Tuple |
|
|
|
|
| |
| |
| |
|
|
| |
# ---------------------------------------------------------------
# Sinhala character inventory (Unicode block U+0D80-U+0DFF)
# ---------------------------------------------------------------


def _codepoints(*spans):
    """Return the characters covered by inclusive (first, last) code point spans."""
    return ''.join(
        chr(cp) for first, last in spans for cp in range(first, last + 1)
    )


# Independent vowels: U+0D85 .. U+0D96 (18 characters).
SINHALA_VOWELS = _codepoints((0x0D85, 0x0D96))

# Consonants (41 characters). U+0DB2, U+0DBC, U+0DBE and U+0DBF are
# unassigned in the Sinhala block and are therefore skipped.
SINHALA_CONSONANTS = _codepoints(
    (0x0D9A, 0x0DB1),
    (0x0DB3, 0x0DBB),
    (0x0DBD, 0x0DBD),
    (0x0DC0, 0x0DC6),
)

# Al-lakuna (virama): suppresses the inherent vowel of a consonant.
VIRAMA = '\u0DCA'

# Zero-width joiner: placed after the virama to request a conjunct form.
ZWJ = '\u200D'

# Dependent vowel signs (15 characters). U+0DD5 and U+0DD7 are unassigned.
SINHALA_VOWEL_SIGNS = _codepoints(
    (0x0DCF, 0x0DD4),
    (0x0DD6, 0x0DD6),
    (0x0DD8, 0x0DDF),
)

# Signs that may close an akshara.
ANUSVARA = '\u0D82'
VISARGA = '\u0D83'

# Sinhala Lith digits: U+0DE6 .. U+0DEF.
SINHALA_DIGITS = _codepoints((0x0DE6, 0x0DEF))

# Set views of the inventories for O(1) membership tests.
VOWEL_SET = set(SINHALA_VOWELS)
CONSONANT_SET = set(SINHALA_CONSONANTS)
VOWEL_SIGN_SET = set(SINHALA_VOWEL_SIGNS)
MODIFIER_SET = VOWEL_SIGN_SET | {ANUSVARA, VISARGA, VIRAMA}


# ---------------------------------------------------------------
# Akshara regular expression
# ---------------------------------------------------------------

# Building blocks: single-character classes and escaped literals.
_C = f'[{SINHALA_CONSONANTS}]'
_V = f'[{SINHALA_VOWELS}]'
_VS = f'[{SINHALA_VOWEL_SIGNS}]'
_VIR = re.escape(VIRAMA)
_ZWJ = re.escape(ZWJ)
_ANU = re.escape(ANUSVARA)
_VIS = re.escape(VISARGA)

# Optional anusvara/visarga that may end a cluster.
_SIGN = f'[{_ANU}{_VIS}]'

# One conjunct link: virama, optional ZWJ, then the next consonant.
_CONJUNCT = f'{_VIR}{_ZWJ}?{_C}'

# Alternatives are tried left to right; each keeps its own capture group
# (1..7) so callers can tell which kind of token matched:
#   1. consonant cluster: C (virama ZWJ? C)* followed by either one vowel
#      sign or one trailing virama, then an optional anusvara/visarga.
#      The trailing-virama option keeps a vowel-less ("hal") consonant such
#      as a word-final one together with its virama; without it the virama
#      would fall through to the catch-all group as a token of its own.
#   2. independent vowel with optional anusvara/visarga
#   3. stray anusvara/visarga
#   4. run of ASCII or Sinhala digits
#   5. a single punctuation mark (including en/em dash and ellipsis)
#   6. run of spaces
#   7. any other single character
AKSHARA_PATTERN = re.compile(
    f'({_C}(?:{_CONJUNCT})*(?:{_VS}|{_VIR})?{_SIGN}?)'
    f'|({_V}{_SIGN}?)'
    f'|({_SIGN})'
    f'|([0-9{SINHALA_DIGITS}]+)'
    '|([!?.,;:\'"\\-\u2013\u2014\u2026])'
    '|( +)'
    '|(.)',
    re.UNICODE
)
|
|
|
|
class SinhalaTokenizer:
    """
    Sinhala grapheme (akshara) tokenizer for TTS.

    Designed for Coqui-TTS FastPitch training.
    Maps Sinhala text -> grapheme cluster sequence -> integer IDs.

    Instance attributes (set by ``_build_default_vocab`` or ``load_vocab``):
        token2id: mapping from token string to integer ID.
        id2token: inverse mapping from integer ID to token string.
        vocab_size: number of entries in ``token2id``.
    """

    # Special tokens; in the default vocabulary they take IDs 0..4 in this order.
    PAD = "<PAD>"
    BOS = "<BOS>"
    EOS = "<EOS>"
    UNK = "<UNK>"
    BLANK = "<BLNK>"
    SPACE = " "

    # Punctuation that receives an ID in the default vocabulary.
    # NOTE(review): this literal contains a backslash ('\\' followed by '-'),
    # so '\' gets a vocabulary ID although get_punctuations_string() does not
    # list it. Kept as-is: dropping it would shift every later default ID.
    _VOCAB_PUNCTUATION = '!?.,;:\'"\\-'

    # Single-pass character rewrites applied by normalize() after NFC.
    _NORMALIZATION_TABLE = str.maketrans({
        '\u200C': None,   # ZWNJ is dropped (ZWJ is kept: it forms conjuncts)
        '\u201C': '"',    # left double quotation mark
        '\u201D': '"',    # right double quotation mark
        '\u2018': "'",    # left single quotation mark
        '\u2019': "'",    # right single quotation mark
        ';': ',',
        ':': ',',
        '(': None,
        ')': None,
    })

    def __init__(self, vocab_path: Optional[str] = None):
        """
        Initialize tokenizer.

        If vocab_path is provided and the file exists, load that vocabulary.
        Otherwise (no path, or the path does not exist) build the default
        vocabulary from the Sinhala character inventory.
        """
        if vocab_path and Path(vocab_path).exists():
            self.load_vocab(vocab_path)
        else:
            self._build_default_vocab()

    def _build_default_vocab(self):
        """Build vocabulary from Sinhala Unicode block + common tokens."""
        self.token2id: Dict[str, int] = {}
        self.id2token: Dict[int, str] = {}

        # The order of these groups defines the ID layout; do not reorder.
        groups = (
            (self.PAD, self.BOS, self.EOS, self.UNK, self.BLANK),
            (self.SPACE,),
            self._VOCAB_PUNCTUATION,
            SINHALA_VOWELS,
            SINHALA_CONSONANTS,
            SINHALA_VOWEL_SIGNS,
            (VIRAMA, ZWJ, ANUSVARA, VISARGA),
            '0123456789',
            SINHALA_DIGITS,
        )
        for group in groups:
            for token in group:
                if token in self.token2id:
                    continue
                # IDs are handed out consecutively, so the current size is
                # always the next free ID.
                next_id = len(self.token2id)
                self.token2id[token] = next_id
                self.id2token[next_id] = token

        self.vocab_size = len(self.token2id)

    def normalize(self, text: str) -> str:
        """
        Normalize Sinhala text for TTS.

        - NFC normalization
        - Remove ZWNJ (keep ZWJ)
        - Map typographic quotes to ASCII quotes
        - Replace ';' and ':' with ',' and drop parentheses
        - Collapse whitespace runs to one space and trim both ends
        """
        text = unicodedata.normalize('NFC', text)
        text = text.translate(self._NORMALIZATION_TABLE)
        # split() with no argument discards leading/trailing whitespace too.
        return ' '.join(text.split())

    def tokenize(self, text: str, normalize: bool = True) -> List[str]:
        """
        Tokenize text into grapheme clusters.

        Args:
            text: input text.
            normalize: when True, run ``normalize`` on the text first.

        Returns:
            List of tokens (aksharas, punctuation, spaces) in text order.
        """
        if normalize:
            text = self.normalize(text)
        matched = (m.group(0) for m in AKSHARA_PATTERN.finditer(text))
        return [token for token in matched if token]

    def encode(self, text: str, add_bos: bool = True, add_eos: bool = True) -> List[int]:
        """
        Encode text to integer IDs.

        For multi-codepoint aksharas (conjuncts), each codepoint gets its own ID.
        This is by design: FastPitch's character embedding handles the sequence,
        and the attention/aligner learns the mapping to mel frames.

        Args:
            text: input text; a ``str`` is normalized first, any other
                iterable of tokens is encoded as given.
            add_bos: prepend the BOS ID.
            add_eos: append the EOS ID.

        Returns:
            List of IDs; items missing from the vocabulary map to the UNK ID.
        """
        if isinstance(text, str):
            text = self.normalize(text)

        vocab = self.token2id
        ids: List[int] = []
        if add_bos:
            ids.append(vocab[self.BOS])
        # UNK is looked up only when actually needed.
        ids.extend(
            vocab[char] if char in vocab else vocab[self.UNK] for char in text
        )
        if add_eos:
            ids.append(vocab[self.EOS])
        return ids

    def decode(self, ids: List[int], strip_special: bool = True) -> str:
        """
        Decode integer IDs back to text.

        IDs absent from the vocabulary decode to the UNK token. When
        ``strip_special`` is True, PAD/BOS/EOS/BLANK/UNK tokens are omitted.
        """
        specials = {self.PAD, self.BOS, self.EOS, self.BLANK, self.UNK}
        tokens = (self.id2token.get(idx, self.UNK) for idx in ids)
        return ''.join(
            token for token in tokens
            if not (strip_special and token in specials)
        )

    def get_characters_string(self) -> str:
        """
        Get all characters as a single string for Coqui-TTS config.

        Contains every single-character token, in ID order, except the
        space and the vocabulary punctuation; multi-character tokens
        (including the special tokens) are left out.

        Usage in Coqui config:
            config.characters.characters = tokenizer.get_characters_string()
        """
        excluded = set(self._VOCAB_PUNCTUATION) | {self.SPACE}
        by_id = sorted(self.token2id.items(), key=lambda item: item[1])
        return ''.join(
            token for token, _ in by_id
            if len(token) == 1 and token not in excluded
        )

    def get_punctuations_string(self) -> str:
        """Get punctuation characters (plus the space) for Coqui-TTS config."""
        return '!?.,;:\'"- '

    def save_vocab(self, path: str):
        """Save vocabulary to JSON (UTF-8, non-ASCII characters kept readable)."""
        data = {
            "token2id": self.token2id,
            "vocab_size": self.vocab_size,
            "description": "Sinhala TTS grapheme tokenizer vocabulary",
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def load_vocab(self, path: str):
        """Load vocabulary from a JSON file containing a "token2id" mapping."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Coerce IDs to int so both maps agree even if the file stored strings.
        self.token2id = {token: int(idx) for token, idx in data["token2id"].items()}
        self.id2token = {idx: token for token, idx in self.token2id.items()}
        self.vocab_size = len(self.token2id)

    def expand_vocab(self, new_tokens: List[str]):
        """
        Add new tokens to vocabulary (e.g., from training data).

        Tokens already present are ignored. New tokens get IDs after the
        highest existing ID, so an ID in use is never reassigned even when
        a loaded vocabulary has gaps in its numbering.
        """
        next_id = max(self.id2token, default=-1) + 1
        for token in new_tokens:
            if token in self.token2id:
                continue
            self.token2id[token] = next_id
            self.id2token[next_id] = token
            next_id += 1
        self.vocab_size = len(self.token2id)

    def build_vocab_from_corpus(self, texts: List[str]):
        """
        Scan corpus and add any characters not in the current vocab.

        Call this after loading your training data to ensure
        all characters are covered. Each text is normalized before
        scanning; new characters are added in sorted order.
        """
        unseen = {
            char
            for text in texts
            for char in self.normalize(text)
            if char not in self.token2id
        }
        if unseen:
            print(f"Adding {len(unseen)} new characters to vocab: {unseen}")
            self.expand_vocab(sorted(unseen))

    def __len__(self):
        """Return the vocabulary size."""
        return self.vocab_size

    def __repr__(self):
        return (f"SinhalaTokenizer(vocab_size={self.vocab_size}, "
                f"vowels={len(SINHALA_VOWELS)}, "
                f"consonants={len(SINHALA_CONSONANTS)}, "
                f"vowel_signs={len(SINHALA_VOWEL_SIGNS)})")
|
|
|
|
| |
| |
| |
|
|
def syllabify(text: str) -> List[str]:
    """
    Split Sinhala text into aksharas without needing a tokenizer instance.

    The text is NFC-normalized and scanned with AKSHARA_PATTERN; matches
    that consist only of whitespace are dropped from the result.
    """
    composed = unicodedata.normalize('NFC', text)
    pieces = (found.group(0) for found in AKSHARA_PATTERN.finditer(composed))
    # A piece with non-whitespace content is necessarily non-empty.
    return [piece for piece in pieces if piece.strip()]
|
|
|
|
def count_aksharas(text: str) -> int:
    """Return how many non-whitespace tokens ``syllabify`` finds in text."""
    clusters = syllabify(text)
    return len(clusters)
|
|
|
|
def is_sinhala(text: str) -> bool:
    """
    Check if text is predominantly Sinhala.

    Returns True when strictly more than half of the non-whitespace
    characters lie in the Sinhala block U+0D80..U+0DFF, and False for
    empty or whitespace-only input.
    """
    visible = [ch for ch in text if not ch.isspace()]
    if not visible:
        return False
    # No whitespace character falls inside the Sinhala block, so counting
    # over the visible characters equals counting over the whole text.
    in_block = [ch for ch in visible if '\u0D80' <= ch <= '\u0DFF']
    return len(in_block) / len(visible) > 0.5
|
|
|
|
def get_coqui_characters_config() -> dict:
    """
    Build the character config dict for Coqui-TTS FastPitch.

    A default SinhalaTokenizer supplies the character and punctuation
    strings; the special tokens come from the class constants.

    Usage:
        from TTS.tts.configs.shared_configs import CharactersConfig
        char_config = CharactersConfig(**get_coqui_characters_config())
    """
    tokenizer = SinhalaTokenizer()
    config = dict(
        pad=SinhalaTokenizer.PAD,
        eos=SinhalaTokenizer.EOS,
        bos=SinhalaTokenizer.BOS,
        blank=SinhalaTokenizer.BLANK,
    )
    config["characters"] = tokenizer.get_characters_string()
    config["punctuations"] = tokenizer.get_punctuations_string()
    config["phonemes"] = None
    config["is_unique"] = True
    return config
|
|
|
|
| |
| |
| |
if __name__ == "__main__":
    # Demo / smoke test: round-trip a few sample sentences, print the
    # Coqui-TTS character config, then write the vocabulary to disk.
    tokenizer = SinhalaTokenizer()
    print(f"Tokenizer: {tokenizer}")
    print(f"Vocab size: {tokenizer.vocab_size}")
    print()

    samples = (
        "ශ්රී ලංකාව",
        "මෙය උදාහරණ වාක්යයකි.",
        "සිංහල භාෂාව ඉතා සුන්දරයි!",
        "ක්රිකට් ක්රීඩාව",
        "බුද්ධ ශාසනය",
        "123 දවස්",
    )

    for sample in samples:
        aksharas = tokenizer.tokenize(sample)
        encoded = tokenizer.encode(sample)
        round_trip = tokenizer.decode(encoded)
        # Decoding should reproduce the normalized input exactly.
        verdict = "✓" if round_trip == tokenizer.normalize(sample) else "✗"

        print(f"Input: {sample}")
        print(f"Aksharas: {aksharas}")
        print(f"IDs: {encoded}")
        print(f"Decoded: {round_trip}")
        print(f"Match: {verdict}")
        print()

    rule = "=" * 60
    print(rule)
    print("FOR COQUI-TTS CONFIG:")
    print(rule)
    coqui = get_coqui_characters_config()
    print(f"characters: {coqui['characters']!r}")
    print(f"punctuations: {coqui['punctuations']!r}")
    print(f"Total unique characters: {len(coqui['characters'])}")

    vocab_path = "sinhala_vocab.json"
    tokenizer.save_vocab(vocab_path)
    print(f"\nVocab saved to: {vocab_path}")
|
|