#!/usr/bin/env python3
"""
=============================================================
Sinhala Grapheme Tokenizer for TTS
=============================================================

Tokenizes Sinhala text into orthographic syllables (aksharas)
for FastPitch character-level TTS training.

Sinhala is a Brahmic abugida (U+0D80–U+0DFF):
  - Consonants carry an inherent /a/ vowel
  - Al-lakuna (virama) ් suppresses the inherent vowel
  - ZWJ (U+200D) forms conjunct consonants: ක් + ZWJ + ෂ → ක්‍ෂ
  - Vowel diacritics modify the inherent vowel

An akshara (grapheme cluster) is the minimal unit:
  - Independent vowel: අ, ආ, ඉ, ඊ, ...
  - Consonant + optional vowel sign: ක, කා, කි, ...
  - Consonant + al-lakuna (vowel-less consonant): ක්, ට්, ...
  - Conjunct: consonant + virama + ZWJ + consonant + vowel sign

Why aksharas (not phonemes):
  IndicTTS (arxiv:2211.09536) proved character-level input works
  for all 13 Indic languages tested. No phoneme dictionary needed.

Usage:
    from sinhala_tokenizer import SinhalaTokenizer

    tok = SinhalaTokenizer()
    tokens = tok.tokenize("ශ්‍රී ලංකාව")
    ids = tok.encode("ශ්‍රී ලංකාව")
    text = tok.decode(ids)
=============================================================
"""

import json
import re
import unicodedata
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# ============================================================
# Unicode Constants for Sinhala (U+0D80–U+0DFF)
# ============================================================

# Independent vowels (can stand alone)
SINHALA_VOWELS = (
    '\u0D85'  # අ (a)
    '\u0D86'  # ආ (aa)
    '\u0D87'  # ඇ (ae)
    '\u0D88'  # ඈ (aae)
    '\u0D89'  # ඉ (i)
    '\u0D8A'  # ඊ (ii)
    '\u0D8B'  # උ (u)
    '\u0D8C'  # ඌ (uu)
    '\u0D8D'  # ඍ (ri)
    '\u0D8E'  # ඎ (rii) - rare
    '\u0D8F'  # ඏ (li) - rare
    '\u0D90'  # ඐ (lii) - rare
    '\u0D91'  # එ (e)
    '\u0D92'  # ඒ (ee)
    '\u0D93'  # ඓ (ai)
    '\u0D94'  # ඔ (o)
    '\u0D95'  # ඕ (oo)
    '\u0D96'  # ඖ (au)
)

# Consonants
SINHALA_CONSONANTS = (
    '\u0D9A'  # ක (ka)
    '\u0D9B'  # ඛ (kha)
    '\u0D9C'  # ග (ga)
    '\u0D9D'  # ඝ (gha)
    '\u0D9E'  # ඞ (nga)
    '\u0D9F'  # ඟ (nnga) - sanyaka (prenasalized)
    '\u0DA0'  # ච (cha)
    '\u0DA1'  # ඡ (chha)
    '\u0DA2'  # ජ (ja)
    '\u0DA3'  # ඣ (jha)
    '\u0DA4'  # ඤ (nya)
    '\u0DA5'  # ඥ (jnya)
    '\u0DA6'  # ඦ (nyja) - sanyaka
    '\u0DA7'  # ට (tta)
    '\u0DA8'  # ඨ (ttha)
    '\u0DA9'  # ඩ (dda)
    '\u0DAA'  # ඪ (ddha)
    '\u0DAB'  # ණ (nna)
    '\u0DAC'  # ඬ (ndda) - sanyaka
    '\u0DAD'  # ත (ta)
    '\u0DAE'  # ථ (tha)
    '\u0DAF'  # ද (da)
    '\u0DB0'  # ධ (dha)
    '\u0DB1'  # න (na)
    '\u0DB3'  # ඳ (nda) - sanyaka
    '\u0DB4'  # ප (pa)
    '\u0DB5'  # ඵ (pha)
    '\u0DB6'  # බ (ba)
    '\u0DB7'  # භ (bha)
    '\u0DB8'  # ම (ma)
    '\u0DB9'  # ඹ (mba) - sanyaka
    '\u0DBA'  # ය (ya)
    '\u0DBB'  # ර (ra)
    '\u0DBD'  # ල (la)
    '\u0DC0'  # ව (va/wa)
    '\u0DC1'  # ශ (sha)
    '\u0DC2'  # ෂ (ssa)
    '\u0DC3'  # ස (sa)
    '\u0DC4'  # හ (ha)
    '\u0DC5'  # ළ (lla)
    '\u0DC6'  # ෆ (fa) - used for foreign words
)

# Virama (Al-Lakuna) — kills inherent vowel
VIRAMA = '\u0DCA'  # ්

# Zero-Width Joiner — forms visual conjuncts
ZWJ = '\u200D'

# Dependent vowel signs (diacritics on consonants)
# NOTE: the rare signs U+0DF2 / U+0DF3 are not listed here; they fall
# through to the catch-all token and map to UNK unless added to the vocab
# via ``build_vocab_from_corpus``.
SINHALA_VOWEL_SIGNS = (
    '\u0DCF'  # ා (aa)
    '\u0DD0'  # ැ (ae)
    '\u0DD1'  # ෑ (aae)
    '\u0DD2'  # ි (i)
    '\u0DD3'  # ී (ii)
    '\u0DD4'  # ු (u)
    '\u0DD6'  # ූ (uu)
    '\u0DD8'  # ෘ (ri)
    '\u0DD9'  # ෙ (e) — pre-base
    '\u0DDA'  # ේ (ee)
    '\u0DDB'  # ෛ (ai)
    '\u0DDC'  # ො (o)
    '\u0DDD'  # ෝ (oo)
    '\u0DDE'  # ෞ (au)
    '\u0DDF'  # ෟ (li) — rare
)

# Anusvara and Visarga
ANUSVARA = '\u0D82'  # ං
VISARGA = '\u0D83'  # ඃ

# Sinhala digits
SINHALA_DIGITS = ''.join(chr(c) for c in range(0x0DE6, 0x0DF0))  # ෦-෯

# Sets for fast lookup
VOWEL_SET = set(SINHALA_VOWELS)
CONSONANT_SET = set(SINHALA_CONSONANTS)
VOWEL_SIGN_SET = set(SINHALA_VOWEL_SIGNS)
MODIFIER_SET = VOWEL_SIGN_SET | {ANUSVARA, VISARGA, VIRAMA}

# Punctuation that receives its own vocabulary ID.
# NOTE: the backslash is part of this set (it is not returned by
# ``get_punctuations_string``); it is kept so that existing IDs stay stable.
_VOCAB_PUNCTUATION = '!?.,;:\'"\\-'

# Single-pass character mapping used by ``SinhalaTokenizer.normalize``.
# Explicit escapes are used on purpose: literal curly quotes are easily
# flattened to ASCII by editors/pipelines, which silently turns the
# replacement into a no-op.
_NORMALIZE_TABLE = str.maketrans({
    '\u200C': None,  # ZWNJ is dropped (ZWJ is kept — it forms conjuncts)
    '\u201C': '"',   # left double quotation mark
    '\u201D': '"',   # right double quotation mark
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark
    ';': ',',
    ':': ',',
    '(': None,
    ')': None,
})

# ============================================================
# Regex pattern for Sinhala grapheme clusters (aksharas)
# ============================================================
# An akshara is:
#   (Consonant (Virama ZWJ? Consonant)*) (VowelSign | Virama)? (Anusvara|Visarga)?
#   | IndependentVowel (Anusvara|Visarga)?
#   | Anusvara | Visarga (standalone)

_C = f'[{SINHALA_CONSONANTS}]'    # consonant
_V = f'[{SINHALA_VOWELS}]'        # independent vowel
_VS = f'[{SINHALA_VOWEL_SIGNS}]'  # vowel sign
_VIR = re.escape(VIRAMA)
_ZWJ = re.escape(ZWJ)
_ANU = re.escape(ANUSVARA)
_VIS = re.escape(VISARGA)
_MOD = f'[{_ANU}{_VIS}]'          # anusvara or visarga

# Conjunct consonant: virama + optional ZWJ + consonant
_CONJUNCT = f'{_VIR}{_ZWJ}?{_C}'

# Full akshara pattern. The seven capturing groups are, in order:
# consonant cluster, independent vowel, standalone modifier, number,
# punctuation, space run, any other single character.
AKSHARA_PATTERN = re.compile(
    # Consonant cluster. A virama that is NOT followed by a consonant
    # (vowel-less final consonant such as "ස්") stays attached to its base
    # instead of being emitted as a dangling combining mark.
    f'({_C}(?:{_CONJUNCT})*(?:{_VS}|{_VIR})?{_MOD}?)'
    f'|({_V}{_MOD}?)'          # independent vowel
    f'|({_MOD})'               # standalone modifier
    f'|([0-9{SINHALA_DIGITS}]+)'  # number
    f'|([!?.,;:\'"\\-–—…])'    # punctuation
    f'|( +)'                   # space(s)
    f'|(.)',                   # anything else (1 char)
    re.UNICODE,
)


class SinhalaTokenizer:
    """
    Sinhala grapheme (akshara) tokenizer for TTS.

    Designed for Coqui-TTS FastPitch training.
    Maps Sinhala text → grapheme cluster sequence → integer IDs.

    Attributes set by the constructor:
        token2id:   token string → integer ID
        id2token:   integer ID → token string
        vocab_size: number of entries in ``token2id``
    """

    # Special tokens. They MUST be distinct strings: each one needs its own
    # vocabulary ID (0-4).
    # NOTE(review): these literals were empty strings in the previous
    # revision (angle-bracket tags appear to have been stripped), which made
    # all five collide on a single ID. The exact spellings below are assumed
    # — confirm against any previously saved vocab / Coqui config.
    PAD = "<PAD>"
    BOS = "<BOS>"
    EOS = "<EOS>"
    UNK = "<UNK>"
    BLANK = "<BLANK>"
    SPACE = " "

    def __init__(self, vocab_path: Optional[str] = None):
        """
        Initialize tokenizer.

        If ``vocab_path`` is provided and the file exists, the vocabulary is
        loaded from it. Otherwise (no path, or path does not exist) the
        default vocabulary is built from the Sinhala Unicode block.
        """
        if vocab_path and Path(vocab_path).exists():
            self.load_vocab(vocab_path)
        else:
            self._build_default_vocab()

    def _build_default_vocab(self):
        """Build vocabulary from Sinhala Unicode block + common tokens.

        IDs are assigned densely in this order: special tokens (0-4), space,
        punctuation, independent vowels, consonants, vowel signs, modifiers
        (virama, ZWJ, anusvara, visarga), ASCII digits, Sinhala digits.
        """
        ordered_tokens = [
            self.PAD, self.BOS, self.EOS, self.UNK, self.BLANK,
            self.SPACE,
            *_VOCAB_PUNCTUATION,
            *SINHALA_VOWELS,
            *SINHALA_CONSONANTS,
            *SINHALA_VOWEL_SIGNS,
            VIRAMA, ZWJ, ANUSVARA, VISARGA,
            *'0123456789',
            *SINHALA_DIGITS,
        ]
        # dict.fromkeys keeps first-seen order and drops any duplicate, so
        # every token gets exactly one ID and IDs have no gaps.
        unique_tokens = dict.fromkeys(ordered_tokens)

        self.token2id: Dict[str, int] = {
            token: idx for idx, token in enumerate(unique_tokens)
        }
        self.id2token: Dict[int, str] = {
            idx: token for token, idx in self.token2id.items()
        }
        self.vocab_size = len(self.token2id)

    def normalize(self, text: str) -> str:
        """
        Normalize Sinhala text for TTS.

        - NFC normalization
        - Remove ZWNJ (keep ZWJ)
        - Normalize punctuation: curly quotes → straight quotes,
          ``;`` and ``:`` → ``,``, parentheses removed
        - Collapse whitespace (and trim both ends)
        """
        text = unicodedata.normalize('NFC', text)
        text = text.translate(_NORMALIZE_TABLE)
        # split()/join collapses every whitespace run to one space and
        # already drops leading/trailing whitespace.
        return ' '.join(text.split())

    def tokenize(self, text: str, normalize: bool = True) -> List[str]:
        """
        Tokenize text into grapheme clusters.

        Args:
            text: input text.
            normalize: when True (default) ``normalize`` is applied first.

        Returns:
            List of tokens (aksharas, digit runs, punctuation, space runs,
            and single characters for anything else).
        """
        if normalize:
            text = self.normalize(text)
        return [
            match.group(0)
            for match in AKSHARA_PATTERN.finditer(text)
            if match.group(0)
        ]

    def encode(self, text: str, add_bos: bool = True,
               add_eos: bool = True) -> List[int]:
        """
        Encode text to integer IDs.

        For multi-codepoint aksharas (conjuncts), each codepoint gets
        its own ID. This is by design — FastPitch's character embedding
        handles the sequence, and the attention/aligner learns the
        mapping to mel frames.

        Args:
            text: input text; normalized first when it is a ``str``.
            add_bos: prepend the BOS ID.
            add_eos: append the EOS ID.

        Returns:
            List of IDs; characters missing from the vocabulary map to the
            UNK ID.
        """
        if isinstance(text, str):
            text = self.normalize(text)

        unk_id = self.token2id[self.UNK]
        ids: List[int] = []
        if add_bos:
            ids.append(self.token2id[self.BOS])
        ids.extend(self.token2id.get(char, unk_id) for char in text)
        if add_eos:
            ids.append(self.token2id[self.EOS])
        return ids

    def decode(self, ids: List[int], strip_special: bool = True) -> str:
        """Decode integer IDs back to text.

        IDs that are not in the vocabulary decode to the UNK token. When
        ``strip_special`` is True, PAD/BOS/EOS/BLANK/UNK are omitted from
        the result.
        """
        specials = {self.PAD, self.BOS, self.EOS, self.BLANK, self.UNK}
        pieces = []
        for idx in ids:
            token = self.id2token.get(idx, self.UNK)
            if strip_special and token in specials:
                continue
            pieces.append(token)
        return ''.join(pieces)

    def get_characters_string(self) -> str:
        """
        Get all characters as a single string for Coqui-TTS config.

        Returns every single-character vocabulary token in ID order,
        excluding special tokens, the space and punctuation.

        Usage in Coqui config:
            config.characters.characters = tokenizer.get_characters_string()
        """
        excluded = {
            self.PAD, self.BOS, self.EOS, self.UNK, self.BLANK, self.SPACE,
            *_VOCAB_PUNCTUATION,
        }
        by_id = sorted(self.token2id.items(), key=lambda item: item[1])
        return ''.join(
            token
            for token, _ in by_id
            if len(token) == 1 and token not in excluded  # single chars only
        )

    def get_punctuations_string(self) -> str:
        """Get punctuation characters (including space) for Coqui-TTS config."""
        return '!?.,;:\'"- '

    def save_vocab(self, path: str):
        """Save vocabulary to JSON (UTF-8, non-ASCII kept readable)."""
        data = {
            "token2id": self.token2id,
            "vocab_size": self.vocab_size,
            "description": "Sinhala TTS grapheme tokenizer vocabulary",
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def load_vocab(self, path: str):
        """Load vocabulary from a JSON file written by ``save_vocab``.

        Only the ``"token2id"`` entry is read; ``id2token`` and
        ``vocab_size`` are rebuilt from it.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.token2id = data["token2id"]
        self.id2token = {int(v): k for k, v in self.token2id.items()}
        self.vocab_size = len(self.token2id)

    def expand_vocab(self, new_tokens: List[str]):
        """Add new tokens to vocabulary (e.g., from training data).

        Tokens already present are skipped. New IDs continue after the
        highest ID currently in use, so they can never collide with an
        existing entry even if the loaded vocabulary has gaps.
        """
        next_id = max(self.id2token, default=-1) + 1
        for token in new_tokens:
            if token in self.token2id:
                continue
            self.token2id[token] = next_id
            self.id2token[next_id] = token
            next_id += 1
        self.vocab_size = len(self.token2id)

    def build_vocab_from_corpus(self, texts: List[str]):
        """
        Scan corpus and add any characters not in default vocab.

        Call this after loading your training data to ensure
        all characters are covered. Each text is normalized before it is
        scanned; new characters are added in sorted order.
        """
        unseen = {
            char
            for text in texts
            for char in self.normalize(text)
            if char not in self.token2id
        }
        if unseen:
            print(f"Adding {len(unseen)} new characters to vocab: {unseen}")
            self.expand_vocab(sorted(unseen))

    def __len__(self):
        return self.vocab_size

    def __repr__(self):
        return (f"SinhalaTokenizer(vocab_size={self.vocab_size}, "
                f"vowels={len(SINHALA_VOWELS)}, "
                f"consonants={len(SINHALA_CONSONANTS)}, "
                f"vowel_signs={len(SINHALA_VOWEL_SIGNS)})")


# ============================================================
# Utility functions
# ============================================================

def syllabify(text: str) -> List[str]:
    """
    Quick syllabification: split Sinhala text into aksharas.

    Standalone function (no tokenizer instance needed). Only NFC
    normalization is applied; whitespace-only tokens are dropped.
    """
    text = unicodedata.normalize('NFC', text)
    return [
        match.group(0)
        for match in AKSHARA_PATTERN.finditer(text)
        if match.group(0).strip()
    ]


def count_aksharas(text: str) -> int:
    """Count number of aksharas (grapheme clusters) in text."""
    return len(syllabify(text))


def is_sinhala(text: str) -> bool:
    """Check if text is predominantly Sinhala.

    Returns True when more than half of the non-whitespace characters lie
    in U+0D80–U+0DFF; False for empty or whitespace-only input.
    """
    sinhala_chars = sum(1 for c in text if '\u0D80' <= c <= '\u0DFF')
    total_chars = sum(1 for c in text if not c.isspace())
    if total_chars == 0:
        return False
    return sinhala_chars / total_chars > 0.5


def get_coqui_characters_config() -> dict:
    """
    Get character config dict for Coqui-TTS FastPitch.

    Usage:
        from TTS.tts.configs.shared_configs import CharactersConfig
        char_config = CharactersConfig(**get_coqui_characters_config())
    """
    tok = SinhalaTokenizer()
    return {
        "pad": SinhalaTokenizer.PAD,
        "eos": SinhalaTokenizer.EOS,
        "bos": SinhalaTokenizer.BOS,
        "blank": SinhalaTokenizer.BLANK,
        "characters": tok.get_characters_string(),
        "punctuations": tok.get_punctuations_string(),
        "phonemes": None,
        "is_unique": True,
    }


# ============================================================
# CLI: test the tokenizer
# ============================================================

def _main():
    """Run a small round-trip demo and write ``sinhala_vocab.json``."""
    tok = SinhalaTokenizer()
    print(f"Tokenizer: {tok}")
    print(f"Vocab size: {tok.vocab_size}")
    print()

    # Test sentences. The ZWJ inside conjuncts is written as an explicit
    # \u200D escape so the invisible character cannot be lost in editing.
    test_texts = [
        "ශ්\u200Dරී ලංකාව",                # Sri Lanka (with conjunct)
        "මෙය උදාහරණ වාක්\u200Dයයකි.",      # "This is an example sentence."
        "සිංහල භාෂාව ඉතා සුන්දරයි!",         # "Sinhala language is very beautiful!"
        "ක්\u200Dරිකට් ක්\u200Dරීඩාව",      # "Cricket sport" (conjuncts)
        "බුද්ධ ශාසනය",                     # "Buddhist dispensation"
        "123 දවස්",                        # "123 days" (mixed digits)
    ]

    for text in test_texts:
        print(f"Input: {text}")
        tokens = tok.tokenize(text)
        print(f"Aksharas: {tokens}")
        ids = tok.encode(text)
        print(f"IDs: {ids}")
        decoded = tok.decode(ids)
        print(f"Decoded: {decoded}")

        # Verify round-trip
        normalized = tok.normalize(text)
        match = "✓" if decoded == normalized else "✗"
        print(f"Match: {match}")
        print()

    # Print character set for Coqui config
    print("=" * 60)
    print("FOR COQUI-TTS CONFIG:")
    print("=" * 60)
    config = get_coqui_characters_config()
    print(f"characters: {repr(config['characters'])}")
    print(f"punctuations: {repr(config['punctuations'])}")
    print(f"Total unique characters: {len(config['characters'])}")

    # Save vocab
    vocab_path = "sinhala_vocab.json"
    tok.save_vocab(vocab_path)
    print(f"\nVocab saved to: {vocab_path}")


if __name__ == "__main__":
    _main()