#!/usr/bin/env python3
"""
=============================================================
Sinhala Grapheme Tokenizer for TTS
=============================================================
Tokenizes Sinhala text into orthographic syllables (aksharas)
for FastPitch character-level TTS training.

Sinhala is a Brahmic abugida (U+0D80–U+0DFF):
  - Consonants carry an inherent /a/ vowel
  - Al-lakuna (virama) ් suppresses the inherent vowel
  - ZWJ (U+200D) forms conjunct consonants: ක් + ZWJ + ෂ → ක්‍ෂ
  - Vowel diacritics modify the inherent vowel

An akshara (grapheme cluster) is the minimal unit:
  - Independent vowel: අ, ආ, ඉ, ඊ, ...
  - Consonant + optional vowel sign: ක, කා, කි, ...
  - Conjunct: consonant + virama + ZWJ + consonant + vowel sign

Why aksharas (not phonemes):
  IndicTTS (arxiv:2211.09536) proved character-level input works
  for all 13 Indic languages tested. No phoneme dictionary needed.

Usage:
    from sinhala_tokenizer import SinhalaTokenizer
    tok = SinhalaTokenizer()
    tokens = tok.tokenize("ශ්‍රී ලංකාව")
    ids = tok.encode("ශ්‍රී ලංකාව")
    text = tok.decode(ids)

=============================================================
"""

import json
import re
import unicodedata
from pathlib import Path
from typing import Dict, List, Optional, Tuple


# ============================================================
# Unicode Constants for Sinhala (U+0D80–U+0DFF)
# ============================================================

# Independent vowels (can stand alone)
SINHALA_VOWELS = (
    '\u0D85'  # අ (a)
    '\u0D86'  # ආ (aa)
    '\u0D87'  # ඇ (ae)
    '\u0D88'  # ඈ (aae)
    '\u0D89'  # ඉ (i)
    '\u0D8A'  # ඊ (ii)
    '\u0D8B'  # උ (u)
    '\u0D8C'  # ඌ (uu)
    '\u0D8D'  # ඍ (ri)
    '\u0D8E'  # ඎ (rii) - rare
    '\u0D8F'  # ඏ (li) - rare
    '\u0D90'  # ඐ (lii) - rare
    '\u0D91'  # එ (e)
    '\u0D92'  # ඒ (ee)
    '\u0D93'  # ඓ (ai)
    '\u0D94'  # ඔ (o)
    '\u0D95'  # ඕ (oo)
    '\u0D96'  # ඖ (au)
)

# Consonants
SINHALA_CONSONANTS = (
    '\u0D9A'  # ක (ka)
    '\u0D9B'  # ඛ (kha)
    '\u0D9C'  # ග (ga)
    '\u0D9D'  # ඝ (gha)
    '\u0D9E'  # ඞ (nga)
    '\u0D9F'  # ඟ (nnga) - sanyaka (prenasalized)
    '\u0DA0'  # ච (cha)
    '\u0DA1'  # ඡ (chha)
    '\u0DA2'  # ජ (ja)
    '\u0DA3'  # ඣ (jha)
    '\u0DA4'  # ඤ (nya)
    '\u0DA5'  # ඥ (jnya)
    '\u0DA6'  # ඦ (nyja) - sanyaka
    '\u0DA7'  # ට (tta)
    '\u0DA8'  # ඨ (ttha)
    '\u0DA9'  # ඩ (dda)
    '\u0DAA'  # ඪ (ddha)
    '\u0DAB'  # ණ (nna)
    '\u0DAC'  # ඬ (ndda) - sanyaka
    '\u0DAD'  # ත (ta)
    '\u0DAE'  # ථ (tha)
    '\u0DAF'  # ද (da)
    '\u0DB0'  # ධ (dha)
    '\u0DB1'  # න (na)
    '\u0DB3'  # ඳ (nda) - sanyaka
    '\u0DB4'  # ප (pa)
    '\u0DB5'  # ඵ (pha)
    '\u0DB6'  # බ (ba)
    '\u0DB7'  # භ (bha)
    '\u0DB8'  # ම (ma)
    '\u0DB9'  # ඹ (mba) - sanyaka
    '\u0DBA'  # ය (ya)
    '\u0DBB'  # ර (ra)
    '\u0DBD'  # ල (la)
    '\u0DC0'  # ව (va/wa)
    '\u0DC1'  # ශ (sha)
    '\u0DC2'  # ෂ (ssa)
    '\u0DC3'  # ස (sa)
    '\u0DC4'  # හ (ha)
    '\u0DC5'  # ළ (lla)
    '\u0DC6'  # ෆ (fa) - used for foreign words
)

# Virama (Al-Lakuna) — kills inherent vowel
VIRAMA = '\u0DCA'  # ්

# Zero-Width Joiner — forms visual conjuncts
ZWJ = '\u200D'

# Dependent vowel signs (diacritics on consonants)
SINHALA_VOWEL_SIGNS = (
    '\u0DCF'  # ා (aa)
    '\u0DD0'  # ැ (ae)
    '\u0DD1'  # ෑ (aae)
    '\u0DD2'  # ි (i)
    '\u0DD3'  # ී (ii)
    '\u0DD4'  # ු (u)
    '\u0DD6'  # ූ (uu)
    '\u0DD8'  # ෘ (ri)
    '\u0DD9'  # ෙ (e) — pre-base
    '\u0DDA'  # ේ (ee)
    '\u0DDB'  # ෛ (ai)
    '\u0DDC'  # ො (o)
    '\u0DDD'  # ෝ (oo)
    '\u0DDE'  # ෞ (au)
    '\u0DDF'  # ෟ (li) — rare
)

# Anusvara and Visarga
ANUSVARA = '\u0D82'  # ං
VISARGA = '\u0D83'   # ඃ

# Sinhala digits
SINHALA_DIGITS = ''.join(chr(c) for c in range(0x0DE6, 0x0DF0))  # ෦-෯

# Sets for fast lookup
VOWEL_SET = set(SINHALA_VOWELS)
CONSONANT_SET = set(SINHALA_CONSONANTS)
VOWEL_SIGN_SET = set(SINHALA_VOWEL_SIGNS)
MODIFIER_SET = VOWEL_SIGN_SET | {ANUSVARA, VISARGA, VIRAMA}


# ============================================================
# Regex pattern for Sinhala grapheme clusters (aksharas)
# ============================================================
# An akshara is:
#   (Consonant (Virama ZWJ? Consonant)*)  VowelSign?  (Anusvara|Visarga)?
#   | IndependentVowel (Anusvara|Visarga)?
#   | Anusvara | Visarga  (standalone)

_C = f'[{"".join(SINHALA_CONSONANTS)}]'   # consonant
_V = f'[{"".join(SINHALA_VOWELS)}]'       # independent vowel
_VS = f'[{"".join(SINHALA_VOWEL_SIGNS)}]' # vowel sign
_VIR = re.escape(VIRAMA)
_ZWJ = re.escape(ZWJ)
_ANU = re.escape(ANUSVARA)
_VIS = re.escape(VISARGA)

# Conjunct consonant: virama + optional ZWJ + consonant
_CONJUNCT = f'{_VIR}{_ZWJ}?{_C}'

# Full akshara pattern
AKSHARA_PATTERN = re.compile(
    f'({_C}(?:{_CONJUNCT})*{_VS}?[{re.escape(ANUSVARA)}{re.escape(VISARGA)}]?)'  # consonant cluster
    f'|({_V}[{re.escape(ANUSVARA)}{re.escape(VISARGA)}]?)'                         # independent vowel
    f'|([{re.escape(ANUSVARA)}{re.escape(VISARGA)}])'                              # standalone modifier
    f'|([0-9{SINHALA_DIGITS}]+)'                                                    # number
    f'|([!?.,;:\'"\\-–—…])'                                                        # punctuation
    f'|( +)'                                                                        # space(s)
    f'|(.)',                                                                        # anything else (1 char)
    re.UNICODE
)


class SinhalaTokenizer:
    """
    Sinhala grapheme (akshara) tokenizer for TTS.

    Designed for Coqui-TTS FastPitch training.
    Maps Sinhala text → grapheme cluster sequence → integer IDs.
    """

    # Special tokens
    PAD = "<PAD>"
    BOS = "<BOS>"
    EOS = "<EOS>"
    UNK = "<UNK>"
    BLANK = "<BLNK>"
    SPACE = " "

    def __init__(self, vocab_path: Optional[str] = None):
        """
        Initialize tokenizer.

        If vocab_path is provided, load existing vocabulary.
        Otherwise, build default vocabulary from Unicode block.
        """
        if vocab_path and Path(vocab_path).exists():
            self.load_vocab(vocab_path)
        else:
            self._build_default_vocab()

    def _build_default_vocab(self):
        """Build vocabulary from Sinhala Unicode block + common tokens."""
        self.token2id: Dict[str, int] = {}
        self.id2token: Dict[int, str] = {}

        # Special tokens first (indices 0-4)
        specials = [self.PAD, self.BOS, self.EOS, self.UNK, self.BLANK]
        for i, tok in enumerate(specials):
            self.token2id[tok] = i
            self.id2token[i] = tok

        idx = len(specials)

        # Space
        self.token2id[self.SPACE] = idx
        self.id2token[idx] = self.SPACE
        idx += 1

        # Punctuation
        for p in '!?.,;:\'"\\-':
            self.token2id[p] = idx
            self.id2token[idx] = p
            idx += 1

        # Independent vowels
        for v in SINHALA_VOWELS:
            self.token2id[v] = idx
            self.id2token[idx] = v
            idx += 1

        # Consonants
        for c in SINHALA_CONSONANTS:
            self.token2id[c] = idx
            self.id2token[idx] = c
            idx += 1

        # Vowel signs
        for vs in SINHALA_VOWEL_SIGNS:
            self.token2id[vs] = idx
            self.id2token[idx] = vs
            idx += 1

        # Modifiers
        for m in [VIRAMA, ZWJ, ANUSVARA, VISARGA]:
            if m not in self.token2id:
                self.token2id[m] = idx
                self.id2token[idx] = m
                idx += 1

        # Arabic digits
        for d in '0123456789':
            if d not in self.token2id:
                self.token2id[d] = idx
                self.id2token[idx] = d
                idx += 1

        # Sinhala digits
        for d in SINHALA_DIGITS:
            if d not in self.token2id:
                self.token2id[d] = idx
                self.id2token[idx] = d
                idx += 1

        self.vocab_size = len(self.token2id)

    def normalize(self, text: str) -> str:
        """
        Normalize Sinhala text for TTS.

        - NFC normalization
        - Remove ZWNJ (keep ZWJ)
        - Normalize punctuation
        - Collapse whitespace
        """
        text = unicodedata.normalize('NFC', text)
        text = text.replace('\u200C', '')  # Remove ZWNJ
        text = text.replace('"', '"').replace('"', '"')
        text = text.replace(''', "'").replace(''', "'")
        text = text.replace(';', ',').replace(':', ',')
        text = text.replace('(', '').replace(')', '')
        text = ' '.join(text.split())
        return text.strip()

    def tokenize(self, text: str, normalize: bool = True) -> List[str]:
        """
        Tokenize text into grapheme clusters.

        Returns list of tokens (aksharas, punctuation, spaces).
        """
        if normalize:
            text = self.normalize(text)

        tokens = []
        for match in AKSHARA_PATTERN.finditer(text):
            token = match.group(0)
            if token:
                tokens.append(token)

        return tokens

    def encode(self, text: str, add_bos: bool = True, add_eos: bool = True) -> List[int]:
        """
        Encode text to integer IDs.

        For multi-codepoint aksharas (conjuncts), each codepoint gets its own ID.
        This is by design — FastPitch's character embedding handles the sequence,
        and the attention/aligner learns the mapping to mel frames.
        """
        if isinstance(text, str):
            text = self.normalize(text)

        ids = []
        if add_bos:
            ids.append(self.token2id[self.BOS])

        for char in text:
            if char in self.token2id:
                ids.append(self.token2id[char])
            else:
                ids.append(self.token2id[self.UNK])

        if add_eos:
            ids.append(self.token2id[self.EOS])

        return ids

    def decode(self, ids: List[int], strip_special: bool = True) -> str:
        """Decode integer IDs back to text."""
        specials = {self.PAD, self.BOS, self.EOS, self.BLANK, self.UNK}
        chars = []
        for idx in ids:
            token = self.id2token.get(idx, self.UNK)
            if strip_special and token in specials:
                continue
            chars.append(token)
        return ''.join(chars)

    def get_characters_string(self) -> str:
        """
        Get all characters as a single string for Coqui-TTS config.

        Usage in Coqui config:
            config.characters.characters = tokenizer.get_characters_string()
        """
        # All Sinhala characters except specials, space, and punctuation
        chars = []
        for token, idx in sorted(self.token2id.items(), key=lambda x: x[1]):
            if token in {self.PAD, self.BOS, self.EOS, self.UNK, self.BLANK}:
                continue
            if token == self.SPACE:
                continue
            if token in '!?.,;:\'"\\-':
                continue
            if len(token) == 1:  # single characters only
                chars.append(token)
        return ''.join(chars)

    def get_punctuations_string(self) -> str:
        """Get punctuation characters for Coqui-TTS config."""
        return '!?.,;:\'"- '

    def save_vocab(self, path: str):
        """Save vocabulary to JSON."""
        data = {
            "token2id": self.token2id,
            "vocab_size": self.vocab_size,
            "description": "Sinhala TTS grapheme tokenizer vocabulary",
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def load_vocab(self, path: str):
        """Load vocabulary from JSON."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.token2id = data["token2id"]
        self.id2token = {int(v): k for k, v in self.token2id.items()}
        self.vocab_size = len(self.token2id)

    def expand_vocab(self, new_tokens: List[str]):
        """Add new tokens to vocabulary (e.g., from training data)."""
        for token in new_tokens:
            if token not in self.token2id:
                idx = len(self.token2id)
                self.token2id[token] = idx
                self.id2token[idx] = token
        self.vocab_size = len(self.token2id)

    def build_vocab_from_corpus(self, texts: List[str]):
        """
        Scan corpus and add any characters not in default vocab.

        Call this after loading your training data to ensure
        all characters are covered.
        """
        unseen = set()
        for text in texts:
            text = self.normalize(text)
            for char in text:
                if char not in self.token2id:
                    unseen.add(char)

        if unseen:
            print(f"Adding {len(unseen)} new characters to vocab: {unseen}")
            self.expand_vocab(sorted(unseen))

    def __len__(self):
        return self.vocab_size

    def __repr__(self):
        return (f"SinhalaTokenizer(vocab_size={self.vocab_size}, "
                f"vowels={len(SINHALA_VOWELS)}, "
                f"consonants={len(SINHALA_CONSONANTS)}, "
                f"vowel_signs={len(SINHALA_VOWEL_SIGNS)})")


# ============================================================
# Utility functions
# ============================================================

def syllabify(text: str) -> List[str]:
    """
    Quick syllabification: split Sinhala text into aksharas.

    Standalone function (no tokenizer instance needed).
    """
    text = unicodedata.normalize('NFC', text)
    syllables = []
    for match in AKSHARA_PATTERN.finditer(text):
        token = match.group(0)
        if token and token.strip():
            syllables.append(token)
    return syllables


def count_aksharas(text: str) -> int:
    """Count number of aksharas (grapheme clusters) in text."""
    return len(syllabify(text))


def is_sinhala(text: str) -> bool:
    """Check if text is predominantly Sinhala."""
    sinhala_chars = sum(1 for c in text if '\u0D80' <= c <= '\u0DFF')
    total_chars = sum(1 for c in text if not c.isspace())
    if total_chars == 0:
        return False
    return sinhala_chars / total_chars > 0.5


def get_coqui_characters_config() -> dict:
    """
    Get character config dict for Coqui-TTS FastPitch.

    Usage:
        from TTS.tts.configs.shared_configs import CharactersConfig
        char_config = CharactersConfig(**get_coqui_characters_config())
    """
    tok = SinhalaTokenizer()
    return {
        "pad": SinhalaTokenizer.PAD,
        "eos": SinhalaTokenizer.EOS,
        "bos": SinhalaTokenizer.BOS,
        "blank": SinhalaTokenizer.BLANK,
        "characters": tok.get_characters_string(),
        "punctuations": tok.get_punctuations_string(),
        "phonemes": None,
        "is_unique": True,
    }


# ============================================================
# CLI: test the tokenizer
# ============================================================
if __name__ == "__main__":
    tok = SinhalaTokenizer()
    print(f"Tokenizer: {tok}")
    print(f"Vocab size: {tok.vocab_size}")
    print()

    # Test sentences
    test_texts = [
        "ශ්‍රී ලංකාව",                    # Sri Lanka (with conjunct ශ්‍ර)
        "මෙය උදාහරණ වාක්‍යයකි.",          # "This is an example sentence."
        "සිංහල භාෂාව ඉතා සුන්දරයි!",    # "Sinhala language is very beautiful!"
        "ක්‍රිකට් ක්‍රීඩාව",                # "Cricket sport" (conjuncts)
        "බුද්ධ ශාසනය",                     # "Buddhist dispensation"
        "123 දවස්",                        # "123 days" (mixed digits)
    ]

    for text in test_texts:
        print(f"Input:    {text}")

        tokens = tok.tokenize(text)
        print(f"Aksharas: {tokens}")

        ids = tok.encode(text)
        print(f"IDs:      {ids}")

        decoded = tok.decode(ids)
        print(f"Decoded:  {decoded}")

        # Verify round-trip
        normalized = tok.normalize(text)
        match = "✓" if decoded == normalized else "✗"
        print(f"Match:    {match}")
        print()

    # Print character set for Coqui config
    print("=" * 60)
    print("FOR COQUI-TTS CONFIG:")
    print("=" * 60)
    config = get_coqui_characters_config()
    print(f"characters: {repr(config['characters'])}")
    print(f"punctuations: {repr(config['punctuations'])}")
    print(f"Total unique characters: {len(config['characters'])}")

    # Save vocab
    vocab_path = "sinhala_vocab.json"
    tok.save_vocab(vocab_path)
    print(f"\nVocab saved to: {vocab_path}")