""" Sandhi Engine for Panini Tokenizer V4 Generates pre-sandhi hypotheses for Sanskrit compound splitting. Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation. Uses table-driven design for maintainability. """ from typing import List, Tuple, Generator class SandhiEngine: """ Generates pre-sandhi hypotheses for Sanskrit compound splitting. Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation. """ def __init__(self): # ac-sandhi (vowel merger) tables # Key = surface char, Value = list of (left_end, right_start) pairs self.VOWEL_SPLITS = { # Guṇa 'e': [('a', 'i'), ('A', 'i'), ('a', 'I'), ('A', 'I')], 'o': [('a', 'u'), ('A', 'u'), ('a', 'U'), ('A', 'U')], 'ar': [('a', 'f'), ('A', 'f'), ('a', 'F'), ('A', 'F')], # maharzi -> mahA + fzi # Vṛddhi 'E': [('a', 'e'), ('A', 'e'), ('a', 'E'), ('A', 'E')], # ai 'O': [('a', 'o'), ('A', 'o'), ('a', 'O'), ('A', 'O')], # au # Dīrgha (savarṇa dīrgha) - critical for long vowel restoration 'A': [('a', 'a'), ('a', 'A'), ('A', 'a'), ('A', 'A')], 'I': [('i', 'i'), ('i', 'I'), ('I', 'i'), ('I', 'I')], 'U': [('u', 'u'), ('u', 'U'), ('U', 'u'), ('U', 'U')], } # Consonant categories self.VOICED = set(['g', 'G', 'j', 'J', 'd', 'D', 'b', 'B', 'n', 'N', 'm', 'y', 'r', 'l', 'v', 'h']) self.HARD = set(['k', 'K', 'c', 'C', 't', 'T', 'w', 'W', 'p', 'P', 'S', 's']) def generate_splits(self, word: str, i: int) -> Generator[Tuple[str, str], None, None]: """ Yields (left, right) tuples for a split AT index i. i is the index of the character being considered as the 'pivot'. """ if i < 1 or i >= len(word): return char = word[i] # 1. Default: hard cut (no sandhi) # Split BEFORE char: word[:i] | word[i:] yield (word[:i], word[i:]) # 2. Vowel coalescence (the char IS the result of merger) # e.g. gaṇ[e]śa -> left ends with 'a', right starts with 'i' if char in self.VOWEL_SPLITS: for left_end, right_start in self.VOWEL_SPLITS[char]: # Replace char at i with the split pair yield (word[:i] + left_end, right_start + word[i+1:]) # 3. Yān sandhi (y -> i/I, v -> u/U) # e.g. praty[e]kam -> prati + ekam # CAUTION: Yān happens BEFORE a vowel, check word[i+1] if i + 1 < len(word): next_char = word[i+1] if char == 'y': # y -> i/I for v in ['i', 'I']: yield (word[:i] + v, word[i+1:]) elif char == 'v': # v -> u/U for v in ['u', 'U']: yield (word[:i] + v, word[i+1:]) # 4. Visarga sandhi restoration # 'o' before voiced consonant -> 'aH' if char == 'o' and i + 1 < len(word): if word[i+1] in self.VOICED: yield (word[:i] + "aH", word[i+1:]) # 'r' before voiced -> 'H' (punarjanma -> punaH + janma) if char == 'r' and i + 1 < len(word): if word[i+1] in self.VOICED: yield (word[:i] + "H", word[i+1:]) # 's'/'S' before hard consonant -> 'H' if char in ['s', 'S'] and i + 1 < len(word): if word[i+1] in self.HARD: yield (word[:i] + "H", word[i+1:]) # --- TEST --- if __name__ == "__main__": engine = SandhiEngine() print("Testing SandhiEngine...") test_cases = [ ("gaReSa", 3), # e: should yield gaRa + iSa ("devendra", 3), # e: should yield deva + indra ("rAmo", 3), # o: should yield rAmaH before voiced ("punarjanma", 4), # r: should yield punaH + janma ] for word, pos in test_cases: print(f"\n {word} at pos {pos}:") for left, right in engine.generate_splits(word, pos): print(f" {left} | {right}")