Spaces:
Runtime error
Runtime error
| """ | |
| Sandhi Engine for Panini Tokenizer V4 | |
| Generates pre-sandhi hypotheses for Sanskrit compound splitting. | |
| Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation. | |
| Uses table-driven design for maintainability. | |
| """ | |
| from typing import List, Tuple, Generator | |
class SandhiEngine:
    """
    Generates pre-sandhi hypotheses for Sanskrit compound splitting.

    Handles vowel coalescence (ac-sandhi), semivowel (yaN) sandhi and
    visarga/consonant assimilation. Strings are expected in the
    one-character-per-sound transliteration used by the tables below
    (it looks like SLP1: 'A' = long a, 'E' = ai, 'O' = au, 'f' = vocalic r).

    The engine only proposes candidate (left, right) pairs; it does not
    check them against a lexicon.
    """

    def __init__(self):
        # ac-sandhi (vowel merger) tables
        # Key = surface string (one OR MORE characters),
        # Value = list of (left_end, right_start) pairs that merge into it.
        self.VOWEL_SPLITS = {
            # Guṇa
            'e': [('a', 'i'), ('A', 'i'), ('a', 'I'), ('A', 'I')],
            'o': [('a', 'u'), ('A', 'u'), ('a', 'U'), ('A', 'U')],
            'ar': [('a', 'f'), ('A', 'f'), ('a', 'F'), ('A', 'F')],  # maharzi -> mahA + fzi
            # Vṛddhi
            'E': [('a', 'e'), ('A', 'e'), ('a', 'E'), ('A', 'E')],  # ai
            'O': [('a', 'o'), ('A', 'o'), ('a', 'O'), ('A', 'O')],  # au
            # Dīrgha (savarṇa dīrgha) - critical for long vowel restoration
            'A': [('a', 'a'), ('a', 'A'), ('A', 'a'), ('A', 'A')],
            'I': [('i', 'i'), ('i', 'I'), ('I', 'i'), ('I', 'I')],
            'U': [('u', 'u'), ('u', 'U'), ('U', 'u'), ('U', 'U')],
        }
        # Vowel inventory; used to gate yaN sandhi, which only arises
        # when the semivowel is immediately followed by a vowel.
        self.VOWELS = set('aAiIuUfFxXeEoO')
        # Consonant categories
        self.VOICED = set(['g', 'G', 'j', 'J', 'd', 'D', 'b', 'B', 'n', 'N', 'm', 'y', 'r', 'l', 'v', 'h'])
        self.HARD = set(['k', 'K', 'c', 'C', 't', 'T', 'w', 'W', 'p', 'P', 'S', 's'])

    def generate_splits(self, word: str, i: int) -> Generator[Tuple[str, str], None, None]:
        """
        Yields (left, right) tuples for a split AT index i.

        i is the index of the character being considered as the 'pivot'.
        Nothing is yielded when i < 1 or i >= len(word).

        Hypotheses are produced in this order:
          1. the plain cut word[:i] | word[i:];
          2. vowel-coalescence reversals for every VOWEL_SPLITS key that
             starts at i (including multi-character keys such as 'ar');
          3. yaN reversals (y -> i/I, v -> u/U) when the next character
             is a vowel;
          4. visarga restorations ('o' / 'r' before a voiced consonant,
             's' / 'S' before a hard consonant).
        """
        if i < 1 or i >= len(word):
            return

        char = word[i]
        left_stem = word[:i]

        # 1. Default: hard cut (no sandhi)
        # Split BEFORE char: word[:i] | word[i:]
        yield (left_stem, word[i:])

        # 2. Vowel coalescence (the surface key IS the result of a merger)
        # e.g. gaR[e]Sa -> left ends with 'a', right starts with 'i'.
        # Keys are matched as prefixes at position i so that two-character
        # results such as 'ar' are reachable; the whole key is consumed.
        for surface, pairs in self.VOWEL_SPLITS.items():
            if word.startswith(surface, i):
                remainder = word[i + len(surface):]
                for left_end, right_start in pairs:
                    yield (left_stem + left_end, right_start + remainder)

        # Every remaining rule looks at the character after the pivot.
        if i + 1 >= len(word):
            return
        next_char = word[i + 1]
        tail = word[i + 1:]

        # 3. yaN sandhi (y -> i/I, v -> u/U)
        # e.g. praty[e]kam -> prati + ekam
        # The semivowel only results from yaN BEFORE a vowel, so skip it
        # when a consonant follows.
        if next_char in self.VOWELS:
            if char == 'y':  # y -> i/I
                for v in ['i', 'I']:
                    yield (left_stem + v, tail)
            elif char == 'v':  # v -> u/U
                for v in ['u', 'U']:
                    yield (left_stem + v, tail)

        # 4. Visarga sandhi restoration
        # 'o' before voiced consonant -> 'aH'
        if char == 'o' and next_char in self.VOICED:
            yield (left_stem + "aH", tail)
        # 'r' before voiced -> 'H' (punarjanma -> punaH + janma)
        if char == 'r' and next_char in self.VOICED:
            yield (left_stem + "H", tail)
        # 's'/'S' before hard consonant -> 'H'
        if char in ['s', 'S'] and next_char in self.HARD:
            yield (left_stem + "H", tail)
| # --- TEST --- | |
if __name__ == "__main__":
    # Manual smoke test: print every hypothesis the engine produces for a
    # few (word, pivot index) samples. Nothing is asserted; output is for
    # eyeballing only.
    sandhi = SandhiEngine()
    print("Testing SandhiEngine...")

    samples = (
        ("gaReSa", 3),      # pivot 'e': expect gaRa + iSa among the results
        ("devendra", 3),    # pivot 'e': expect deva + indra among the results
        ("rAmo", 3),        # pivot 'o' is word-final: only the cut and vowel splits
        ("punarjanma", 4),  # pivot 'r' before 'j': expect punaH + janma
    )

    for sample, pivot in samples:
        print(f"\n {sample} at pos {pivot}:")
        for pair in sandhi.generate_splits(sample, pivot):
            left, right = pair
            print(f" {left} | {right}")