|
|
"""
|
|
|
Sandhi Engine for Panini Tokenizer V4
|
|
|
Generates pre-sandhi hypotheses for Sanskrit compound splitting.
|
|
|
Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation.
|
|
|
|
|
|
Uses table-driven design for maintainability.
|
|
|
"""
|
|
|
|
|
|
from typing import List, Tuple, Generator
|
|
|
|
|
|
|
|
|
class SandhiEngine:
    """Generate pre-sandhi hypotheses for Sanskrit compound splitting.

    Handles vowel coalescence (ac-sandhi) and visarga/consonant
    assimilation reversal.  Table-driven: each surface form maps to the
    candidate (left-final, right-initial) pairs that could have produced
    it under sandhi.

    NOTE(review): the transliteration looks like SLP1 (A/I/U = long
    vowels, f/F = vocalic r, w/W = retroflex stops) -- confirm against
    the tokenizer's encoding.
    """

    def __init__(self) -> None:
        # Surface vowel -> candidate (left-final, right-initial) pairs
        # that coalesce into it.
        self.VOWEL_SPLITS = {
            # guna results: a/A + i/I -> e, a/A + u/U -> o, a/A + f/F -> ar
            'e': [('a', 'i'), ('A', 'i'), ('a', 'I'), ('A', 'I')],
            'o': [('a', 'u'), ('A', 'u'), ('a', 'U'), ('A', 'U')],
            'ar': [('a', 'f'), ('A', 'f'), ('a', 'F'), ('A', 'F')],
            # vrddhi results: a/A + e/E -> E, a/A + o/O -> O
            'E': [('a', 'e'), ('A', 'e'), ('a', 'E'), ('A', 'E')],
            'O': [('a', 'o'), ('A', 'o'), ('a', 'O'), ('A', 'O')],
            # savarna-dirgha: like simple vowels merge into the long vowel
            'A': [('a', 'a'), ('a', 'A'), ('A', 'a'), ('A', 'A')],
            'I': [('i', 'i'), ('i', 'I'), ('I', 'i'), ('I', 'I')],
            'U': [('u', 'u'), ('u', 'U'), ('U', 'u'), ('U', 'U')],
        }

        # Consonants treated as voiced for visarga reversal (-o/-r -> -H).
        self.VOICED = {'g', 'G', 'j', 'J', 'd', 'D', 'b', 'B',
                       'n', 'N', 'm', 'y', 'r', 'l', 'v', 'h'}
        # Unvoiced ("hard") consonants before which s/S reverses to H.
        self.HARD = {'k', 'K', 'c', 'C', 't', 'T', 'w', 'W', 'p', 'P', 'S', 's'}

    def generate_splits(self, word: str, i: int) -> Generator[Tuple[str, str], None, None]:
        """Yield (left, right) pre-sandhi hypotheses for a split AT index ``i``.

        ``i`` is the index of the character considered as the sandhi
        pivot.  The trivial split (no sandhi applied) is always yielded
        first; table-driven reversals follow.  Yields nothing unless
        ``1 <= i < len(word)`` (a split needs a non-empty left part).
        """
        if i < 1 or i >= len(word):
            return

        char = word[i]

        # Hypothesis 0: no sandhi occurred -- split the word as-is.
        yield (word[:i], word[i:])

        # Vowel coalescence: single-character surface forms.
        if char in self.VOWEL_SPLITS:
            for left_end, right_start in self.VOWEL_SPLITS[char]:
                yield (word[:i] + left_end, right_start + word[i+1:])

        # BUG FIX: the 'ar' table entry is a two-character key, which the
        # single-character lookup above can never match, so it was dead
        # data.  Check the digraph starting at i explicitly
        # (e.g. 'maharzi' -> 'mahA' + 'fzi').
        digraph = word[i:i + 2]
        if len(digraph) == 2 and digraph in self.VOWEL_SPLITS:
            for left_end, right_start in self.VOWEL_SPLITS[digraph]:
                yield (word[:i] + left_end, right_start + word[i + 2:])

        # Semivowel reversal before a following sound: y < i/I, v < u/U.
        if i + 1 < len(word):
            if char == 'y':
                for vowel in ('i', 'I'):
                    yield (word[:i] + vowel, word[i+1:])
            elif char == 'v':
                for vowel in ('u', 'U'):
                    yield (word[:i] + vowel, word[i+1:])

        # Visarga reversal: -o before a voiced consonant < -aH.
        if char == 'o' and i + 1 < len(word) and word[i+1] in self.VOICED:
            yield (word[:i] + "aH", word[i+1:])

        # Visarga reversal: -r before a voiced consonant < -H.
        if char == 'r' and i + 1 < len(word) and word[i+1] in self.VOICED:
            yield (word[:i] + "H", word[i+1:])

        # Sibilant assimilation: s/S before a hard consonant < H.
        if char in ('s', 'S') and i + 1 < len(word) and word[i+1] in self.HARD:
            yield (word[:i] + "H", word[i+1:])
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: print every hypothesis the engine proposes for a few
    # (word, pivot-index) pairs.
    engine = SandhiEngine()

    print("Testing SandhiEngine...")

    for word, pos in [
        ("gaReSa", 3),
        ("devendra", 3),
        ("rAmo", 3),
        ("punarjanma", 4),
    ]:
        print(f"\n {word} at pos {pos}:")
        for left, right in engine.generate_splits(word, pos):
            print(f" {left} | {right}")
|
|
|
|