# Source: panini-tokenizer / src / sandhi_engine.py (ArthaLabs)
# Commit 77111fb (verified): "Upload folder using huggingface_hub"
"""
Sandhi Engine for Panini Tokenizer V4
Generates pre-sandhi hypotheses for Sanskrit compound splitting.
Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation.
Uses table-driven design for maintainability.
"""
from typing import List, Tuple, Generator
class SandhiEngine:
    """
    Generates pre-sandhi hypotheses for Sanskrit compound splitting.

    Given a surface word and a pivot index, the engine proposes candidate
    (left, right) pairs obtained by undoing vowel coalescence (ac-sandhi),
    yaṇ substitution and visarga/consonant assimilation. It over-generates
    on purpose: it does not validate the candidates against a lexicon.

    NOTE(review): the character inventory (f/F, R, w/W, E/O for ai/au)
    looks like SLP1 transliteration - confirm against the callers.
    """

    # Characters treated as vowels when checking the yaṇ environment.
    # NOTE(review): assumes the SLP1 vowel inventory - confirm.
    _VOWELS = frozenset("aAiIuUfFxXeEoO")

    def __init__(self):
        # ac-sandhi (vowel merger) tables
        # Key = surface string, Value = list of (left_end, right_start) pairs.
        # Keys may be longer than one character (see 'ar').
        self.VOWEL_SPLITS = {
            # Guṇa
            'e': [('a', 'i'), ('A', 'i'), ('a', 'I'), ('A', 'I')],
            'o': [('a', 'u'), ('A', 'u'), ('a', 'U'), ('A', 'U')],
            'ar': [('a', 'f'), ('A', 'f'), ('a', 'F'), ('A', 'F')],  # maharzi -> mahA + fzi
            # Vṛddhi
            'E': [('a', 'e'), ('A', 'e'), ('a', 'E'), ('A', 'E')],  # ai
            'O': [('a', 'o'), ('A', 'o'), ('a', 'O'), ('A', 'O')],  # au
            # Dīrgha (savarṇa dīrgha) - critical for long vowel restoration
            'A': [('a', 'a'), ('a', 'A'), ('A', 'a'), ('A', 'A')],
            'I': [('i', 'i'), ('i', 'I'), ('I', 'i'), ('I', 'I')],
            'U': [('u', 'u'), ('u', 'U'), ('U', 'u'), ('U', 'U')],
        }
        # Consonant categories
        self.VOICED = {'g', 'G', 'j', 'J', 'd', 'D', 'b', 'B', 'n', 'N', 'm', 'y', 'r', 'l', 'v', 'h'}
        self.HARD = {'k', 'K', 'c', 'C', 't', 'T', 'w', 'W', 'p', 'P', 'S', 's'}

    def generate_splits(self, word: str, i: int) -> Generator[Tuple[str, str], None, None]:
        """
        Yield (left, right) hypotheses for a split AT index i.

        i is the index of the character being considered as the 'pivot'.
        Nothing is yielded when i is outside 1 .. len(word) - 1, so the
        left part of every hypothesis is non-empty.

        The first pair is always the plain cut word[:i] | word[i:]; the
        remaining pairs undo one sandhi rule each, in the order: vowel
        coalescence, yaṇ, visarga restoration.
        """
        if i < 1 or i >= len(word):
            return

        char = word[i]
        left = word[:i]
        has_next = i + 1 < len(word)
        after_pivot = word[i + 1:]

        # 1. Default: hard cut (no sandhi)
        #    Split BEFORE char: word[:i] | word[i:]
        yield (left, word[i:])

        # 2. Vowel coalescence (the surface string IS the result of merger)
        #    e.g. gaR[e]Sa -> left ends with 'a', right starts with 'i'
        #    Keys are matched with startswith so that multi-character
        #    surfaces such as 'ar' are found too; the whole key is consumed.
        for surface, pairs in self.VOWEL_SPLITS.items():
            if word.startswith(surface, i):
                remainder = word[i + len(surface):]
                for left_end, right_start in pairs:
                    yield (left + left_end, right_start + remainder)

        # 3. Yān sandhi (y -> i/I, v -> u/U)
        #    e.g. praty[e]kam -> prati + ekam
        #    Yān only arises BEFORE a vowel, so the following character
        #    must be a vowel for the hypothesis to be possible.
        if has_next and word[i + 1] in self._VOWELS:
            if char == 'y':  # y -> i/I
                for v in ('i', 'I'):
                    yield (left + v, after_pivot)
            elif char == 'v':  # v -> u/U
                for v in ('u', 'U'):
                    yield (left + v, after_pivot)

        # 4. Visarga sandhi restoration
        if has_next:
            following = word[i + 1]
            # 'o' before voiced consonant -> 'aH'
            if char == 'o' and following in self.VOICED:
                yield (left + "aH", after_pivot)
            # 'r' before voiced -> 'H' (punarjanma -> punaH + janma)
            if char == 'r' and following in self.VOICED:
                yield (left + "H", after_pivot)
            # 's'/'S' before hard consonant -> 'H'
            if char in ('s', 'S') and following in self.HARD:
                yield (left + "H", after_pivot)
# --- TEST ---
if __name__ == "__main__":
    # Manual smoke test: print every hypothesis generated for a few
    # well-known compounds so the output can be inspected by eye.
    engine = SandhiEngine()
    print("Testing SandhiEngine...")
    # Each entry is (word, pivot index passed to generate_splits).
    test_cases = [
        ("gaReSa", 3),      # e: should yield gaRa + iSa
        ("devendra", 3),    # e: should yield deva + indra
        # The visarga rule needs a voiced consonant AFTER the 'o', so the
        # pivot must not be the last character of the word.
        ("manoraTa", 3),    # o before voiced 'r': should yield manaH + raTa
        ("punarjanma", 4),  # r: should yield punaH + janma
    ]
    for word, pos in test_cases:
        print(f"\n {word} at pos {pos}:")
        for left, right in engine.generate_splits(word, pos):
            print(f" {left} | {right}")