# File size: 4,282 Bytes

# 77111fb

"""

Sandhi Engine for Panini Tokenizer V4

Generates pre-sandhi hypotheses for Sanskrit compound splitting.

Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation.



Uses table-driven design for maintainability.

"""

from typing import List, Tuple, Generator


class SandhiEngine:
    """Generate pre-sandhi split hypotheses for Sanskrit compound splitting.

    For a surface word and a pivot index the engine proposes ``(left, right)``
    pairs that could have produced the surface form once sandhi is undone.
    It covers vowel coalescence (ac-sandhi: guṇa, vṛddhi, savarṇa-dīrgha),
    yaṇ sandhi (y/v from i/u before a vowel) and a few visarga restorations.

    The tables use one ASCII letter per sound (``E`` = ai, ``O`` = au, ``f``
    as in ``mahA + fzi``); this looks like SLP1 transliteration.

    The engine only enumerates candidates; it does not check that either side
    is a real word. Callers are expected to filter the hypotheses.
    """

    def __init__(self):
        # ac-sandhi (vowel merger) table.
        # Key   = surface string produced by the merger (one or more chars)
        # Value = list of (left_end, right_start) pairs that could have merged
        self.VOWEL_SPLITS = {
            # Guṇa
            'e':  [('a', 'i'), ('A', 'i'), ('a', 'I'), ('A', 'I')],
            'o':  [('a', 'u'), ('A', 'u'), ('a', 'U'), ('A', 'U')],
            'ar': [('a', 'f'), ('A', 'f'), ('a', 'F'), ('A', 'F')],  # maharzi -> mahA + fzi

            # Vṛddhi
            'E':  [('a', 'e'), ('A', 'e'), ('a', 'E'), ('A', 'E')],  # ai
            'O':  [('a', 'o'), ('A', 'o'), ('a', 'O'), ('A', 'O')],  # au

            # Dīrgha (savarṇa dīrgha) - critical for long vowel restoration
            'A':  [('a', 'a'), ('a', 'A'), ('A', 'a'), ('A', 'A')],
            'I':  [('i', 'i'), ('i', 'I'), ('I', 'i'), ('I', 'I')],
            'U':  [('u', 'u'), ('u', 'U'), ('U', 'u'), ('U', 'U')],
        }

        # Vowel letters; yaṇ sandhi is only plausible when the pivot y/v is
        # directly followed by one of these.
        # NOTE(review): assumes SLP1 vowel letters (x/X = vocalic l) - confirm.
        self.VOWELS = set("aAiIuUfFxXeEoO")

        # Consonant categories used by the visarga rules
        self.VOICED = {'g', 'G', 'j', 'J', 'd', 'D', 'b', 'B', 'n', 'N', 'm', 'y', 'r', 'l', 'v', 'h'}
        self.HARD = {'k', 'K', 'c', 'C', 't', 'T', 'w', 'W', 'p', 'P', 'S', 's'}

    def generate_splits(self, word: str, i: int) -> Generator[Tuple[str, str], None, None]:
        """Yield ``(left, right)`` split hypotheses for the pivot at index ``i``.

        Args:
            word: Surface form to split.
            i: Index of the pivot character. Must satisfy
                ``1 <= i < len(word)``; any other value yields nothing.

        Yields:
            Candidate ``(left, right)`` pairs, in this order:

            1. the plain cut ``(word[:i], word[i:])``;
            2. vowel-coalescence splits for every ``VOWEL_SPLITS`` key that
               the word matches starting at ``i`` (keys may be longer than one
               character, e.g. ``'ar'``);
            3. yaṇ splits (``y`` -> ``i``/``I``, ``v`` -> ``u``/``U``) when the
               pivot is followed by a vowel;
            4. visarga restorations: ``o`` or ``r`` before a voiced consonant,
               ``s``/``S`` before a hard consonant.
        """
        if i < 1 or i >= len(word):
            return

        char = word[i]
        head = word[:i]
        tail = word[i + 1:]
        next_char = tail[:1]  # '' when the pivot is the last character

        # 1. Default: hard cut (no sandhi), split BEFORE the pivot.
        yield (head, word[i:])

        # 2. Vowel coalescence: the surface key starting at i IS the merger.
        # e.g. gaR[e]Sa -> left ends with 'a', right starts with 'i'.
        # Matching with startswith (not word[i] alone) lets multi-character
        # keys such as 'ar' fire; the whole key is consumed from the word.
        for surface, pairs in self.VOWEL_SPLITS.items():
            if word.startswith(surface, i):
                rest = word[i + len(surface):]
                for left_end, right_start in pairs:
                    yield (head + left_end, right_start + rest)

        # 3. Yaṇ sandhi (y -> i/I, v -> u/U).
        # e.g. praty[e]kam -> prati + ekam
        # The semivowel only arises BEFORE a vowel, so require one at i+1.
        if next_char in self.VOWELS:
            if char == 'y':
                for v in ('i', 'I'):
                    yield (head + v, tail)
            elif char == 'v':
                for v in ('u', 'U'):
                    yield (head + v, tail)

        # 4. Visarga sandhi restoration.
        if next_char in self.VOICED:
            if char == 'o':
                # 'o' before voiced consonant -> 'aH'
                yield (head + "aH", tail)
            elif char == 'r':
                # 'r' before voiced -> 'H' (punarjanma -> punaH + janma)
                yield (head + "H", tail)
        elif next_char in self.HARD and char in ('s', 'S'):
            # 's'/'S' before hard consonant -> 'H'
            yield (head + "H", tail)


# --- TEST ---
if __name__ == "__main__":
    # Smoke demo: print every hypothesis produced at a hand-picked pivot.
    test_cases = (
        # pivot 'e': hypotheses include gaRa | iSa
        ("gaReSa", 3),
        # pivot 'e': hypotheses include deva | indra
        ("devendra", 3),
        # word-final 'o': hard cut plus a/A + u/U splits only; no consonant
        # follows the pivot, so the 'aH' restoration does not fire here
        ("rAmo", 3),
        # pivot 'r' before voiced 'j': hypotheses include punaH | janma
        ("punarjanma", 4),
    )

    engine = SandhiEngine()

    print("Testing SandhiEngine...")

    for word, pos in test_cases:
        print(f"\n  {word} at pos {pos}:")
        hypotheses = engine.generate_splits(word, pos)
        for left, right in hypotheses:
            print(f"    {left} | {right}")