# File size: 39,553 Bytes

"""

Samāsa (Compound) Splitter

Detects and splits Sanskrit compound words at their boundaries.

"""

from typing import List, Tuple, Optional
from dataclasses import dataclass

# Import analyzer for Kosha access (absolute imports for HF compatibility)
from analyzer import VidyutAnalyzer, MorphParse
from sandhi_engine import SandhiEngine


@dataclass
class CompoundSplit:
    """Outcome of one compound-splitting attempt.

    Attributes:
        surface: The original, unsplit word.
        components: The pieces the word was divided into.
        split_points: Character positions of the split boundaries.
        is_compound: Whether the word was actually treated as a compound.
        compound_type: Samāsa category (tatpuruṣa, dvandva, bahuvrīhi, ...)
            or None when no category is assigned.
    """

    surface: str
    components: List[str]
    split_points: List[int]
    is_compound: bool
    compound_type: Optional[str]


class SamasaSplitter:
    """
    Splits Sanskrit compound words (samāsa) at their boundaries.

    Uses Kosha lookups to validate potential split points.

    The class-level word lists below are plain string tables consulted by
    the splitting heuristics; they are not looked up in the Kosha.
    """
    
    # Common compound final elements (uttarapada patterns).
    # score_split awards a bonus when the last component (or its stem) is
    # listed here; _find_split_candidates also accepts a right-hand
    # remainder that merely starts with one of these entries.
    COMPOUND_FINALS = [
        "kara", "kAra", "kArin", "kft", "kftya",
        "gata", "gati", "gamana",
        "ja", "jAta", "janman",
        "Da", "DAra", "DAraka", "DArin",
        "maya", "mat", "vat",
        "pati", "nATa", "ISvara", "adhipa",
        "Atman", "rUpa", "svarUpa",
        "pada", "pAduka",
        "stha", "sthita", "sthAna",
        "yukta", "hIna", "rahita",
        "priya", "rata", "ASrita",
        "vid", "jYa", "vadin", "pAla",
        "rAja", "indra", "deva", "loka", 
        "karziR", "AkarziRi","ISa",              # Misc. additions (presumably "attracting" / "lord" — glosses unverified)
    ]
    
    # Common compound first elements (pūrvapada patterns).
    # score_split awards a bonus when the first component is listed here;
    # _find_split_candidates accepts a right-hand remainder that starts with
    # one of these entries (single-character entries such as "a" are
    # ignored there because it requires len(initial) >= 2).
    COMPOUND_INITIALS = [
        "mahA", "ati", "su", "dur", "sat", "a", "an",  # Prefixes
        "sarva", "viSva", "eka", "bahu",               # All/one/many
        "deva", "brahma", "Atma", "para",              # Divine/supreme
        "rAja", "mahI", "loka",                        # King/earth/world
        "hfd", "manas", "citta",                       # Heart/mind
        "padma", "kamala", "Ananda", "ISa",                            # Lotus / bliss / lord
    ]
    
    # Hardcoded protection for high-frequency words that might be over-split.
    # _is_valid_stem returns True for any member without consulting the Kosha.
    COMMON_WORDS = {
        "namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
        "pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya","AkarziRi",
    }
    
    def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
        """Set up the splitter.

        Args:
            analyzer: Shared analyzer to reuse. When omitted (or falsy) a
                new ``VidyutAnalyzer`` is created with ``preload_cache=False``.
        """
        if not analyzer:
            analyzer = VidyutAnalyzer(preload_cache=False)
        self.analyzer = analyzer
        # V4: generative sandhi expansion, used by the DP-based splitter.
        self.sandhi_engine = SandhiEngine()
    
    # Sandhi reversal rules: surface_ending -> possible original endings.
    # Each key is the ending seen on the surface form; each value lists the
    # endings to substitute for it when trying to recover the underlying stem.
    #
    # NOTE: a dict literal keeps only the LAST value for a repeated key, so
    # every surface ending must appear exactly once.  The vowel-sandhi and
    # visarga-sandhi readings of 'o' are therefore merged into one entry
    # (previously the visarga entry silently replaced the a+u entry).
    SANDHI_REVERSIONS = {
        # Consonant Sandhi (final consonant before vowel)
        'd': ['t', 'd'],      # vidyud -> vidyut
        'g': ['k', 'g'],      # vAg -> vAk
        'b': ['p', 'b'],      # ap -> ab (water)
        'D': ['T', 'D'],
        'j': ['c', 'j'],
        'z': ['s', 'z'],
        # Vowel Sandhi (vowel combinations)
        'A': ['a', 'A'],      # a+a -> A
        'I': ['i', 'I'],      # i+i -> I
        'U': ['u', 'U'],      # u+u -> U
        'e': ['a', 'i'],      # a+i -> e
        # 'o' arises both from vowel sandhi (a+u -> o) and from visarga
        # sandhi (aH + voiced sound -> o), so all three originals are listed.
        'o': ['a', 'u', 'aH'],
        'ai': ['a', 'e'],     # a+e -> ai
        'au': ['a', 'o'],     # a+o -> au
        # Consonant clusters
        'cC': ['t', 'c'],     # t+c -> cC
        'jj': ['d', 'j'],     # d+j -> jj
        'DD': ['D', 'D'],
        # Visarga Sandhi
        'ar': ['aH'],         # aH + r -> ar
    }
    
    def _try_sandhi_reversal(self, surface: str, min_stem_len: int = 3) -> List[str]:
        """

        Try to recover original stems from Sandhi-modified surface forms.

        Returns list of possible original forms, ordered by likelihood.

        """
        candidates = [surface]  # Original form is always a candidate
        
        # TRANSLITERATION NORMALIZATION (lowercase digraph → SLP1 single char)
        # This handles: bh→B, dh→D, gh→G, ph→P, th→T, kh→K, ch→C, jh→J
        TRANSLIT_MAP = [
            ('bh', 'B'), ('dh', 'D'), ('gh', 'G'), ('ph', 'P'),
            ('th', 'T'), ('kh', 'K'), ('ch', 'C'), ('jh', 'J'),
            ('Th', 'W'), ('Dh', 'Q'),  # Retroflex aspirates
        ]
        normalized = surface
        for digraph, single in TRANSLIT_MAP:
            normalized = normalized.replace(digraph, single)
        if normalized != surface:
            candidates.append(normalized)
        
        # Try consonant Sandhi at word boundary (last char)
        for form in [surface, normalized]:
            if len(form) >= min_stem_len and form[-1] in self.SANDHI_REVERSIONS:
                for original in self.SANDHI_REVERSIONS[form[-1]]:
                    candidate = form[:-1] + original
                    if candidate not in candidates:
                        candidates.append(candidate)
        
        # Try internal Sandhi (for compound-internal changes)
        # e.g., buddhy -> buddhi (y often represents elided i)
        for form in [surface, normalized]:
            if form.endswith('y') and len(form) >= min_stem_len:
                candidates.append(form[:-1] + 'i')  # Try y -> i
            if form.endswith('v') and len(form) >= min_stem_len:
                candidates.append(form[:-1] + 'u')  # Try v -> u
            
        # Remove duplicates while preserving order
        seen = set()
        unique = []
        for c in candidates:
            if c not in seen:
                seen.add(c)
                unique.append(c)
        
        return unique
    
    def _is_valid_stem(self, surface: str) -> bool:
        """

        Check if a surface form is a valid stem, trying:

        0. COMMON_WORDS protection

        1. Direct Kosha lookup

        2. Visarga/Anusvara base check (rAmaH → rAma)

        3. Sandhi reversal

        4. Pratyaya (suffix) stripping

        """
        if len(surface) < 2:
            return False
        
        # 0. Safety Check for Common Words (protect namaH, rAmo, etc.)
        if surface in self.COMMON_WORDS:
            return True
        
        # 1. Direct Kosha Check
        if self.analyzer._in_kosha(surface):
            return True
        
        # 2. Visarga/Anusvara Check (FIX for rAmaH validation)
        # If sandhi-restored "rAmo" → "rAmaH", accept it if base "rAma" is in kosha
        if surface.endswith('H') and len(surface) > 2:
            base = surface[:-1]
            if self.analyzer._in_kosha(base):
                return True
        if surface.endswith('M') and len(surface) > 2:
            base = surface[:-1]
            if self.analyzer._in_kosha(base):
                return True
        
        # 3. Try all Sandhi reversal candidates
        candidates = self._try_sandhi_reversal(surface)
        for candidate in candidates:
            if self.analyzer._in_kosha(candidate):
                return True
            # Also try vowel adjustments
            if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                return True
            if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
                return True
            if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                return True
            # Recursive visarga check for candidates too
            if candidate.endswith('H') and len(candidate) > 2:
                if self.analyzer._in_kosha(candidate[:-1]):
                    return True
        
        # Try VIBHAKTI STRIPPING (nominal case endings)
        VIBHAKTI_ENDINGS = [
            'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH',  # Masculine a-stem
            'An', 'EH', 'eBya', 'AnAm', 'ezu',                   # Masculine a-stem plural
            'au', 'OH', 'AvyAm',                                  # Dual
            'aye',                                                 # i-stem dative (pataye, munaye)
            'ave',                                                 # u-stem dative (vizRave, gurave)
        ]
        for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
            if surface.endswith(ending) and len(surface) > len(ending) + 2:
                stem = surface[:-len(ending)]
                if self.analyzer._in_kosha(stem):
                    return True
                # Try with 'a' restoration (munipuMgavam → munipuMgava)
                if self.analyzer._in_kosha(stem + 'a'):
                    return True
                
                # SPECIAL CASE: 'aye' ending implies 'i' stem (pataye → pati)
                if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
                    return True
                
                # SPECIAL CASE: 'ave' ending implies 'u' stem (gurave → guru)
                if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
                    return True
        
        # Try PRATYAYA STRIPPING (grammatical suffix removal)
        # This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
        PRATYAYAS = [
            ('ana', 3),   # lyuT: action noun (karaNa from kR)
            ('Ana', 3),   # śānac: present participle
            ('tva', 3),   # tva: abstract noun (devatva from deva)
            ('tA', 2),    # tal: abstract noun (sundaratA)
            ('ya', 2),    # yat: fitness/gerundive
            ('ta', 2),    # kta: past participle
            ('ti', 2),    # ktin: action noun
            ('in', 2),    # ṇini: possessor
            ('ika', 3),   # ṭhak: related to
            ('Iya', 3),   # cha: related to
            # Feminine/agent kṛdanta suffixes (Fix 2)
            ('iRi', 3),   # iṇī: feminine agent (ākarṣiṇī)
            ('iRI', 3),   # iṇī: alt spelling
            ('inI', 3),   # inī: feminine possessor (yoginī)
            ('ikA', 3),   # ikā: feminine derivative (nāyikā)
            ('trI', 3),   # trī: feminine agent (kartrī)
        ]
        
        for suffix, min_root in PRATYAYAS:
            if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
                root = surface[:-len(suffix)]
                # Try the root in Kosha
                if self.analyzer._in_kosha(root):
                    return True
                # Try with guṇa 'a' restoration
                if self.analyzer._in_kosha(root + 'a'):
                    return True
                # Try R→f transliteration (MW uses f for ṛ: kartRI → kartf)
                root_f = root.replace('R', 'f')
                if root_f != root and self.analyzer._in_kosha(root_f):
                    return True
                # Try Sandhi reversal on root
                for r in self._try_sandhi_reversal(root):
                    if self.analyzer._in_kosha(r):
                        return True
        
        return False
    
    def _count_kosha_heads(self, surface: str, min_head_len: int = 5) -> int:
        """

        FIX 2: Count how many valid kosha stems exist inside a long string.

        Used to detect mega-tokens that swallowed multiple stems.

        """
        if len(surface) < min_head_len * 2:
            return 1 if self._is_valid_stem(surface) else 0
        
        heads = 0
        i = 0
        while i < len(surface) - min_head_len + 1:
            # Try to find a valid stem starting at position i
            for j in range(min(len(surface), i + 15), i + min_head_len - 1, -1):
                candidate = surface[i:j]
                if len(candidate) >= min_head_len and self._is_valid_stem(candidate):
                    heads += 1
                    i = j  # Skip past this head
                    break
            else:
                i += 1
        return max(heads, 1 if self._is_valid_stem(surface) else 0)
    
    def _is_krdanta(self, surface: str) -> bool:
        """

        FIX 3: Recognize kṛdanta (verbal derivative) forms.

        These should be kept as units, not split further.

        

        Kṛdanta indicators:

        - Ends with participial suffix preceded by verbal root

        - The whole form is in kosha as a recognized derivative

        """
        KRDANTA_SUFFIXES = [
            ('mAna', 4),   # Present participle (ātmanepada)
            ('Ana', 3),    # Present participle 
            ('tavat', 5),  # Past active participle
            ('ta', 2),     # Past passive participle (kta)
            ('in', 2),     # Agent noun (ṇini)
            ('aka', 3),    # Agent noun (ṇvul)
            ('tR', 2),     # Agent noun (tṛc)
        ]
        
        for suffix, min_root in KRDANTA_SUFFIXES:
            if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
                root = surface[:-len(suffix)]
                # Check if root looks like a valid verbal root
                # Valid roots are usually in kosha
                for candidate in self._try_sandhi_reversal(root):
                    if self.analyzer._in_kosha(candidate):
                        return True
        return False
    
    def _recursive_split(self, word: str, memo: dict = None) -> List[str]:
        """

        Recursively split a compound into maximal valid components.

        

        IMPROVED ALGORITHM with three fixes:

        1. FIX 1: Derivational spine continuation - keep collapsing if stem+suffix both valid

        2. FIX 2: Multi-head splitting - if token has multiple kosha heads, force split

        3. FIX 3: Kṛdanta recognition - keep participles as atomic units

        

        Uses memoization to avoid exponential blowup.

        """
        if memo is None:
            memo = {}
        
        if word in memo:
            return memo[word]
        
        # FIX 3: If it's a recognized kṛdanta, keep it atomic
        if self._is_krdanta(word) and self._is_valid_stem(word):
            memo[word] = [word]
            return [word]
        
        # FIX 2: Force split if token is long and contains multiple kosha heads
        MAX_TOKEN_LEN = 15  # Tokens longer than this that have multiple heads must split
        if len(word) > MAX_TOKEN_LEN:
            head_count = self._count_kosha_heads(word)
            if head_count > 1:
                # Don't return early - we MUST try to split this
                pass  # Continue to splitting logic
            else:
                # Single head or no heads - if valid, keep it
                if self._is_valid_stem(word):
                    memo[word] = [word]
                    return [word]
        else:
            # Base case: if word itself is valid AND not too long, return it
            if self._is_valid_stem(word):
                memo[word] = [word]
                return [word]
        
        # Base case: too short to split
        if len(word) < 4:
            memo[word] = [word]
            return [word]
        
        best_parse = [word]  # Default: no split
        best_score = -1000  # Start negative to ensure any valid split wins
        
        min_len = 3  # Minimum 3 chars to prevent rA, nA splits
        
        # Try all split points
        for i in range(min_len, len(word) - min_len + 1):
            left = word[:i]
            right = word[i:]
            
            # Check if left is valid (with Sandhi reversal)
            if self._is_valid_stem(left):
                # FIX 1: Derivational spine continuation
                # If left is a valid stem, check if left+next_suffix also forms a valid stem
                # This prevents over-splitting inside known words like bhAvanA
                spine_continued = False
                for ext_len in range(3, min(len(right) + 1, 8)):  # Try extending by 3-7 chars
                    extended = left + right[:ext_len]
                    if self._is_valid_stem(extended):
                        # The spine continues! Don't split here, try a longer left
                        spine_continued = True
                        break
                
                # Only split if spine doesn't continue OR if we're at a very long boundary
                if spine_continued and len(left) < 10:
                    continue  # Skip this split point, try longer
                
                # Recursively split the right side
                right_parse = self._recursive_split(right, memo)
                
                # Count valid components in this parse
                full_parse = [left] + right_parse
                valid_count = sum(1 for comp in full_parse if self._is_valid_stem(comp))
                
                # IMPROVED SCORING:
                # 1. Reward valid components heavily
                # 2. PENALIZE many components (prefer fewer, longer splits)
                # 3. PENALIZE short components (< 5 chars)
                # 4. REWARD if components are known kosha stems (not just valid via suffix)
                num_components = len(full_parse)
                avg_len = sum(len(c) for c in full_parse) / num_components
                short_penalty = sum(1 for c in full_parse if len(c) < 5)
                
                # Bonus for components that are DIRECTLY in kosha (not via suffix stripping)
                direct_kosha_bonus = sum(10 for c in full_parse 
                                         if self.analyzer._in_kosha(c) or 
                                         any(self.analyzer._in_kosha(x) for x in self._try_sandhi_reversal(c)))
                
                # Score formula: favor valid + long + few components + direct kosha
                score = (valid_count * 100  # Valid components matter most
                         - num_components * 15  # Penalize many splits (reduced from 20)
                         + avg_len * 5  # Reward longer components
                         - short_penalty * 40  # Penalize short fragments (reduced from 50)
                         + direct_kosha_bonus)  # Bonus for direct kosha stems
                
                if score > best_score:
                    best_score = score
                    best_parse = full_parse
        
        memo[word] = best_parse
        return best_parse
    
    def _longest_left_split(self, word: str) -> Optional[Tuple[str, str]]:
        """

        Find the longest valid left stem greedily WITH SANDHI REVERSAL.

        

        For unknown prefixes, tries consonant/vowel Sandhi reversions:

        - vidyud -> vidyut (d -> t before vowel)

        - buddhy -> buddhi (y -> i for elided vowel)

        """
        min_len = 3  # Minimum valid stem length
        
        # Scan from longest left to shortest
        for i in range(len(word) - min_len, min_len - 1, -1):
            left = word[:i]
            right = word[i:]
            
            # Try ALL Sandhi reversal candidates for left
            left_valid = False
            left_candidates = self._try_sandhi_reversal(left)
            for candidate in left_candidates:
                if self.analyzer._in_kosha(candidate):
                    left_valid = True
                    break
                # Also try with vowel adjustments
                if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                    left_valid = True
                    break
                if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
                    left_valid = True
                    break
                if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                    left_valid = True
                    break
            
            if left_valid and len(right) >= min_len:
                # Check if right is valid using Sandhi reversal
                right_valid = False
                right_candidates = self._try_sandhi_reversal(right)
                for candidate in right_candidates:
                    if self.analyzer._in_kosha(candidate):
                        right_valid = True
                        break
                    # Try with vowel adjustments
                    if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                        right_valid = True
                        break
                
                # Try lookahead on right (for compound remainders)
                if not right_valid:
                    for j in range(min_len, min(len(right), 15)):
                        prefix = right[:j]
                        # Try all Sandhi reversals on the prefix
                        prefix_candidates = self._try_sandhi_reversal(prefix)
                        for candidate in prefix_candidates:
                            if self.analyzer._in_kosha(candidate):
                                right_valid = True
                                break
                            if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                                right_valid = True
                                break
                        if right_valid:
                            break
                
                # Sandhi restoration: if left ended with long vowel, right may need prefix
                if not right_valid and left.endswith('A') and right[0] not in 'aAiIuUeEoO':
                    restored = 'A' + right
                    restored_candidates = self._try_sandhi_reversal(restored)
                    for candidate in restored_candidates:
                        if self.analyzer._in_kosha(candidate):
                            right_valid = True
                            break
                    if not right_valid:
                        for j in range(min_len, min(len(restored), 12)):
                            if self.analyzer._in_kosha(restored[:j]):
                                right_valid = True
                                break
                
                if right_valid:
                    return (left, right)
        
        return None
    
    def _find_split_candidates(self, word: str) -> List[int]:
        """Find potential split points based on stem cache validation."""
        candidates = []
        min_component = 2  # Minimum component length
        
        # Endings to strip when validating
        ENDINGS = ["M", "H", "aM", "am", "aH", "At", "ena", "Aya", "asya", 
                   "e", "O", "AnAm", "A", "I", "U", "AN", "An", "i"]
        
        for i in range(min_component, len(word) - min_component + 1):
            left = word[:i]
            right = word[i:]
            
            # Check left side (try as-is, then with vowel additions/normalization)
            left_valid = self.analyzer._in_kosha(left)
            if not left_valid:
                for suffix in ["a", "A", "i", "I", "u", "U"]:
                    if self.analyzer._in_kosha(left + suffix):
                        left_valid = True
                        break
            # Sandhi reversal: if left ends with long vowel, try normalizing
            if not left_valid and left.endswith('A'):
                if self.analyzer._in_kosha(left[:-1] + 'a'):
                    left_valid = True
            if not left_valid and left.endswith('I'):
                if self.analyzer._in_kosha(left[:-1] + 'i'):
                    left_valid = True
            if not left_valid and left.endswith('U'):
                if self.analyzer._in_kosha(left[:-1] + 'u'):
                    left_valid = True
            
            # Check right side (try as-is, strip endings, add vowels)
            right_valid = self.analyzer._in_kosha(right)
            if not right_valid:
                # Try stripping endings
                for ending in sorted(ENDINGS, key=len, reverse=True):
                    if right.endswith(ending) and len(right) > len(ending) + 1:
                        stripped = right[:-len(ending)]
                        if self.analyzer._in_kosha(stripped):
                            right_valid = True
                            break
                        # Also try with vowel additions
                        for suffix in ["a", "A"]:
                            if self.analyzer._in_kosha(stripped + suffix):
                                right_valid = True
                                break
                        if right_valid:
                            break
            
            if not right_valid:
                # Try vowel additions
                for suffix in ["a", "A", "i", "I"]:
                    if self.analyzer._in_kosha(right + suffix):
                        right_valid = True
                        break
            
            # Sandhi reversal for right side: if left ends with long vowel,
            # the vowel may have absorbed initial vowel of right.
            # Try restoring: AtmA|bhAsa -> check A+bhAsa = AbhAsa
            if not right_valid and len(right) > 2:
                # Check if left ends with long vowel that could have eaten something
                if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
                    # Right starts with consonant - maybe initial A was eaten
                    restored = 'A' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
                    elif len(restored) > 3:
                        # Try lookahead on restored
                        for j in range(3, min(len(restored), 12)):
                            if self.analyzer._in_kosha(restored[:j]):
                                right_valid = True
                                break
                elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
                    restored = 'I' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
                elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
                    restored = 'U' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
            
            # Also check if right itself starts a sub-compound (Recursive Lookahead)
            if not right_valid and len(right) > 3:
                # Try to find ANY valid item at start of right
                # Check prefixes of length 3 to 12
                for j in range(3, min(len(right), 15)):
                    prefix = right[:j]
                    if self.analyzer._in_kosha(prefix):
                        right_valid = True
                        break
                    # Sandhi normalization: if prefix ends with long vowel, try short
                    # AtmA -> Atma, prAtI -> prAti, etc.
                    if prefix.endswith('A'):
                        normalized = prefix[:-1] + 'a'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                    elif prefix.endswith('I'):
                        normalized = prefix[:-1] + 'i'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                    elif prefix.endswith('U'):
                        normalized = prefix[:-1] + 'u'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                
                # If still not found, check known initials
                if not right_valid:
                    for initial in self.COMPOUND_INITIALS + list(self.COMPOUND_FINALS):
                        if right.startswith(initial) and len(initial) >= 2:
                            right_valid = True
                            break
            
            # DEBUG
            # if "sopAdhika" in word:
            #    print(f"Check {left} | {right} -> L:{left_valid} R:{right_valid}")

            if left_valid and right_valid:
                candidates.append(i)
        
        return candidates
    
    def score_split(components):
            # Base: Squared length favors fewer, longer components
            score = sum(len(c)**2 for c in components)
            
            # --- PENALTIES ---
            for c in components:
                if len(c) < 4:
                    if not self._is_valid_stem(c):
                        score -= 50
                    else:
                        score -= 5
            
            if len(components) > 2:
                score -= (len(components) - 2) * 20
            
            # --- BONUSES ---
            
            # 1. VALIDITY BONUS (THE FIX)
            # Old value: 30. New value: 100.
            # This ensures that 164 (split score) + 200 (bonus) > 289 (garbage score)
            valid_count = sum(1 for c in components if self._is_valid_stem(c))
            score += valid_count * 100  
            
            # 2. SURVIVAL BONUS (Protects rAmo, namaH)
            if len(components) == 1:
                if self._is_valid_stem(components[0]):
                    score += 50
            
            # 3. Compound Pattern Bonus
            if len(components) >= 2:
                left = components[0]
                right = components[-1]
                
                if left in self.COMPOUND_INITIALS: score += 15
                
                # Check Right Final
                r_stem, _ = self.analyzer._extract_vibhakti(right)
                if r_stem in self.COMPOUND_FINALS: score += 25
                elif right in self.COMPOUND_FINALS: score += 25
                
                if abs(len(left) - len(right)) <= 1: score += 10
            # 4. Expansion penalty (RELAXED)
            # We removed the "elif expansion == 0: score += 20" trap.
            total_len = sum(len(c) for c in components)
            expansion = total_len - len(word)
            if expansion > 1:
                score -= (expansion - 1) * 25
            return score
    
    def split(self, word: str, max_components: int = 4) -> CompoundSplit:
        """Split a compound word into its components.

        Words shorter than 4 characters and words that are themselves in the
        Kosha (lexicalised forms such as 'paramAtma', 'kzetrajYa',
        'sopAdhika') are returned unsplit.  Everything else is handed to
        ``_recursive_split``; if that yields a single component the word is
        likewise returned unsplit.

        Args:
            word: The surface form to split.
            max_components: Accepted for interface compatibility; it is not
                consulted by this method.

        Returns:
            A ``CompoundSplit``.  ``compound_type`` is always ``None`` because
            samāsa types are not classified here.
        """
        def unsplit() -> CompoundSplit:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )

        if len(word) < 4:
            return unsplit()

        # A known stem is treated as lexicalised and never broken down.
        if self.analyzer._in_kosha(word):
            return unsplit()

        parts = self._recursive_split(word)
        if len(parts) <= 1:
            return unsplit()

        # Boundary k lies after the combined length of the first k parts.
        boundaries = [
            sum(len(part) for part in parts[:k])
            for k in range(1, len(parts))
        ]

        return CompoundSplit(
            surface=word, components=parts,
            split_points=boundaries, is_compound=True,
            compound_type=None
        )
    
    def _split_dp(self, word: str, memo: dict = None) -> List[List[str]]:
        """

        V4 Algorithm: Memoized Dynamic Programming with Sandhi Expansion.

        

        Returns all valid splits, cached by suffix.

        Handles coalescent sandhi (e=a+i, o=a+u, etc.) that V3 misses.

        """
        if memo is None:
            memo = {}
        
        if word in memo:
            return memo[word]
        
        # Base: too short to split
        if len(word) <= 2:
            if self._is_valid_stem(word):
                return [[word]]
            return []
        
        valid_splits = []

        # 1. OPTION A: The whole word is a stem (Lexicalized)
        if self._is_valid_stem(word):
            valid_splits.append([word])
            # DO NOT RETURN EARLY. Keep looking for splits!

        # 2. OPTION B: Split it (Generative Sandhi)
        # Try each split position with sandhi expansion
        for i in range(2, len(word) - 1):
            for left, right in self.sandhi_engine.generate_splits(word, i):
                if len(left) < 2 or len(right) < 2:
                    continue
                    
                if self._is_valid_stem(left):
                    # Recurse on right (memoized!)
                    right_splits = self._split_dp(right, memo)
                    for rs in right_splits:
                        valid_splits.append([left] + rs)
        
        memo[word] = valid_splits
        return valid_splits
    
    def split_v4(self, word: str) -> CompoundSplit:
        """
        V4 split: pick the best-scoring segmentation from ``_split_dp``.

        ``_split_dp`` uses generative sandhi expansion, which is meant to
        cover:

        - Vowel coalescence: gaṇeśa → gaṇa + īśa (e = a+i)
        - Visarga sandhi: punarjanma → punaH + janma
        - Vṛddhi: tavaiva → tava + eva

        Args:
            word: Surface form to analyse.

        Returns:
            A ``CompoundSplit``. The word is returned unsplit when it is
            shorter than four characters, when no candidate split exists,
            or when the best-scoring candidate is the whole word.
            ``split_points`` holds the cumulative character offsets of the
            components when they concatenate exactly to ``word`` (the same
            convention as ``split``); it is left empty when sandhi
            expansion changed the text, because the component lengths then
            no longer map onto surface positions.
        """
        def not_compound():
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )

        if len(word) < 4:
            return not_compound()

        # Use V4 DP algorithm
        all_splits = self._split_dp(word)
        if not all_splits:
            return not_compound()

        # SCORING STRATEGY:
        # Balance: prefer splits, but penalize over-fragmentation, and
        # reward candidates that look like a known compound pattern.
        def score_split(components):
            count = len(components)

            # Base: squared length favors fewer, longer components.
            score = sum(len(c) ** 2 for c in components)

            # Look each component up once; the result feeds both the
            # short-component penalty and the validity bonus.
            validity = [bool(self._is_valid_stem(c)) for c in components]

            # --- PENALTIES ---
            # Short components (< 4 chars): heavy penalty for garbage
            # fragments, slight penalty for valid-but-short stems ('ISa').
            for comp, is_valid in zip(components, validity):
                if len(comp) < 4:
                    score -= 5 if is_valid else 50

            # Fragmentation: every component beyond the second costs 30.
            if count > 2:
                score -= (count - 2) * 30

            # --- BONUSES ---
            # Two components is the optimal compound structure.
            if count == 2:
                score += 25

            # COMMON_WORDS protection (namaH, rAmo should stay atomic).
            if count == 1 and components[0] in self.COMMON_WORDS:
                score += 50

            # Validity bonus, so declined words get credit.
            score += sum(validity) * 30

            # Compound pattern bonus (e.g. gaRapataye).
            if count >= 2:
                left = components[0]
                right = components[-1]

                if left in self.COMPOUND_INITIALS:
                    score += 15

                # Undo the dative endings so a declined final can match
                # its stem (pataye -> pati, -ave -> -u).
                stripped = set()
                if right.endswith('aye'):
                    stripped.add(right[:-3] + 'i')
                if right.endswith('ave'):
                    stripped.add(right[:-3] + 'u')

                # A single bonus is granted no matter how many finals match.
                if any(
                    right.startswith(final) or final in stripped
                    for final in self.COMPOUND_FINALS
                ):
                    score += 25

                # Balance bonus for near-equal first and last components.
                if abs(len(left) - len(right)) <= 1:
                    score += 10

            # Expansion: sandhi artifacts add characters. One extra char is
            # tolerated (e → a+I); each further char costs 25. Exact-length
            # splits (no sandhi artifact) earn a bonus.
            expansion = sum(len(c) for c in components) - len(word)
            if expansion > 1:
                score -= (expansion - 1) * 25
            elif expansion == 0:
                score += 20

            return score

        best_split = max(all_splits, key=score_split)

        if len(best_split) <= 1:
            return not_compound()

        # Offsets are only meaningful when the components reproduce the
        # surface verbatim; otherwise keep the list empty rather than
        # report misleading positions.
        split_points = []
        if "".join(best_split) == word:
            pos = 0
            for comp in best_split[:-1]:
                pos += len(comp)
                split_points.append(pos)

        return CompoundSplit(
            surface=word, components=best_split,
            split_points=split_points, is_compound=True, compound_type=None
        )
    
    def split_multiple(self, words: List[str]) -> List[CompoundSplit]:
        """Split multiple words."""
        return [self.split(w) for w in words]


# --- TEST ---
if __name__ == "__main__":
    # Manual smoke test: run the default splitter over a few sample words
    # and print either the components or a "not split" marker.
    print("Testing SamasaSplitter...")
    splitter = SamasaSplitter()

    samples = (
        "hfdpadma",
        "paramAtma",
        "mahArAja",
        "devadatta",
        "rAjakumAra",
        "sopAdhika",
    )

    for word in samples:
        result = splitter.split(word)
        if not result.is_compound:
            print(f"  {word:20} → (not split)")
            continue
        print(f"  {word:20} → {' + '.join(result.components)}")