"""
Samāsa (Compound) Splitter
Detects and splits Sanskrit compound words at their boundaries.
"""
from typing import List, Tuple, Optional
from dataclasses import dataclass
# Import analyzer for Kosha access (absolute imports for HF compatibility)
from analyzer import VidyutAnalyzer, MorphParse
from sandhi_engine import SandhiEngine
@dataclass
class CompoundSplit:
"""Result of compound splitting."""
surface: str # Original compound
components: List[str] # Split components
split_points: List[int] # Character positions of splits
is_compound: bool # Was this actually a compound?
compound_type: Optional[str] # tatpuruṣa, dvandva, bahuvrīhi, etc.
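    # Illustrative instance (hypothetical values; actual output depends on
    # the loaded Kosha):
    #   CompoundSplit(surface="devadatta", components=["deva", "datta"],
    #                 split_points=[4], is_compound=True, compound_type=None)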
class SamasaSplitter:
"""
Splits Sanskrit compound words (samāsa) at their boundaries.
Uses Kosha lookups to validate potential split points.
"""
# Common compound final elements (uttarapada patterns)
COMPOUND_FINALS = [
"kara", "kAra", "kArin", "kft", "kftya",
"gata", "gati", "gamana",
"ja", "jAta", "janman",
"Da", "DAra", "DAraka", "DArin",
"maya", "mat", "vat",
"pati", "nATa", "ISvara", "adhipa",
"Atman", "rUpa", "svarUpa",
"pada", "pAduka",
"stha", "sthita", "sthAna",
"yukta", "hIna", "rahita",
"priya", "rata", "ASrita",
"vid", "jYa", "vadin", "pAla",
"rAja", "indra", "deva", "loka",
"karziR", "AkarziRi","ISa", # Loving/devoted
]
# Common compound first elements (pūrvapada patterns)
COMPOUND_INITIALS = [
"mahA", "ati", "su", "dur", "sat", "a", "an", # Prefixes
"sarva", "viSva", "eka", "bahu", # All/one/many
"deva", "brahma", "Atma", "para", # Divine/supreme
"rAja", "mahI", "loka", # King/earth/world
"hfd", "manas", "citta", # Heart/mind
"padma", "kamala", "Ananda", "ISa", # Lotus
]
# Hardcoded protection for high-frequency words that might be over-split
COMMON_WORDS = {
"namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
"pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya","AkarziRi",
}
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
"""Initialize with optional shared analyzer."""
self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
self.sandhi_engine = SandhiEngine() # V4: Generative sandhi expansion
    # Sandhi reversal rules: surface_ending -> possible original endings.
    # Common consonant/vowel Sandhi transformations to reverse.
    # NOTE: digraph keys ('ai', 'cC', ...) are checked before single chars
    # in _try_sandhi_reversal, so they are reachable.
    SANDHI_REVERSIONS = {
        # Consonant Sandhi (final consonant voiced before vowel)
        'd': ['t', 'd'],   # vidyud -> vidyut
        'g': ['k', 'g'],   # vAg -> vAk
        'b': ['p', 'b'],   # ab -> ap (water)
        'D': ['T', 'D'],   # voiced aspirate from unvoiced
        'j': ['c', 'j'],   # palatal voicing
        'z': ['s', 'z'],   # s -> z after i/u etc. (zatva)
        # Vowel Sandhi (vowel combinations)
        'A': ['a', 'A'],   # a+a -> A
        'I': ['i', 'I'],   # i+i -> I
        'U': ['u', 'U'],   # u+u -> U
        'e': ['a', 'i'],   # a+i -> e
        'o': ['a', 'u', 'aH'],  # a+u -> o; also visarga: aH + voiced -> o
        'ai': ['a', 'e'],  # a+e -> ai
        'au': ['a', 'o'],  # a+o -> au
        # Consonant clusters
        'cC': ['t', 'c'],  # t+c -> cC
        'jj': ['d', 'j'],  # d+j -> jj
        'DD': ['D'],       # D+D -> DD
        # Visarga Sandhi
        'ar': ['aH'],      # aH + r -> ar
    }
def _try_sandhi_reversal(self, surface: str, min_stem_len: int = 3) -> List[str]:
"""
Try to recover original stems from Sandhi-modified surface forms.
Returns list of possible original forms, ordered by likelihood.
"""
candidates = [surface] # Original form is always a candidate
# TRANSLITERATION NORMALIZATION (lowercase digraph → SLP1 single char)
# This handles: bh→B, dh→D, gh→G, ph→P, th→T, kh→K, ch→C, jh→J
TRANSLIT_MAP = [
('bh', 'B'), ('dh', 'D'), ('gh', 'G'), ('ph', 'P'),
('th', 'T'), ('kh', 'K'), ('ch', 'C'), ('jh', 'J'),
('Th', 'W'), ('Dh', 'Q'), # Retroflex aspirates
]
normalized = surface
for digraph, single in TRANSLIT_MAP:
normalized = normalized.replace(digraph, single)
if normalized != surface:
candidates.append(normalized)
        # Try Sandhi at the word boundary. Check two-character endings
        # first ('ai', 'au', 'cC', 'jj', 'ar'), then single characters,
        # so the digraph rules in SANDHI_REVERSIONS can actually match.
        for form in [surface, normalized]:
            if len(form) < min_stem_len:
                continue
            for tail_len in (2, 1):
                tail = form[-tail_len:]
                if len(form) > tail_len and tail in self.SANDHI_REVERSIONS:
                    for original in self.SANDHI_REVERSIONS[tail]:
                        candidate = form[:-tail_len] + original
                        if candidate not in candidates:
                            candidates.append(candidate)
# Try internal Sandhi (for compound-internal changes)
# e.g., buddhy -> buddhi (y often represents elided i)
for form in [surface, normalized]:
if form.endswith('y') and len(form) >= min_stem_len:
candidates.append(form[:-1] + 'i') # Try y -> i
if form.endswith('v') and len(form) >= min_stem_len:
candidates.append(form[:-1] + 'u') # Try v -> u
# Remove duplicates while preserving order
seen = set()
unique = []
for c in candidates:
if c not in seen:
seen.add(c)
unique.append(c)
return unique
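    # Sketch of _try_sandhi_reversal output (original form first, then the
    # normalized form and rule-based candidates):
    #   _try_sandhi_reversal("vidyud") includes "vidyut"  (d -> t)
    #   _try_sandhi_reversal("buddhy") includes "budDi"   (dh -> D, y -> i)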
def _is_valid_stem(self, surface: str) -> bool:
"""
Check if a surface form is a valid stem, trying:
0. COMMON_WORDS protection
1. Direct Kosha lookup
2. Visarga/Anusvara base check (rAmaH → rAma)
3. Sandhi reversal
4. Pratyaya (suffix) stripping
"""
if len(surface) < 2:
return False
# 0. Safety Check for Common Words (protect namaH, rAmo, etc.)
if surface in self.COMMON_WORDS:
return True
# 1. Direct Kosha Check
if self.analyzer._in_kosha(surface):
return True
# 2. Visarga/Anusvara Check (FIX for rAmaH validation)
# If sandhi-restored "rAmo" → "rAmaH", accept it if base "rAma" is in kosha
if surface.endswith('H') and len(surface) > 2:
base = surface[:-1]
if self.analyzer._in_kosha(base):
return True
if surface.endswith('M') and len(surface) > 2:
base = surface[:-1]
if self.analyzer._in_kosha(base):
return True
# 3. Try all Sandhi reversal candidates
candidates = self._try_sandhi_reversal(surface)
for candidate in candidates:
if self.analyzer._in_kosha(candidate):
return True
# Also try vowel adjustments
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
return True
if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
return True
if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
return True
# Recursive visarga check for candidates too
if candidate.endswith('H') and len(candidate) > 2:
if self.analyzer._in_kosha(candidate[:-1]):
return True
# Try VIBHAKTI STRIPPING (nominal case endings)
        VIBHAKTI_ENDINGS = [
            'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH',  # Masculine a-stem singular
            'An', 'EH', 'eByaH', 'AnAm', 'ezu',  # Masculine a-stem plural
            'au', 'OH', 'AByAm',  # Dual
            'aye',  # i-stem dative (pataye, munaye)
            'ave',  # u-stem dative (vizRave, gurave)
        ]
for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
if surface.endswith(ending) and len(surface) > len(ending) + 2:
stem = surface[:-len(ending)]
if self.analyzer._in_kosha(stem):
return True
# Try with 'a' restoration (munipuMgavam → munipuMgava)
if self.analyzer._in_kosha(stem + 'a'):
return True
# SPECIAL CASE: 'aye' ending implies 'i' stem (pataye → pati)
if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
return True
# SPECIAL CASE: 'ave' ending implies 'u' stem (gurave → guru)
if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
return True
# Try PRATYAYA STRIPPING (grammatical suffix removal)
# This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
PRATYAYAS = [
('ana', 3), # lyuT: action noun (karaNa from kR)
('Ana', 3), # śānac: present participle
('tva', 3), # tva: abstract noun (devatva from deva)
('tA', 2), # tal: abstract noun (sundaratA)
('ya', 2), # yat: fitness/gerundive
('ta', 2), # kta: past participle
('ti', 2), # ktin: action noun
('in', 2), # ṇini: possessor
('ika', 3), # ṭhak: related to
('Iya', 3), # cha: related to
# Feminine/agent kṛdanta suffixes (Fix 2)
('iRi', 3), # iṇī: feminine agent (ākarṣiṇī)
('iRI', 3), # iṇī: alt spelling
('inI', 3), # inī: feminine possessor (yoginī)
('ikA', 3), # ikā: feminine derivative (nāyikā)
('trI', 3), # trī: feminine agent (kartrī)
]
for suffix, min_root in PRATYAYAS:
if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
root = surface[:-len(suffix)]
# Try the root in Kosha
if self.analyzer._in_kosha(root):
return True
# Try with guṇa 'a' restoration
if self.analyzer._in_kosha(root + 'a'):
return True
# Try R→f transliteration (MW uses f for ṛ: kartRI → kartf)
root_f = root.replace('R', 'f')
if root_f != root and self.analyzer._in_kosha(root_f):
return True
# Try Sandhi reversal on root
for r in self._try_sandhi_reversal(root):
if self.analyzer._in_kosha(r):
return True
return False
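    # Worked example of the validation ladder above (assuming "Siva" is a
    # Kosha entry but the inflected "SivaH" is not):
    #   _is_valid_stem("SivaH")
    #     step 0: "SivaH" not in COMMON_WORDS -> continue
    #     step 1: direct Kosha lookup fails   -> continue
    #     step 2: visarga stripped, base "Siva" found -> True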
def _count_kosha_heads(self, surface: str, min_head_len: int = 5) -> int:
"""
FIX 2: Count how many valid kosha stems exist inside a long string.
Used to detect mega-tokens that swallowed multiple stems.
"""
if len(surface) < min_head_len * 2:
return 1 if self._is_valid_stem(surface) else 0
heads = 0
i = 0
while i < len(surface) - min_head_len + 1:
# Try to find a valid stem starting at position i
for j in range(min(len(surface), i + 15), i + min_head_len - 1, -1):
candidate = surface[i:j]
if len(candidate) >= min_head_len and self._is_valid_stem(candidate):
heads += 1
i = j # Skip past this head
break
else:
i += 1
return max(heads, 1 if self._is_valid_stem(surface) else 0)
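    # Sketch: for a ~20-char token containing two 5+-char Kosha stems,
    # _count_kosha_heads returns 2, which signals _recursive_split below
    # to force a split instead of keeping the mega-token atomic.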
def _is_krdanta(self, surface: str) -> bool:
"""
FIX 3: Recognize kṛdanta (verbal derivative) forms.
These should be kept as units, not split further.
Kṛdanta indicators:
- Ends with participial suffix preceded by verbal root
- The whole form is in kosha as a recognized derivative
"""
KRDANTA_SUFFIXES = [
('mAna', 4), # Present participle (ātmanepada)
('Ana', 3), # Present participle
('tavat', 5), # Past active participle
('ta', 2), # Past passive participle (kta)
('in', 2), # Agent noun (ṇini)
('aka', 3), # Agent noun (ṇvul)
('tR', 2), # Agent noun (tṛc)
]
for suffix, min_root in KRDANTA_SUFFIXES:
if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
root = surface[:-len(suffix)]
# Check if root looks like a valid verbal root
# Valid roots are usually in kosha
for candidate in self._try_sandhi_reversal(root):
if self.analyzer._in_kosha(candidate):
return True
return False
    def _recursive_split(self, word: str, memo: Optional[dict] = None) -> List[str]:
"""
Recursively split a compound into maximal valid components.
IMPROVED ALGORITHM with three fixes:
1. FIX 1: Derivational spine continuation - keep collapsing if stem+suffix both valid
2. FIX 2: Multi-head splitting - if token has multiple kosha heads, force split
3. FIX 3: Kṛdanta recognition - keep participles as atomic units
Uses memoization to avoid exponential blowup.
"""
if memo is None:
memo = {}
if word in memo:
return memo[word]
# FIX 3: If it's a recognized kṛdanta, keep it atomic
if self._is_krdanta(word) and self._is_valid_stem(word):
memo[word] = [word]
return [word]
# FIX 2: Force split if token is long and contains multiple kosha heads
MAX_TOKEN_LEN = 15 # Tokens longer than this that have multiple heads must split
if len(word) > MAX_TOKEN_LEN:
head_count = self._count_kosha_heads(word)
if head_count > 1:
# Don't return early - we MUST try to split this
pass # Continue to splitting logic
else:
# Single head or no heads - if valid, keep it
if self._is_valid_stem(word):
memo[word] = [word]
return [word]
else:
# Base case: if word itself is valid AND not too long, return it
if self._is_valid_stem(word):
memo[word] = [word]
return [word]
# Base case: too short to split
if len(word) < 4:
memo[word] = [word]
return [word]
best_parse = [word] # Default: no split
best_score = -1000 # Start negative to ensure any valid split wins
min_len = 3 # Minimum 3 chars to prevent rA, nA splits
# Try all split points
for i in range(min_len, len(word) - min_len + 1):
left = word[:i]
right = word[i:]
# Check if left is valid (with Sandhi reversal)
if self._is_valid_stem(left):
# FIX 1: Derivational spine continuation
# If left is a valid stem, check if left+next_suffix also forms a valid stem
# This prevents over-splitting inside known words like bhAvanA
spine_continued = False
for ext_len in range(3, min(len(right) + 1, 8)): # Try extending by 3-7 chars
extended = left + right[:ext_len]
if self._is_valid_stem(extended):
# The spine continues! Don't split here, try a longer left
spine_continued = True
break
# Only split if spine doesn't continue OR if we're at a very long boundary
if spine_continued and len(left) < 10:
continue # Skip this split point, try longer
# Recursively split the right side
right_parse = self._recursive_split(right, memo)
# Count valid components in this parse
full_parse = [left] + right_parse
valid_count = sum(1 for comp in full_parse if self._is_valid_stem(comp))
# IMPROVED SCORING:
# 1. Reward valid components heavily
# 2. PENALIZE many components (prefer fewer, longer splits)
# 3. PENALIZE short components (< 5 chars)
# 4. REWARD if components are known kosha stems (not just valid via suffix)
num_components = len(full_parse)
avg_len = sum(len(c) for c in full_parse) / num_components
short_penalty = sum(1 for c in full_parse if len(c) < 5)
# Bonus for components that are DIRECTLY in kosha (not via suffix stripping)
direct_kosha_bonus = sum(10 for c in full_parse
if self.analyzer._in_kosha(c) or
any(self.analyzer._in_kosha(x) for x in self._try_sandhi_reversal(c)))
# Score formula: favor valid + long + few components + direct kosha
score = (valid_count * 100 # Valid components matter most
- num_components * 15 # Penalize many splits (reduced from 20)
+ avg_len * 5 # Reward longer components
- short_penalty * 40 # Penalize short fragments (reduced from 50)
+ direct_kosha_bonus) # Bonus for direct kosha stems
if score > best_score:
best_score = score
best_parse = full_parse
memo[word] = best_parse
return best_parse
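    # Worked example of the scoring above for a hypothetical parse
    # ["deva", "datta"] of "devadatta", assuming both parts are direct
    # Kosha entries:
    #   valid_count = 2     -> +200.0
    #   num_components = 2  ->  -30.0
    #   avg_len = 4.5       ->  +22.5
    #   short_penalty = 1   ->  -40.0   ("deva" is under 5 chars)
    #   direct_kosha_bonus  ->  +20.0
    #   score               ->  172.5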
def _longest_left_split(self, word: str) -> Optional[Tuple[str, str]]:
"""
Find the longest valid left stem greedily WITH SANDHI REVERSAL.
For unknown prefixes, tries consonant/vowel Sandhi reversions:
- vidyud -> vidyut (d -> t before vowel)
- buddhy -> buddhi (y -> i for elided vowel)
"""
min_len = 3 # Minimum valid stem length
# Scan from longest left to shortest
for i in range(len(word) - min_len, min_len - 1, -1):
left = word[:i]
right = word[i:]
# Try ALL Sandhi reversal candidates for left
left_valid = False
left_candidates = self._try_sandhi_reversal(left)
for candidate in left_candidates:
if self.analyzer._in_kosha(candidate):
left_valid = True
break
# Also try with vowel adjustments
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
left_valid = True
break
if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
left_valid = True
break
if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
left_valid = True
break
if left_valid and len(right) >= min_len:
# Check if right is valid using Sandhi reversal
right_valid = False
right_candidates = self._try_sandhi_reversal(right)
for candidate in right_candidates:
if self.analyzer._in_kosha(candidate):
right_valid = True
break
# Try with vowel adjustments
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
right_valid = True
break
# Try lookahead on right (for compound remainders)
if not right_valid:
for j in range(min_len, min(len(right), 15)):
prefix = right[:j]
# Try all Sandhi reversals on the prefix
prefix_candidates = self._try_sandhi_reversal(prefix)
for candidate in prefix_candidates:
if self.analyzer._in_kosha(candidate):
right_valid = True
break
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
right_valid = True
break
if right_valid:
break
# Sandhi restoration: if left ended with long vowel, right may need prefix
if not right_valid and left.endswith('A') and right[0] not in 'aAiIuUeEoO':
restored = 'A' + right
restored_candidates = self._try_sandhi_reversal(restored)
for candidate in restored_candidates:
if self.analyzer._in_kosha(candidate):
right_valid = True
break
if not right_valid:
for j in range(min_len, min(len(restored), 12)):
if self.analyzer._in_kosha(restored[:j]):
right_valid = True
break
if right_valid:
return (left, right)
return None
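    # Sketch (assuming "vidyut" and "latA" resolve in the Kosha): for
    # "vidyudlatA" the scan tries left="vidyud", whose reversal candidate
    # "vidyut" validates, then validates the remainder "latA" the same
    # way, returning ("vidyud", "latA").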
def _find_split_candidates(self, word: str) -> List[int]:
"""Find potential split points based on stem cache validation."""
candidates = []
min_component = 2 # Minimum component length
# Endings to strip when validating
ENDINGS = ["M", "H", "aM", "am", "aH", "At", "ena", "Aya", "asya",
"e", "O", "AnAm", "A", "I", "U", "AN", "An", "i"]
for i in range(min_component, len(word) - min_component + 1):
left = word[:i]
right = word[i:]
# Check left side (try as-is, then with vowel additions/normalization)
left_valid = self.analyzer._in_kosha(left)
if not left_valid:
for suffix in ["a", "A", "i", "I", "u", "U"]:
if self.analyzer._in_kosha(left + suffix):
left_valid = True
break
# Sandhi reversal: if left ends with long vowel, try normalizing
if not left_valid and left.endswith('A'):
if self.analyzer._in_kosha(left[:-1] + 'a'):
left_valid = True
if not left_valid and left.endswith('I'):
if self.analyzer._in_kosha(left[:-1] + 'i'):
left_valid = True
if not left_valid and left.endswith('U'):
if self.analyzer._in_kosha(left[:-1] + 'u'):
left_valid = True
# Check right side (try as-is, strip endings, add vowels)
right_valid = self.analyzer._in_kosha(right)
if not right_valid:
# Try stripping endings
for ending in sorted(ENDINGS, key=len, reverse=True):
if right.endswith(ending) and len(right) > len(ending) + 1:
stripped = right[:-len(ending)]
if self.analyzer._in_kosha(stripped):
right_valid = True
break
# Also try with vowel additions
for suffix in ["a", "A"]:
if self.analyzer._in_kosha(stripped + suffix):
right_valid = True
break
if right_valid:
break
if not right_valid:
# Try vowel additions
for suffix in ["a", "A", "i", "I"]:
if self.analyzer._in_kosha(right + suffix):
right_valid = True
break
# Sandhi reversal for right side: if left ends with long vowel,
# the vowel may have absorbed initial vowel of right.
# Try restoring: AtmA|bhAsa -> check A+bhAsa = AbhAsa
if not right_valid and len(right) > 2:
# Check if left ends with long vowel that could have eaten something
if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
# Right starts with consonant - maybe initial A was eaten
restored = 'A' + right
if self.analyzer._in_kosha(restored):
right_valid = True
elif len(restored) > 3:
# Try lookahead on restored
for j in range(3, min(len(restored), 12)):
if self.analyzer._in_kosha(restored[:j]):
right_valid = True
break
elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
restored = 'I' + right
if self.analyzer._in_kosha(restored):
right_valid = True
elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
restored = 'U' + right
if self.analyzer._in_kosha(restored):
right_valid = True
# Also check if right itself starts a sub-compound (Recursive Lookahead)
if not right_valid and len(right) > 3:
# Try to find ANY valid item at start of right
# Check prefixes of length 3 to 12
for j in range(3, min(len(right), 15)):
prefix = right[:j]
if self.analyzer._in_kosha(prefix):
right_valid = True
break
# Sandhi normalization: if prefix ends with long vowel, try short
# AtmA -> Atma, prAtI -> prAti, etc.
if prefix.endswith('A'):
normalized = prefix[:-1] + 'a'
if self.analyzer._in_kosha(normalized):
right_valid = True
break
elif prefix.endswith('I'):
normalized = prefix[:-1] + 'i'
if self.analyzer._in_kosha(normalized):
right_valid = True
break
elif prefix.endswith('U'):
normalized = prefix[:-1] + 'u'
if self.analyzer._in_kosha(normalized):
right_valid = True
break
# If still not found, check known initials
if not right_valid:
for initial in self.COMPOUND_INITIALS + list(self.COMPOUND_FINALS):
if right.startswith(initial) and len(initial) >= 2:
right_valid = True
break
# DEBUG
# if "sopAdhika" in word:
# print(f"Check {left} | {right} -> L:{left_valid} R:{right_valid}")
if left_valid and right_valid:
candidates.append(i)
return candidates
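    # Sketch: _find_split_candidates("hfdpadma") tests each boundary, e.g.
    # "hfd" | "padma" at i=3; assuming both sides resolve in the Kosha,
    # the index 3 is appended to candidates.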
    def _score_split(self, components: List[str], word: str) -> int:
        """
        Score a candidate parse of `word` (revised scorer with a
        strengthened validity bonus; cf. the V4 scorer in split_v4).
        """
        # Base: Squared length favors fewer, longer components
        score = sum(len(c)**2 for c in components)
        # --- PENALTIES ---
        for c in components:
            if len(c) < 4:
                if not self._is_valid_stem(c):
                    score -= 50
                else:
                    score -= 5
        if len(components) > 2:
            score -= (len(components) - 2) * 20
        # --- BONUSES ---
        # 1. VALIDITY BONUS (THE FIX)
        # Old value: 30. New value: 100.
        # This ensures that 164 (split score) + 200 (bonus) > 289 (garbage score)
        valid_count = sum(1 for c in components if self._is_valid_stem(c))
        score += valid_count * 100
        # 2. SURVIVAL BONUS (Protects rAmo, namaH)
        if len(components) == 1:
            if self._is_valid_stem(components[0]):
                score += 50
        # 3. Compound Pattern Bonus
        if len(components) >= 2:
            left = components[0]
            right = components[-1]
            if left in self.COMPOUND_INITIALS:
                score += 15
            # Check Right Final
            r_stem, _ = self.analyzer._extract_vibhakti(right)
            if r_stem in self.COMPOUND_FINALS:
                score += 25
            elif right in self.COMPOUND_FINALS:
                score += 25
            if abs(len(left) - len(right)) <= 1:
                score += 10
        # 4. Expansion penalty (RELAXED)
        # We removed the "elif expansion == 0: score += 20" trap.
        total_len = sum(len(c) for c in components)
        expansion = total_len - len(word)
        if expansion > 1:
            score -= (expansion - 1) * 25
        return score
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
"""
Split a compound word into its components.
        Uses the recursive compositional algorithm with Kosha validation.
Returns original word if no valid split found.
"""
if len(word) < 4:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Check if word itself is in Kosha (might not be compound)
# KEY FIX: If word is already a known stem (lexicalized), DO NOT SPLIT
# This protects 'paramAtma', 'kzetrajYa', 'sopAdhika' from being broken down
if self.analyzer._in_kosha(word):
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Use RECURSIVE COMPOSITIONAL algorithm
# Tries ALL split points, recursively parses right sides,
# returns parse with MOST valid components
components = self._recursive_split(word)
if len(components) <= 1:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Calculate split points from components
split_points = []
pos = 0
for comp in components[:-1]:
pos += len(comp)
split_points.append(pos)
return CompoundSplit(
surface=word, components=components,
split_points=split_points, is_compound=True,
compound_type=None # We don't classify samāsa types
)
    def _split_dp(self, word: str, memo: Optional[dict] = None) -> List[List[str]]:
"""
V4 Algorithm: Memoized Dynamic Programming with Sandhi Expansion.
Returns all valid splits, cached by suffix.
Handles coalescent sandhi (e=a+i, o=a+u, etc.) that V3 misses.
"""
if memo is None:
memo = {}
if word in memo:
return memo[word]
        # Base: too short to split (memoized like the general case)
        if len(word) <= 2:
            result = [[word]] if self._is_valid_stem(word) else []
            memo[word] = result
            return result
valid_splits = []
# 1. OPTION A: The whole word is a stem (Lexicalized)
if self._is_valid_stem(word):
valid_splits.append([word])
# DO NOT RETURN EARLY. Keep looking for splits!
# 2. OPTION B: Split it (Generative Sandhi)
# Try each split position with sandhi expansion
for i in range(2, len(word) - 1):
for left, right in self.sandhi_engine.generate_splits(word, i):
if len(left) < 2 or len(right) < 2:
continue
if self._is_valid_stem(left):
# Recurse on right (memoized!)
right_splits = self._split_dp(right, memo)
for rs in right_splits:
valid_splits.append([left] + rs)
memo[word] = valid_splits
return valid_splits
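    # Example of the DP enumeration (assuming "gaRa" and "ISa" are Kosha
    # entries): for "gaReSa", the sandhi engine can expand the coalesced
    # 'e' back into 'a' + 'I', so the parse ["gaRa", "ISa"] is collected
    # alongside any lexicalized whole-word reading.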
def split_v4(self, word: str) -> CompoundSplit:
"""
V4 Split: Uses generative sandhi expansion for coalescent sandhi.
Handles:
- Vowel coalescence: gaṇeśa → gaṇa + īśa (e = a+i)
- Visarga sandhi: punarjanma → punaH + janma
- Vṛddhi: tavaiva → tava + eva
"""
if len(word) < 4:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Use V4 DP algorithm
all_splits = self._split_dp(word)
if not all_splits:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# SCORING STRATEGY:
# Balance: prefer splits, but penalize over-fragmentation.
        # 1. Penalize short components (< 4 chars) heavily
# 2. Prefer 2-component splits over 3+ components
# 3. Single long tokens get moderate penalty
# V4 Scoring with Compound Pattern Recognition
def score_split(components):
# Base: Squared length favors fewer, longer components
score = sum(len(c)**2 for c in components)
# --- PENALTIES ---
# 1. Short junk penalty (unless it's a valid stem)
for c in components:
if len(c) < 4:
if not self._is_valid_stem(c):
score -= 50 # Garbage fragment
else:
score -= 5 # Valid but short (e.g. 'ISa'), slight penalty
# 2. Fragmentation penalty
if len(components) > 2:
score -= (len(components) - 2) * 30 # Increased penalty
# 3. 2-component bonus (optimal compound structure)
if len(components) == 2:
score += 25
# --- BONUSES ---
# 0. COMMON_WORDS Protection (namaH, rAmo should stay atomic)
if len(components) == 1 and components[0] in self.COMMON_WORDS:
score += 50 # Strong bonus to prevent splitting
# 1. Validity Bonus (Crucial for pataye/rAmo)
# Use _is_valid_stem so declined words get credit
valid_count = sum(1 for c in components if self._is_valid_stem(c))
score += valid_count * 30
# 2. Compound Pattern Bonus (The Fix for gaRapataye)
if len(components) >= 2:
left = components[0]
right = components[-1]
# Check Left against Initials
if left in self.COMPOUND_INITIALS:
score += 15
# Check Right against Finals
# Need to extract stem to match (pataye -> pati)
for final in self.COMPOUND_FINALS:
if right.startswith(final) or right == final:
score += 25 # High bonus for matching pattern like 'pati'
break
# Try stripping vibhakti
if right.endswith('aye') and right[:-3] + 'i' == final:
score += 25
break
if right.endswith('ave') and right[:-3] + 'u' == final:
score += 25
break
# Balance bonus
if abs(len(left) - len(right)) <= 1:
score += 10
# 4. Expansion penalty (sandhi artifacts add characters)
# Allow 1 char expansion for sandhi (e → a+I), only penalize 2+ extra chars
total_len = sum(len(c) for c in components)
expansion = total_len - len(word)
if expansion > 1:
score -= (expansion - 1) * 25 # Stronger penalty
elif expansion == 0:
score += 20 # Bonus for exact-length splits (no sandhi artifact)
return score
best_split = max(all_splits, key=score_split)
if len(best_split) <= 1:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
return CompoundSplit(
surface=word, components=best_split,
split_points=[], is_compound=True, compound_type=None
)
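    # Usage sketch (result depends on the loaded Kosha):
    #   result = splitter.split_v4("gaReSa")
    #   # expected: components like ["gaRa", "ISa"] via the e = a + I expansion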
def split_multiple(self, words: List[str]) -> List[CompoundSplit]:
"""Split multiple words."""
return [self.split(w) for w in words]
# --- TEST ---
if __name__ == "__main__":
print("Testing SamasaSplitter...")
splitter = SamasaSplitter()
test_compounds = [
"hfdpadma",
"paramAtma",
"mahArAja",
"devadatta",
"rAjakumAra",
"sopAdhika",
]
for word in test_compounds:
result = splitter.split(word)
if result.is_compound:
print(f" {word:20}{' + '.join(result.components)}")
else:
print(f" {word:20} → (not split)")