"""
Samāsa (Compound) Splitter
Detects and splits Sanskrit compound words at their boundaries.
"""
from typing import List, Tuple, Optional
from dataclasses import dataclass
# Import analyzer for Kosha access (absolute imports for HF compatibility)
from analyzer import VidyutAnalyzer, MorphParse
from sandhi_engine import SandhiEngine
@dataclass
class CompoundSplit:
"""Result of compound splitting."""
surface: str # Original compound
components: List[str] # Split components
split_points: List[int] # Character positions of splits
is_compound: bool # Was this actually a compound?
compound_type: Optional[str] # tatpuruṣa, dvandva, bahuvrīhi, etc.
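    # Illustrative instance (hypothetical values; actual output depends on
    # the loaded Kosha):
    #   CompoundSplit(surface="devadatta", components=["deva", "datta"],
    #                 split_points=[4], is_compound=True, compound_type=None)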
class SamasaSplitter:
"""
Splits Sanskrit compound words (samāsa) at their boundaries.
Uses Kosha lookups to validate potential split points.
"""
# Common compound final elements (uttarapada patterns)
COMPOUND_FINALS = [
"kara", "kAra", "kArin", "kft", "kftya",
"gata", "gati", "gamana",
"ja", "jAta", "janman",
"Da", "DAra", "DAraka", "DArin",
"maya", "mat", "vat",
"pati", "nATa", "ISvara", "adhipa",
"Atman", "rUpa", "svarUpa",
"pada", "pAduka",
"stha", "sthita", "sthAna",
"yukta", "hIna", "rahita",
"priya", "rata", "ASrita",
"vid", "jYa", "vadin", "pAla",
"rAja", "indra", "deva", "loka",
"karziR", "AkarziRi","ISa", # Loving/devoted
]
# Common compound first elements (pūrvapada patterns)
COMPOUND_INITIALS = [
"mahA", "ati", "su", "dur", "sat", "a", "an", # Prefixes
"sarva", "viSva", "eka", "bahu", # All/one/many
"deva", "brahma", "Atma", "para", # Divine/supreme
"rAja", "mahI", "loka", # King/earth/world
"hfd", "manas", "citta", # Heart/mind
"padma", "kamala", "Ananda", "ISa", # Lotus
]
# Hardcoded protection for high-frequency words that might be over-split
COMMON_WORDS = {
"namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
"pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya","AkarziRi",
}
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
"""Initialize with optional shared analyzer."""
self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
self.sandhi_engine = SandhiEngine() # V4: Generative sandhi expansion
    # Sandhi reversal rules: surface_ending -> possible original endings.
    # Common consonant/vowel Sandhi transformations to reverse.
    # NOTE: digraph keys ('ai', 'cC', ...) are checked before single chars
    # in _try_sandhi_reversal, so they are reachable.
    SANDHI_REVERSIONS = {
        # Consonant Sandhi (final consonant voiced before vowel)
        'd': ['t', 'd'],   # vidyud -> vidyut
        'g': ['k', 'g'],   # vAg -> vAk
        'b': ['p', 'b'],   # ab -> ap (water)
        'D': ['T', 'D'],   # voiced aspirate from unvoiced
        'j': ['c', 'j'],   # palatal voicing
        'z': ['s', 'z'],   # s -> z after i/u etc. (zatva)
        # Vowel Sandhi (vowel combinations)
        'A': ['a', 'A'],   # a+a -> A
        'I': ['i', 'I'],   # i+i -> I
        'U': ['u', 'U'],   # u+u -> U
        'e': ['a', 'i'],   # a+i -> e
        'o': ['a', 'u', 'aH'],  # a+u -> o; also visarga: aH + voiced -> o
        'ai': ['a', 'e'],  # a+e -> ai
        'au': ['a', 'o'],  # a+o -> au
        # Consonant clusters
        'cC': ['t', 'c'],  # t+c -> cC
        'jj': ['d', 'j'],  # d+j -> jj
        'DD': ['D'],       # D+D -> DD
        # Visarga Sandhi
        'ar': ['aH'],      # aH + r -> ar
    }
def _try_sandhi_reversal(self, surface: str, min_stem_len: int = 3) -> List[str]:
"""
Try to recover original stems from Sandhi-modified surface forms.
Returns list of possible original forms, ordered by likelihood.
"""
candidates = [surface] # Original form is always a candidate
# TRANSLITERATION NORMALIZATION (lowercase digraph → SLP1 single char)
# This handles: bh→B, dh→D, gh→G, ph→P, th→T, kh→K, ch→C, jh→J
TRANSLIT_MAP = [
('bh', 'B'), ('dh', 'D'), ('gh', 'G'), ('ph', 'P'),
('th', 'T'), ('kh', 'K'), ('ch', 'C'), ('jh', 'J'),
('Th', 'W'), ('Dh', 'Q'), # Retroflex aspirates
]
normalized = surface
for digraph, single in TRANSLIT_MAP:
normalized = normalized.replace(digraph, single)
if normalized != surface:
candidates.append(normalized)
        # Try Sandhi at the word boundary. Check two-character endings
        # first ('ai', 'au', 'cC', 'jj', 'ar'), then single characters,
        # so the digraph rules in SANDHI_REVERSIONS can actually match.
        for form in [surface, normalized]:
            if len(form) < min_stem_len:
                continue
            for tail_len in (2, 1):
                tail = form[-tail_len:]
                if len(form) > tail_len and tail in self.SANDHI_REVERSIONS:
                    for original in self.SANDHI_REVERSIONS[tail]:
                        candidate = form[:-tail_len] + original
                        if candidate not in candidates:
                            candidates.append(candidate)
# Try internal Sandhi (for compound-internal changes)
# e.g., buddhy -> buddhi (y often represents elided i)
for form in [surface, normalized]:
if form.endswith('y') and len(form) >= min_stem_len:
candidates.append(form[:-1] + 'i') # Try y -> i
if form.endswith('v') and len(form) >= min_stem_len:
candidates.append(form[:-1] + 'u') # Try v -> u
# Remove duplicates while preserving order
seen = set()
unique = []
for c in candidates:
if c not in seen:
seen.add(c)
unique.append(c)
return unique
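    # Sketch of _try_sandhi_reversal output (original form first, then the
    # normalized form and rule-based candidates):
    #   _try_sandhi_reversal("vidyud") includes "vidyut"  (d -> t)
    #   _try_sandhi_reversal("buddhy") includes "budDi"   (dh -> D, y -> i)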
def _is_valid_stem(self, surface: str) -> bool:
"""
Check if a surface form is a valid stem, trying:
0. COMMON_WORDS protection
1. Direct Kosha lookup
2. Visarga/Anusvara base check (rAmaH → rAma)
3. Sandhi reversal
4. Pratyaya (suffix) stripping
"""
if len(surface) < 2:
return False
# 0. Safety Check for Common Words (protect namaH, rAmo, etc.)
if surface in self.COMMON_WORDS:
return True
# 1. Direct Kosha Check
if self.analyzer._in_kosha(surface):
return True
# 2. Visarga/Anusvara Check (FIX for rAmaH validation)
# If sandhi-restored "rAmo" → "rAmaH", accept it if base "rAma" is in kosha
if surface.endswith('H') and len(surface) > 2:
base = surface[:-1]
if self.analyzer._in_kosha(base):
return True
if surface.endswith('M') and len(surface) > 2:
base = surface[:-1]
if self.analyzer._in_kosha(base):
return True
# 3. Try all Sandhi reversal candidates
candidates = self._try_sandhi_reversal(surface)
for candidate in candidates:
if self.analyzer._in_kosha(candidate):
return True
# Also try vowel adjustments
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
return True
if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
return True
if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
return True
# Recursive visarga check for candidates too
if candidate.endswith('H') and len(candidate) > 2:
if self.analyzer._in_kosha(candidate[:-1]):
return True
# Try VIBHAKTI STRIPPING (nominal case endings)
        VIBHAKTI_ENDINGS = [
            'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH',  # Masculine a-stem singular
            'An', 'EH', 'eByaH', 'AnAm', 'ezu',  # Masculine a-stem plural
            'au', 'OH', 'AByAm',  # Dual
            'aye',  # i-stem dative (pataye, munaye)
            'ave',  # u-stem dative (vizRave, gurave)
        ]
for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
if surface.endswith(ending) and len(surface) > len(ending) + 2:
stem = surface[:-len(ending)]
if self.analyzer._in_kosha(stem):
return True
# Try with 'a' restoration (munipuMgavam → munipuMgava)
if self.analyzer._in_kosha(stem + 'a'):
return True
# SPECIAL CASE: 'aye' ending implies 'i' stem (pataye → pati)
if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
return True
# SPECIAL CASE: 'ave' ending implies 'u' stem (gurave → guru)
if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
return True
# Try PRATYAYA STRIPPING (grammatical suffix removal)
# This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
PRATYAYAS = [
('ana', 3), # lyuT: action noun (karaNa from kR)
('Ana', 3), # śānac: present participle
('tva', 3), # tva: abstract noun (devatva from deva)
('tA', 2), # tal: abstract noun (sundaratA)
('ya', 2), # yat: fitness/gerundive
('ta', 2), # kta: past participle
('ti', 2), # ktin: action noun
('in', 2), # ṇini: possessor
('ika', 3), # ṭhak: related to
('Iya', 3), # cha: related to
# Feminine/agent kṛdanta suffixes (Fix 2)
('iRi', 3), # iṇī: feminine agent (ākarṣiṇī)
('iRI', 3), # iṇī: alt spelling
('inI', 3), # inī: feminine possessor (yoginī)
('ikA', 3), # ikā: feminine derivative (nāyikā)
('trI', 3), # trī: feminine agent (kartrī)
]
for suffix, min_root in PRATYAYAS:
if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
root = surface[:-len(suffix)]
# Try the root in Kosha
if self.analyzer._in_kosha(root):
return True
# Try with guṇa 'a' restoration
if self.analyzer._in_kosha(root + 'a'):
return True
# Try R→f transliteration (MW uses f for ṛ: kartRI → kartf)
root_f = root.replace('R', 'f')
if root_f != root and self.analyzer._in_kosha(root_f):
return True
# Try Sandhi reversal on root
for r in self._try_sandhi_reversal(root):
if self.analyzer._in_kosha(r):
return True
return False
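    # Worked example of the validation ladder above (assuming "Siva" is a
    # Kosha entry but the inflected "SivaH" is not):
    #   _is_valid_stem("SivaH")
    #     step 0: "SivaH" not in COMMON_WORDS -> continue
    #     step 1: direct Kosha lookup fails   -> continue
    #     step 2: visarga stripped, base "Siva" found -> True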
def _count_kosha_heads(self, surface: str, min_head_len: int = 5) -> int:
"""
FIX 2: Count how many valid kosha stems exist inside a long string.
Used to detect mega-tokens that swallowed multiple stems.
"""
if len(surface) < min_head_len * 2:
return 1 if self._is_valid_stem(surface) else 0
heads = 0
i = 0
while i < len(surface) - min_head_len + 1:
# Try to find a valid stem starting at position i
for j in range(min(len(surface), i + 15), i + min_head_len - 1, -1):
candidate = surface[i:j]
if len(candidate) >= min_head_len and self._is_valid_stem(candidate):
heads += 1
i = j # Skip past this head
break
else:
i += 1
return max(heads, 1 if self._is_valid_stem(surface) else 0)
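    # Sketch: for a ~20-char token containing two 5+-char Kosha stems,
    # _count_kosha_heads returns 2, which signals _recursive_split below
    # to force a split instead of keeping the mega-token atomic.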
def _is_krdanta(self, surface: str) -> bool:
"""
FIX 3: Recognize kṛdanta (verbal derivative) forms.
These should be kept as units, not split further.
Kṛdanta indicators:
- Ends with participial suffix preceded by verbal root
- The whole form is in kosha as a recognized derivative
"""
KRDANTA_SUFFIXES = [
('mAna', 4), # Present participle (ātmanepada)
('Ana', 3), # Present participle
('tavat', 5), # Past active participle
('ta', 2), # Past passive participle (kta)
('in', 2), # Agent noun (ṇini)
('aka', 3), # Agent noun (ṇvul)
('tR', 2), # Agent noun (tṛc)
]
for suffix, min_root in KRDANTA_SUFFIXES:
if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
root = surface[:-len(suffix)]
# Check if root looks like a valid verbal root
# Valid roots are usually in kosha
for candidate in self._try_sandhi_reversal(root):
if self.analyzer._in_kosha(candidate):
return True
return False
    def _recursive_split(self, word: str, memo: Optional[dict] = None) -> List[str]:
"""
Recursively split a compound into maximal valid components.
IMPROVED ALGORITHM with three fixes:
1. FIX 1: Derivational spine continuation - keep collapsing if stem+suffix both valid
2. FIX 2: Multi-head splitting - if token has multiple kosha heads, force split
3. FIX 3: Kṛdanta recognition - keep participles as atomic units
Uses memoization to avoid exponential blowup.
"""
if memo is None:
memo = {}
if word in memo:
return memo[word]
# FIX 3: If it's a recognized kṛdanta, keep it atomic
if self._is_krdanta(word) and self._is_valid_stem(word):
memo[word] = [word]
return [word]
# FIX 2: Force split if token is long and contains multiple kosha heads
MAX_TOKEN_LEN = 15 # Tokens longer than this that have multiple heads must split
if len(word) > MAX_TOKEN_LEN:
head_count = self._count_kosha_heads(word)
if head_count > 1:
# Don't return early - we MUST try to split this
pass # Continue to splitting logic
else:
# Single head or no heads - if valid, keep it
if self._is_valid_stem(word):
memo[word] = [word]
return [word]
else:
# Base case: if word itself is valid AND not too long, return it
if self._is_valid_stem(word):
memo[word] = [word]
return [word]
# Base case: too short to split
if len(word) < 4:
memo[word] = [word]
return [word]
best_parse = [word] # Default: no split
best_score = -1000 # Start negative to ensure any valid split wins
min_len = 3 # Minimum 3 chars to prevent rA, nA splits
# Try all split points
for i in range(min_len, len(word) - min_len + 1):
left = word[:i]
right = word[i:]
# Check if left is valid (with Sandhi reversal)
if self._is_valid_stem(left):
# FIX 1: Derivational spine continuation
# If left is a valid stem, check if left+next_suffix also forms a valid stem
# This prevents over-splitting inside known words like bhAvanA
spine_continued = False
for ext_len in range(3, min(len(right) + 1, 8)): # Try extending by 3-7 chars
extended = left + right[:ext_len]
if self._is_valid_stem(extended):
# The spine continues! Don't split here, try a longer left
spine_continued = True
break
# Only split if spine doesn't continue OR if we're at a very long boundary
if spine_continued and len(left) < 10:
continue # Skip this split point, try longer
# Recursively split the right side
right_parse = self._recursive_split(right, memo)
# Count valid components in this parse
full_parse = [left] + right_parse
valid_count = sum(1 for comp in full_parse if self._is_valid_stem(comp))
# IMPROVED SCORING:
# 1. Reward valid components heavily
# 2. PENALIZE many components (prefer fewer, longer splits)
# 3. PENALIZE short components (< 5 chars)
# 4. REWARD if components are known kosha stems (not just valid via suffix)
num_components = len(full_parse)
avg_len = sum(len(c) for c in full_parse) / num_components
short_penalty = sum(1 for c in full_parse if len(c) < 5)
# Bonus for components that are DIRECTLY in kosha (not via suffix stripping)
direct_kosha_bonus = sum(10 for c in full_parse
if self.analyzer._in_kosha(c) or
any(self.analyzer._in_kosha(x) for x in self._try_sandhi_reversal(c)))
# Score formula: favor valid + long + few components + direct kosha
score = (valid_count * 100 # Valid components matter most
- num_components * 15 # Penalize many splits (reduced from 20)
+ avg_len * 5 # Reward longer components
- short_penalty * 40 # Penalize short fragments (reduced from 50)
+ direct_kosha_bonus) # Bonus for direct kosha stems
if score > best_score:
best_score = score
best_parse = full_parse
memo[word] = best_parse
return best_parse
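    # Worked example of the scoring above for a hypothetical parse
    # ["deva", "datta"] of "devadatta", assuming both parts are direct
    # Kosha entries:
    #   valid_count = 2     -> +200.0
    #   num_components = 2  ->  -30.0
    #   avg_len = 4.5       ->  +22.5
    #   short_penalty = 1   ->  -40.0   ("deva" is under 5 chars)
    #   direct_kosha_bonus  ->  +20.0
    #   score               ->  172.5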
def _longest_left_split(self, word: str) -> Optional[Tuple[str, str]]:
"""
Find the longest valid left stem greedily WITH SANDHI REVERSAL.
For unknown prefixes, tries consonant/vowel Sandhi reversions:
- vidyud -> vidyut (d -> t before vowel)
- buddhy -> buddhi (y -> i for elided vowel)
"""
min_len = 3 # Minimum valid stem length
# Scan from longest left to shortest
for i in range(len(word) - min_len, min_len - 1, -1):
left = word[:i]
right = word[i:]
# Try ALL Sandhi reversal candidates for left
left_valid = False
left_candidates = self._try_sandhi_reversal(left)
for candidate in left_candidates:
if self.analyzer._in_kosha(candidate):
left_valid = True
break
# Also try with vowel adjustments
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
left_valid = True
break
if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
left_valid = True
break
if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
left_valid = True
break
if left_valid and len(right) >= min_len:
# Check if right is valid using Sandhi reversal
right_valid = False
right_candidates = self._try_sandhi_reversal(right)
for candidate in right_candidates:
if self.analyzer._in_kosha(candidate):
right_valid = True
break
# Try with vowel adjustments
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
right_valid = True
break
# Try lookahead on right (for compound remainders)
if not right_valid:
for j in range(min_len, min(len(right), 15)):
prefix = right[:j]
# Try all Sandhi reversals on the prefix
prefix_candidates = self._try_sandhi_reversal(prefix)
for candidate in prefix_candidates:
if self.analyzer._in_kosha(candidate):
right_valid = True
break
if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
right_valid = True
break
if right_valid:
break
# Sandhi restoration: if left ended with long vowel, right may need prefix
if not right_valid and left.endswith('A') and right[0] not in 'aAiIuUeEoO':
restored = 'A' + right
restored_candidates = self._try_sandhi_reversal(restored)
for candidate in restored_candidates:
if self.analyzer._in_kosha(candidate):
right_valid = True
break
if not right_valid:
for j in range(min_len, min(len(restored), 12)):
if self.analyzer._in_kosha(restored[:j]):
right_valid = True
break
if right_valid:
return (left, right)
return None
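    # Sketch (assuming "vidyut" and "latA" resolve in the Kosha): for
    # "vidyudlatA" the scan tries left="vidyud", whose reversal candidate
    # "vidyut" validates, then validates the remainder "latA" the same
    # way, returning ("vidyud", "latA").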
def _find_split_candidates(self, word: str) -> List[int]:
"""Find potential split points based on stem cache validation."""
candidates = []
min_component = 2 # Minimum component length
# Endings to strip when validating
ENDINGS = ["M", "H", "aM", "am", "aH", "At", "ena", "Aya", "asya",
"e", "O", "AnAm", "A", "I", "U", "AN", "An", "i"]
for i in range(min_component, len(word) - min_component + 1):
left = word[:i]
right = word[i:]
# Check left side (try as-is, then with vowel additions/normalization)
left_valid = self.analyzer._in_kosha(left)
if not left_valid:
for suffix in ["a", "A", "i", "I", "u", "U"]:
if self.analyzer._in_kosha(left + suffix):
left_valid = True
break
# Sandhi reversal: if left ends with long vowel, try normalizing
if not left_valid and left.endswith('A'):
if self.analyzer._in_kosha(left[:-1] + 'a'):
left_valid = True
if not left_valid and left.endswith('I'):
if self.analyzer._in_kosha(left[:-1] + 'i'):
left_valid = True
if not left_valid and left.endswith('U'):
if self.analyzer._in_kosha(left[:-1] + 'u'):
left_valid = True
# Check right side (try as-is, strip endings, add vowels)
right_valid = self.analyzer._in_kosha(right)
if not right_valid:
# Try stripping endings
for ending in sorted(ENDINGS, key=len, reverse=True):
if right.endswith(ending) and len(right) > len(ending) + 1:
stripped = right[:-len(ending)]
if self.analyzer._in_kosha(stripped):
right_valid = True
break
# Also try with vowel additions
for suffix in ["a", "A"]:
if self.analyzer._in_kosha(stripped + suffix):
right_valid = True
break
if right_valid:
break
if not right_valid:
# Try vowel additions
for suffix in ["a", "A", "i", "I"]:
if self.analyzer._in_kosha(right + suffix):
right_valid = True
break
# Sandhi reversal for right side: if left ends with long vowel,
# the vowel may have absorbed initial vowel of right.
# Try restoring: AtmA|bhAsa -> check A+bhAsa = AbhAsa
if not right_valid and len(right) > 2:
# Check if left ends with long vowel that could have eaten something
if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
# Right starts with consonant - maybe initial A was eaten
restored = 'A' + right
if self.analyzer._in_kosha(restored):
right_valid = True
elif len(restored) > 3:
# Try lookahead on restored
for j in range(3, min(len(restored), 12)):
if self.analyzer._in_kosha(restored[:j]):
right_valid = True
break
elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
restored = 'I' + right
if self.analyzer._in_kosha(restored):
right_valid = True
elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
restored = 'U' + right
if self.analyzer._in_kosha(restored):
right_valid = True
# Also check if right itself starts a sub-compound (Recursive Lookahead)
if not right_valid and len(right) > 3:
# Try to find ANY valid item at start of right
# Check prefixes of length 3 to 12
for j in range(3, min(len(right), 15)):
prefix = right[:j]
if self.analyzer._in_kosha(prefix):
right_valid = True
break
# Sandhi normalization: if prefix ends with long vowel, try short
# AtmA -> Atma, prAtI -> prAti, etc.
if prefix.endswith('A'):
normalized = prefix[:-1] + 'a'
if self.analyzer._in_kosha(normalized):
right_valid = True
break
elif prefix.endswith('I'):
normalized = prefix[:-1] + 'i'
if self.analyzer._in_kosha(normalized):
right_valid = True
break
elif prefix.endswith('U'):
normalized = prefix[:-1] + 'u'
if self.analyzer._in_kosha(normalized):
right_valid = True
break
# If still not found, check known initials
if not right_valid:
for initial in self.COMPOUND_INITIALS + list(self.COMPOUND_FINALS):
if right.startswith(initial) and len(initial) >= 2:
right_valid = True
break
# DEBUG
# if "sopAdhika" in word:
# print(f"Check {left} | {right} -> L:{left_valid} R:{right_valid}")
if left_valid and right_valid:
candidates.append(i)
return candidates
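    # Sketch: _find_split_candidates("hfdpadma") tests each boundary, e.g.
    # "hfd" | "padma" at i=3; assuming both sides resolve in the Kosha,
    # the index 3 is appended to candidates.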
    def _score_split(self, components: List[str], word: str) -> int:
        """
        Score a candidate parse of `word` (revised scorer with a
        strengthened validity bonus; cf. the V4 scorer in split_v4).
        """
        # Base: Squared length favors fewer, longer components
        score = sum(len(c)**2 for c in components)
        # --- PENALTIES ---
        for c in components:
            if len(c) < 4:
                if not self._is_valid_stem(c):
                    score -= 50
                else:
                    score -= 5
        if len(components) > 2:
            score -= (len(components) - 2) * 20
        # --- BONUSES ---
        # 1. VALIDITY BONUS (THE FIX)
        # Old value: 30. New value: 100.
        # This ensures that 164 (split score) + 200 (bonus) > 289 (garbage score)
        valid_count = sum(1 for c in components if self._is_valid_stem(c))
        score += valid_count * 100
        # 2. SURVIVAL BONUS (Protects rAmo, namaH)
        if len(components) == 1:
            if self._is_valid_stem(components[0]):
                score += 50
        # 3. Compound Pattern Bonus
        if len(components) >= 2:
            left = components[0]
            right = components[-1]
            if left in self.COMPOUND_INITIALS:
                score += 15
            # Check Right Final
            r_stem, _ = self.analyzer._extract_vibhakti(right)
            if r_stem in self.COMPOUND_FINALS:
                score += 25
            elif right in self.COMPOUND_FINALS:
                score += 25
            if abs(len(left) - len(right)) <= 1:
                score += 10
        # 4. Expansion penalty (RELAXED)
        # We removed the "elif expansion == 0: score += 20" trap.
        total_len = sum(len(c) for c in components)
        expansion = total_len - len(word)
        if expansion > 1:
            score -= (expansion - 1) * 25
        return score
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
"""
Split a compound word into its components.
        Uses the recursive compositional algorithm with Kosha validation.
Returns original word if no valid split found.
"""
if len(word) < 4:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Check if word itself is in Kosha (might not be compound)
# KEY FIX: If word is already a known stem (lexicalized), DO NOT SPLIT
# This protects 'paramAtma', 'kzetrajYa', 'sopAdhika' from being broken down
if self.analyzer._in_kosha(word):
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Use RECURSIVE COMPOSITIONAL algorithm
# Tries ALL split points, recursively parses right sides,
# returns parse with MOST valid components
components = self._recursive_split(word)
if len(components) <= 1:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Calculate split points from components
split_points = []
pos = 0
for comp in components[:-1]:
pos += len(comp)
split_points.append(pos)
return CompoundSplit(
surface=word, components=components,
split_points=split_points, is_compound=True,
compound_type=None # We don't classify samāsa types
)
    def _split_dp(self, word: str, memo: Optional[dict] = None) -> List[List[str]]:
"""
V4 Algorithm: Memoized Dynamic Programming with Sandhi Expansion.
Returns all valid splits, cached by suffix.
Handles coalescent sandhi (e=a+i, o=a+u, etc.) that V3 misses.
"""
if memo is None:
memo = {}
if word in memo:
return memo[word]
        # Base: too short to split (memoized like the general case)
        if len(word) <= 2:
            result = [[word]] if self._is_valid_stem(word) else []
            memo[word] = result
            return result
valid_splits = []
# 1. OPTION A: The whole word is a stem (Lexicalized)
if self._is_valid_stem(word):
valid_splits.append([word])
# DO NOT RETURN EARLY. Keep looking for splits!
# 2. OPTION B: Split it (Generative Sandhi)
# Try each split position with sandhi expansion
for i in range(2, len(word) - 1):
for left, right in self.sandhi_engine.generate_splits(word, i):
if len(left) < 2 or len(right) < 2:
continue
if self._is_valid_stem(left):
# Recurse on right (memoized!)
right_splits = self._split_dp(right, memo)
for rs in right_splits:
valid_splits.append([left] + rs)
memo[word] = valid_splits
return valid_splits
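    # Example of the DP enumeration (assuming "gaRa" and "ISa" are Kosha
    # entries): for "gaReSa", the sandhi engine can expand the coalesced
    # 'e' back into 'a' + 'I', so the parse ["gaRa", "ISa"] is collected
    # alongside any lexicalized whole-word reading.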
def split_v4(self, word: str) -> CompoundSplit:
"""
V4 Split: Uses generative sandhi expansion for coalescent sandhi.
Handles:
- Vowel coalescence: gaṇeśa → gaṇa + īśa (e = a+i)
- Visarga sandhi: punarjanma → punaH + janma
- Vṛddhi: tavaiva → tava + eva
"""
if len(word) < 4:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# Use V4 DP algorithm
all_splits = self._split_dp(word)
if not all_splits:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
# SCORING STRATEGY:
# Balance: prefer splits, but penalize over-fragmentation.
        # 1. Penalize short components (< 4 chars) heavily
# 2. Prefer 2-component splits over 3+ components
# 3. Single long tokens get moderate penalty
# V4 Scoring with Compound Pattern Recognition
def score_split(components):
# Base: Squared length favors fewer, longer components
score = sum(len(c)**2 for c in components)
# --- PENALTIES ---
# 1. Short junk penalty (unless it's a valid stem)
for c in components:
if len(c) < 4:
if not self._is_valid_stem(c):
score -= 50 # Garbage fragment
else:
score -= 5 # Valid but short (e.g. 'ISa'), slight penalty
# 2. Fragmentation penalty
if len(components) > 2:
score -= (len(components) - 2) * 30 # Increased penalty
# 3. 2-component bonus (optimal compound structure)
if len(components) == 2:
score += 25
# --- BONUSES ---
# 0. COMMON_WORDS Protection (namaH, rAmo should stay atomic)
if len(components) == 1 and components[0] in self.COMMON_WORDS:
score += 50 # Strong bonus to prevent splitting
# 1. Validity Bonus (Crucial for pataye/rAmo)
# Use _is_valid_stem so declined words get credit
valid_count = sum(1 for c in components if self._is_valid_stem(c))
score += valid_count * 30
# 2. Compound Pattern Bonus (The Fix for gaRapataye)
if len(components) >= 2:
left = components[0]
right = components[-1]
# Check Left against Initials
if left in self.COMPOUND_INITIALS:
score += 15
# Check Right against Finals
# Need to extract stem to match (pataye -> pati)
for final in self.COMPOUND_FINALS:
if right.startswith(final) or right == final:
score += 25 # High bonus for matching pattern like 'pati'
break
# Try stripping vibhakti
if right.endswith('aye') and right[:-3] + 'i' == final:
score += 25
break
if right.endswith('ave') and right[:-3] + 'u' == final:
score += 25
break
# Balance bonus
if abs(len(left) - len(right)) <= 1:
score += 10
# 4. Expansion penalty (sandhi artifacts add characters)
# Allow 1 char expansion for sandhi (e → a+I), only penalize 2+ extra chars
total_len = sum(len(c) for c in components)
expansion = total_len - len(word)
if expansion > 1:
score -= (expansion - 1) * 25 # Stronger penalty
elif expansion == 0:
score += 20 # Bonus for exact-length splits (no sandhi artifact)
return score
best_split = max(all_splits, key=score_split)
if len(best_split) <= 1:
return CompoundSplit(
surface=word, components=[word],
split_points=[], is_compound=False, compound_type=None
)
return CompoundSplit(
surface=word, components=best_split,
split_points=[], is_compound=True, compound_type=None
)
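    # Usage sketch (result depends on the loaded Kosha):
    #   result = splitter.split_v4("gaReSa")
    #   # expected: components like ["gaRa", "ISa"] via the e = a + I expansion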
def split_multiple(self, words: List[str]) -> List[CompoundSplit]:
"""Split multiple words."""
return [self.split(w) for w in words]
# --- TEST ---
if __name__ == "__main__":
print("Testing SamasaSplitter...")
splitter = SamasaSplitter()
test_compounds = [
"hfdpadma",
"paramAtma",
"mahArAja",
"devadatta",
"rAjakumAra",
"sopAdhika",
]
for word in test_compounds:
result = splitter.split(word)
if result.is_compound:
print(f" {word:20}{' + '.join(result.components)}")
else:
print(f" {word:20} → (not split)")