|
|
"""
|
|
|
Samāsa (Compound) Splitter
|
|
|
Detects and splits Sanskrit compound words at their boundaries.
|
|
|
"""
|
|
|
|
|
|
from typing import List, Tuple, Optional
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
from analyzer import VidyutAnalyzer, MorphParse
|
|
|
from sandhi_engine import SandhiEngine
|
|
|
|
|
|
|
|
|
@dataclass
class CompoundSplit:
    """Result of compound splitting.

    Produced by SamasaSplitter.split() / .split_v4(). Describes whether the
    surface form was recognized as a compound and, if so, its components.
    """

    # The original (possibly sandhi-joined) word exactly as passed in.
    surface: str
    # Component stems in left-to-right order; [surface] when not split.
    components: List[str]
    # Character offsets into `surface` where boundaries fall.  Empty when
    # not a compound (and split_v4 leaves it empty even for compounds,
    # since sandhi expansion can change component lengths).
    split_points: List[int]
    # True when at least one boundary was accepted.
    is_compound: bool
    # Samāsa category (tatpuruṣa, bahuvrīhi, ...); every constructor call
    # in this module currently passes None.
    compound_type: Optional[str]
|
|
|
|
|
|
|
|
|
class SamasaSplitter:
    """
    Splits Sanskrit compound words (samāsa) at their boundaries.
    Uses Kosha lookups to validate potential split points.
    """

    # NOTE(review): entries below appear to use SLP1 transliteration
    # (capitals encode long vowels / retroflexes, e.g. A=ā, I=ī, R=ṇ,
    # z=ṣ) — confirm against the analyzer's expected encoding.

    # Stems that frequently occur as the FINAL member of a compound.
    # Used as a scoring bonus for candidate splits and as a lenient
    # fallback when validating the right-hand side of a split.
    COMPOUND_FINALS = [
        "kara", "kAra", "kArin", "kft", "kftya",
        "gata", "gati", "gamana",
        "ja", "jAta", "janman",
        "Da", "DAra", "DAraka", "DArin",
        "maya", "mat", "vat",
        "pati", "nATa", "ISvara", "adhipa",
        "Atman", "rUpa", "svarUpa",
        "pada", "pAduka",
        "stha", "sthita", "sthAna",
        "yukta", "hIna", "rahita",
        "priya", "rata", "ASrita",
        "vid", "jYa", "vadin", "pAla",
        "rAja", "indra", "deva", "loka",
        "karziR", "AkarziRi", "ISa",
    ]

    # Stems/prefixes that frequently open a compound (including the
    # negating "a"/"an" and intensifiers like "mahA", "su", "dur").
    # Used as a scoring bonus when the first component matches.
    COMPOUND_INITIALS = [
        "mahA", "ati", "su", "dur", "sat", "a", "an",
        "sarva", "viSva", "eka", "bahu",
        "deva", "brahma", "Atma", "para",
        "rAja", "mahI", "loka",
        "hfd", "manas", "citta",
        "padma", "kamala", "Ananda", "ISa",
    ]

    # Whole words that must always be accepted as valid stems, bypassing
    # kosha lookup entirely (see _is_valid_stem step 0).  Protects very
    # common forms the kosha may not index under these exact surfaces.
    COMMON_WORDS = {
        "namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
        "pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya", "AkarziRi",
    }
|
|
|
|
|
|
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
|
|
"""Initialize with optional shared analyzer."""
|
|
|
self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
|
|
|
self.sandhi_engine = SandhiEngine()
|
|
|
|
|
|
|
|
|
|
|
|
SANDHI_REVERSIONS = {
|
|
|
|
|
|
'd': ['t', 'd'],
|
|
|
'g': ['k', 'g'],
|
|
|
'b': ['p', 'b'],
|
|
|
'D': ['T', 'D'],
|
|
|
'j': ['c', 'j'],
|
|
|
'z': ['s', 'z'],
|
|
|
|
|
|
'A': ['a', 'A'],
|
|
|
'I': ['i', 'I'],
|
|
|
'U': ['u', 'U'],
|
|
|
'e': ['a', 'i'],
|
|
|
'o': ['a', 'u'],
|
|
|
'ai': ['a', 'e'],
|
|
|
'au': ['a', 'o'],
|
|
|
|
|
|
'cC': ['t', 'c'],
|
|
|
'jj': ['d', 'j'],
|
|
|
'DD': ['D', 'D'],
|
|
|
|
|
|
'o': ['aH'],
|
|
|
'ar': ['aH'],
|
|
|
}
|
|
|
|
|
|
def _try_sandhi_reversal(self, surface: str, min_stem_len: int = 3) -> List[str]:
|
|
|
"""
|
|
|
Try to recover original stems from Sandhi-modified surface forms.
|
|
|
Returns list of possible original forms, ordered by likelihood.
|
|
|
"""
|
|
|
candidates = [surface]
|
|
|
|
|
|
|
|
|
|
|
|
TRANSLIT_MAP = [
|
|
|
('bh', 'B'), ('dh', 'D'), ('gh', 'G'), ('ph', 'P'),
|
|
|
('th', 'T'), ('kh', 'K'), ('ch', 'C'), ('jh', 'J'),
|
|
|
('Th', 'W'), ('Dh', 'Q'),
|
|
|
]
|
|
|
normalized = surface
|
|
|
for digraph, single in TRANSLIT_MAP:
|
|
|
normalized = normalized.replace(digraph, single)
|
|
|
if normalized != surface:
|
|
|
candidates.append(normalized)
|
|
|
|
|
|
|
|
|
for form in [surface, normalized]:
|
|
|
if len(form) >= min_stem_len and form[-1] in self.SANDHI_REVERSIONS:
|
|
|
for original in self.SANDHI_REVERSIONS[form[-1]]:
|
|
|
candidate = form[:-1] + original
|
|
|
if candidate not in candidates:
|
|
|
candidates.append(candidate)
|
|
|
|
|
|
|
|
|
|
|
|
for form in [surface, normalized]:
|
|
|
if form.endswith('y') and len(form) >= min_stem_len:
|
|
|
candidates.append(form[:-1] + 'i')
|
|
|
if form.endswith('v') and len(form) >= min_stem_len:
|
|
|
candidates.append(form[:-1] + 'u')
|
|
|
|
|
|
|
|
|
seen = set()
|
|
|
unique = []
|
|
|
for c in candidates:
|
|
|
if c not in seen:
|
|
|
seen.add(c)
|
|
|
unique.append(c)
|
|
|
|
|
|
return unique
|
|
|
|
|
|
    def _is_valid_stem(self, surface: str) -> bool:
        """
        Check if a surface form is a valid stem, trying:
        0. COMMON_WORDS protection
        1. Direct Kosha lookup
        2. Visarga/Anusvara base check (rAmaH → rAma)
        3. Sandhi reversal
        4. Pratyaya (suffix) stripping

        Returns True on the first strategy that succeeds; each later
        strategy is progressively more permissive, so ordering matters.
        """
        # Anything shorter than two characters is never a stem.
        if len(surface) < 2:
            return False

        # Step 0: hard-coded whitelist of common forms.
        if surface in self.COMMON_WORDS:
            return True

        # Step 1: exact kosha hit.
        if self.analyzer._in_kosha(surface):
            return True

        # Step 2: strip a trailing visarga (H) or anusvara (M) and retry
        # (e.g. rAmaH -> rAma, gfhaM -> gfha).
        if surface.endswith('H') and len(surface) > 2:
            base = surface[:-1]
            if self.analyzer._in_kosha(base):
                return True
        if surface.endswith('M') and len(surface) > 2:
            base = surface[:-1]
            if self.analyzer._in_kosha(base):
                return True

        # Step 3: generate sandhi-reversed candidates and look each up,
        # also trying long-vowel -> short-vowel final normalization and
        # a visarga strip on every candidate.
        candidates = self._try_sandhi_reversal(surface)
        for candidate in candidates:
            if self.analyzer._in_kosha(candidate):
                return True

            # Feminine/long finals often index under the short vowel.
            if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                return True
            if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
                return True
            if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                return True

            if candidate.endswith('H') and len(candidate) > 2:
                if self.analyzer._in_kosha(candidate[:-1]):
                    return True

        # Step 3b: strip case (vibhakti) endings, longest first so e.g.
        # 'asya' is removed before a bare 'a' ending could match.
        VIBHAKTI_ENDINGS = [
            'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH',
            'An', 'EH', 'eBya', 'AnAm', 'ezu',
            'au', 'OH', 'AvyAm',
            'aye',
            'ave',
        ]
        for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
            # Require at least 3 characters of stem to remain.
            if surface.endswith(ending) and len(surface) > len(ending) + 2:
                stem = surface[:-len(ending)]
                if self.analyzer._in_kosha(stem):
                    return True

                # a-stems usually index with the thematic 'a' restored.
                if self.analyzer._in_kosha(stem + 'a'):
                    return True

                # i-stem dative: e.g. -aye comes from stems in -i.
                if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
                    return True

                # u-stem dative: e.g. -ave comes from stems in -u.
                if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
                    return True

        # Step 4: strip derivational suffixes (pratyayas) and check that
        # a plausible root remains.  min_root bounds how short the
        # remaining root may be for each suffix.
        PRATYAYAS = [
            ('ana', 3),
            ('Ana', 3),
            ('tva', 3),
            ('tA', 2),
            ('ya', 2),
            ('ta', 2),
            ('ti', 2),
            ('in', 2),
            ('ika', 3),
            ('Iya', 3),

            # Feminine derivative endings.
            ('iRi', 3),
            ('iRI', 3),
            ('inI', 3),
            ('ikA', 3),
            ('trI', 3),
        ]

        for suffix, min_root in PRATYAYAS:
            if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
                root = surface[:-len(suffix)]

                if self.analyzer._in_kosha(root):
                    return True

                if self.analyzer._in_kosha(root + 'a'):
                    return True

                # Undo retroflexion: R in the root may come from f (ṛ).
                root_f = root.replace('R', 'f')
                if root_f != root and self.analyzer._in_kosha(root_f):
                    return True

                # Last resort: sandhi-reverse the candidate root too.
                for r in self._try_sandhi_reversal(root):
                    if self.analyzer._in_kosha(r):
                        return True

        return False
|
|
|
|
|
|
def _count_kosha_heads(self, surface: str, min_head_len: int = 5) -> int:
|
|
|
"""
|
|
|
FIX 2: Count how many valid kosha stems exist inside a long string.
|
|
|
Used to detect mega-tokens that swallowed multiple stems.
|
|
|
"""
|
|
|
if len(surface) < min_head_len * 2:
|
|
|
return 1 if self._is_valid_stem(surface) else 0
|
|
|
|
|
|
heads = 0
|
|
|
i = 0
|
|
|
while i < len(surface) - min_head_len + 1:
|
|
|
|
|
|
for j in range(min(len(surface), i + 15), i + min_head_len - 1, -1):
|
|
|
candidate = surface[i:j]
|
|
|
if len(candidate) >= min_head_len and self._is_valid_stem(candidate):
|
|
|
heads += 1
|
|
|
i = j
|
|
|
break
|
|
|
else:
|
|
|
i += 1
|
|
|
return max(heads, 1 if self._is_valid_stem(surface) else 0)
|
|
|
|
|
|
def _is_krdanta(self, surface: str) -> bool:
|
|
|
"""
|
|
|
FIX 3: Recognize kṛdanta (verbal derivative) forms.
|
|
|
These should be kept as units, not split further.
|
|
|
|
|
|
Kṛdanta indicators:
|
|
|
- Ends with participial suffix preceded by verbal root
|
|
|
- The whole form is in kosha as a recognized derivative
|
|
|
"""
|
|
|
KRDANTA_SUFFIXES = [
|
|
|
('mAna', 4),
|
|
|
('Ana', 3),
|
|
|
('tavat', 5),
|
|
|
('ta', 2),
|
|
|
('in', 2),
|
|
|
('aka', 3),
|
|
|
('tR', 2),
|
|
|
]
|
|
|
|
|
|
for suffix, min_root in KRDANTA_SUFFIXES:
|
|
|
if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
|
|
|
root = surface[:-len(suffix)]
|
|
|
|
|
|
|
|
|
for candidate in self._try_sandhi_reversal(root):
|
|
|
if self.analyzer._in_kosha(candidate):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
    def _recursive_split(self, word: str, memo: Optional[dict] = None) -> List[str]:
        """
        Recursively split a compound into maximal valid components.

        IMPROVED ALGORITHM with three fixes:
        1. FIX 1: Derivational spine continuation - keep collapsing if stem+suffix both valid
        2. FIX 2: Multi-head splitting - if token has multiple kosha heads, force split
        3. FIX 3: Kṛdanta recognition - keep participles as atomic units

        Uses memoization to avoid exponential blowup.

        Returns a list of components; a single-element list means the
        word was left whole.
        """
        if memo is None:
            memo = {}

        if word in memo:
            return memo[word]

        # FIX 3: a valid kṛdanta is treated as atomic — no further split.
        if self._is_krdanta(word) and self._is_valid_stem(word):
            memo[word] = [word]
            return [word]

        # FIX 2: very long tokens are only accepted whole when they do
        # NOT contain multiple kosha heads; multi-head tokens fall
        # through to the boundary search below.
        MAX_TOKEN_LEN = 15
        if len(word) > MAX_TOKEN_LEN:
            head_count = self._count_kosha_heads(word)
            if head_count > 1:
                # Force the split search even though the whole word might
                # validate as a stem.
                pass
            else:
                if self._is_valid_stem(word):
                    memo[word] = [word]
                    return [word]
        else:
            # Normal-length tokens: a valid stem is kept whole.
            if self._is_valid_stem(word):
                memo[word] = [word]
                return [word]

        # Too short to split into two 3-char components.
        if len(word) < 4:
            memo[word] = [word]
            return [word]

        best_parse = [word]
        best_score = -1000

        # Minimum length for either side of a split.
        min_len = 3

        for i in range(min_len, len(word) - min_len + 1):
            left = word[:i]
            right = word[i:]

            if self._is_valid_stem(left):

                # FIX 1: if extending the left stem by a few characters
                # still yields a valid stem, the cut is probably inside a
                # derivational "spine" (stem + suffix) — skip it.
                spine_continued = False
                for ext_len in range(3, min(len(right) + 1, 8)):
                    extended = left + right[:ext_len]
                    if self._is_valid_stem(extended):
                        spine_continued = True
                        break

                # Only skip for shortish lefts; very long lefts (>= 10)
                # are allowed to stop here anyway.
                if spine_continued and len(left) < 10:
                    continue

                # Recursively decompose the remainder.
                right_parse = self._recursive_split(right, memo)

                full_parse = [left] + right_parse
                valid_count = sum(1 for comp in full_parse if self._is_valid_stem(comp))

                # Heuristic scoring: reward valid components and direct
                # kosha hits, penalize fragmentation and short pieces.
                num_components = len(full_parse)
                avg_len = sum(len(c) for c in full_parse) / num_components
                short_penalty = sum(1 for c in full_parse if len(c) < 5)

                # Direct kosha membership (possibly via sandhi reversal)
                # is a stronger signal than _is_valid_stem's fallbacks.
                direct_kosha_bonus = sum(10 for c in full_parse
                                         if self.analyzer._in_kosha(c) or
                                         any(self.analyzer._in_kosha(x) for x in self._try_sandhi_reversal(c)))

                score = (valid_count * 100
                         - num_components * 15
                         + avg_len * 5
                         - short_penalty * 40
                         + direct_kosha_bonus)

                if score > best_score:
                    best_score = score
                    best_parse = full_parse

        memo[word] = best_parse
        return best_parse
|
|
|
|
|
|
    def _longest_left_split(self, word: str) -> Optional[Tuple[str, str]]:
        """
        Find the longest valid left stem greedily WITH SANDHI REVERSAL.

        For unknown prefixes, tries consonant/vowel Sandhi reversions:
        - vidyud -> vidyut (d -> t before vowel)
        - buddhy -> buddhi (y -> i for elided vowel)

        Returns the first (left, right) pair — i.e. with the longest
        possible left — where both sides validate, or None.
        """
        min_len = 3

        # Iterate split points from the rightmost (longest left) down.
        for i in range(len(word) - min_len, min_len - 1, -1):
            left = word[:i]
            right = word[i:]

            # Validate the left side: any sandhi-reversed candidate that
            # hits the kosha (directly or with its long final vowel
            # shortened) makes it valid.
            left_valid = False
            left_candidates = self._try_sandhi_reversal(left)
            for candidate in left_candidates:
                if self.analyzer._in_kosha(candidate):
                    left_valid = True
                    break

                if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                    left_valid = True
                    break
                if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
                    left_valid = True
                    break
                if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                    left_valid = True
                    break

            if left_valid and len(right) >= min_len:
                # Validate the right side with the same reversal logic
                # (only the 'A' shortening is tried here).
                right_valid = False
                right_candidates = self._try_sandhi_reversal(right)
                for candidate in right_candidates:
                    if self.analyzer._in_kosha(candidate):
                        right_valid = True
                        break

                    if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                        right_valid = True
                        break

                # Fallback 1: accept the right side if any of its prefixes
                # (3..14 chars) validates — the remainder may itself be a
                # further compound member.
                if not right_valid:
                    for j in range(min_len, min(len(right), 15)):
                        prefix = right[:j]

                        prefix_candidates = self._try_sandhi_reversal(prefix)
                        for candidate in prefix_candidates:
                            if self.analyzer._in_kosha(candidate):
                                right_valid = True
                                break
                            if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                                right_valid = True
                                break
                        if right_valid:
                            break

                # Fallback 2: the left's final 'A' may have absorbed the
                # right's initial vowel (savarṇa-dīrgha) — try restoring
                # an initial 'A' on the right.
                if not right_valid and left.endswith('A') and right[0] not in 'aAiIuUeEoO':
                    restored = 'A' + right
                    restored_candidates = self._try_sandhi_reversal(restored)
                    for candidate in restored_candidates:
                        if self.analyzer._in_kosha(candidate):
                            right_valid = True
                            break
                    if not right_valid:
                        for j in range(min_len, min(len(restored), 12)):
                            if self.analyzer._in_kosha(restored[:j]):
                                right_valid = True
                                break

            if right_valid:
                return (left, right)

        return None
|
|
|
|
|
|
    def _find_split_candidates(self, word: str) -> List[int]:
        """Find potential split points based on stem cache validation.

        Returns every index i (min_component <= i <= len-min_component)
        where both word[:i] and word[i:] pass a battery of increasingly
        lenient validity checks.  The ordering of the fallbacks below is
        deliberate: cheap exact lookups first, heuristics last.
        """
        candidates = []
        min_component = 2

        # Case endings stripped from the right side before lookup.
        ENDINGS = ["M", "H", "aM", "am", "aH", "At", "ena", "Aya", "asya",
                   "e", "O", "AnAm", "A", "I", "U", "AN", "An", "i"]

        for i in range(min_component, len(word) - min_component + 1):
            left = word[:i]
            right = word[i:]

            # --- Left side: exact lookup, then try restoring a final
            # vowel, then shorten a long final vowel. ---
            left_valid = self.analyzer._in_kosha(left)
            if not left_valid:
                for suffix in ["a", "A", "i", "I", "u", "U"]:
                    if self.analyzer._in_kosha(left + suffix):
                        left_valid = True
                        break

            if not left_valid and left.endswith('A'):
                if self.analyzer._in_kosha(left[:-1] + 'a'):
                    left_valid = True
            if not left_valid and left.endswith('I'):
                if self.analyzer._in_kosha(left[:-1] + 'i'):
                    left_valid = True
            if not left_valid and left.endswith('U'):
                if self.analyzer._in_kosha(left[:-1] + 'u'):
                    left_valid = True

            # --- Right side: exact lookup first. ---
            right_valid = self.analyzer._in_kosha(right)
            if not right_valid:
                # Strip case endings, longest first, and look up the stem
                # (also with a thematic 'a'/'A' restored).
                for ending in sorted(ENDINGS, key=len, reverse=True):
                    if right.endswith(ending) and len(right) > len(ending) + 1:
                        stripped = right[:-len(ending)]
                        if self.analyzer._in_kosha(stripped):
                            right_valid = True
                            break

                        for suffix in ["a", "A"]:
                            if self.analyzer._in_kosha(stripped + suffix):
                                right_valid = True
                                break
                        if right_valid:
                            break

            if not right_valid:
                # Try restoring a final vowel on the whole right side.
                for suffix in ["a", "A", "i", "I"]:
                    if self.analyzer._in_kosha(right + suffix):
                        right_valid = True
                        break

            # The left's long final vowel may have absorbed the right's
            # initial vowel — restore it on the right and retry.
            if not right_valid and len(right) > 2:

                if left.endswith('A') and right[0] not in 'aAiIuUeEoO':

                    restored = 'A' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
                    elif len(restored) > 3:
                        # Accept any known prefix of the restored form.
                        for j in range(3, min(len(restored), 12)):
                            if self.analyzer._in_kosha(restored[:j]):
                                right_valid = True
                                break
                elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
                    restored = 'I' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
                elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
                    restored = 'U' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True

            # Accept the right side if any prefix of it (3..14 chars)
            # is known — the rest may be a further compound member.
            if not right_valid and len(right) > 3:

                for j in range(3, min(len(right), 15)):
                    prefix = right[:j]
                    if self.analyzer._in_kosha(prefix):
                        right_valid = True
                        break

                    # Also try the prefix with its long final shortened.
                    if prefix.endswith('A'):
                        normalized = prefix[:-1] + 'a'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                    elif prefix.endswith('I'):
                        normalized = prefix[:-1] + 'i'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                    elif prefix.endswith('U'):
                        normalized = prefix[:-1] + 'u'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break

            # Most lenient fallback: the right side merely STARTS with a
            # known compound member.  NOTE(review): this accepts many
            # false positives by design; downstream scoring filters them.
            if not right_valid:
                for initial in self.COMPOUND_INITIALS + list(self.COMPOUND_FINALS):
                    if right.startswith(initial) and len(initial) >= 2:
                        right_valid = True
                        break

            if left_valid and right_valid:
                candidates.append(i)

        return candidates
|
|
|
|
|
|
def score_split(components):
|
|
|
|
|
|
score = sum(len(c)**2 for c in components)
|
|
|
|
|
|
|
|
|
for c in components:
|
|
|
if len(c) < 4:
|
|
|
if not self._is_valid_stem(c):
|
|
|
score -= 50
|
|
|
else:
|
|
|
score -= 5
|
|
|
|
|
|
if len(components) > 2:
|
|
|
score -= (len(components) - 2) * 20
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
valid_count = sum(1 for c in components if self._is_valid_stem(c))
|
|
|
score += valid_count * 100
|
|
|
|
|
|
|
|
|
if len(components) == 1:
|
|
|
if self._is_valid_stem(components[0]):
|
|
|
score += 50
|
|
|
|
|
|
|
|
|
if len(components) >= 2:
|
|
|
left = components[0]
|
|
|
right = components[-1]
|
|
|
|
|
|
if left in self.COMPOUND_INITIALS: score += 15
|
|
|
|
|
|
|
|
|
r_stem, _ = self.analyzer._extract_vibhakti(right)
|
|
|
if r_stem in self.COMPOUND_FINALS: score += 25
|
|
|
elif right in self.COMPOUND_FINALS: score += 25
|
|
|
|
|
|
if abs(len(left) - len(right)) <= 1: score += 10
|
|
|
|
|
|
|
|
|
total_len = sum(len(c) for c in components)
|
|
|
expansion = total_len - len(word)
|
|
|
if expansion > 1:
|
|
|
score -= (expansion - 1) * 25
|
|
|
return score
|
|
|
|
|
|
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
|
|
|
"""
|
|
|
Split a compound word into its components.
|
|
|
|
|
|
Uses greedy algorithm with Kosha validation.
|
|
|
Returns original word if no valid split found.
|
|
|
"""
|
|
|
if len(word) < 4:
|
|
|
return CompoundSplit(
|
|
|
surface=word, components=[word],
|
|
|
split_points=[], is_compound=False, compound_type=None
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if self.analyzer._in_kosha(word):
|
|
|
return CompoundSplit(
|
|
|
surface=word, components=[word],
|
|
|
split_points=[], is_compound=False, compound_type=None
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
components = self._recursive_split(word)
|
|
|
|
|
|
if len(components) <= 1:
|
|
|
return CompoundSplit(
|
|
|
surface=word, components=[word],
|
|
|
split_points=[], is_compound=False, compound_type=None
|
|
|
)
|
|
|
|
|
|
|
|
|
split_points = []
|
|
|
pos = 0
|
|
|
for comp in components[:-1]:
|
|
|
pos += len(comp)
|
|
|
split_points.append(pos)
|
|
|
|
|
|
return CompoundSplit(
|
|
|
surface=word, components=components,
|
|
|
split_points=split_points, is_compound=True,
|
|
|
compound_type=None
|
|
|
)
|
|
|
|
|
|
def _split_dp(self, word: str, memo: dict = None) -> List[List[str]]:
|
|
|
"""
|
|
|
V4 Algorithm: Memoized Dynamic Programming with Sandhi Expansion.
|
|
|
|
|
|
Returns all valid splits, cached by suffix.
|
|
|
Handles coalescent sandhi (e=a+i, o=a+u, etc.) that V3 misses.
|
|
|
"""
|
|
|
if memo is None:
|
|
|
memo = {}
|
|
|
|
|
|
if word in memo:
|
|
|
return memo[word]
|
|
|
|
|
|
|
|
|
if len(word) <= 2:
|
|
|
if self._is_valid_stem(word):
|
|
|
return [[word]]
|
|
|
return []
|
|
|
|
|
|
valid_splits = []
|
|
|
|
|
|
|
|
|
if self._is_valid_stem(word):
|
|
|
valid_splits.append([word])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(2, len(word) - 1):
|
|
|
for left, right in self.sandhi_engine.generate_splits(word, i):
|
|
|
if len(left) < 2 or len(right) < 2:
|
|
|
continue
|
|
|
|
|
|
if self._is_valid_stem(left):
|
|
|
|
|
|
right_splits = self._split_dp(right, memo)
|
|
|
for rs in right_splits:
|
|
|
valid_splits.append([left] + rs)
|
|
|
|
|
|
memo[word] = valid_splits
|
|
|
return valid_splits
|
|
|
|
|
|
    def split_v4(self, word: str) -> CompoundSplit:
        """
        V4 Split: Uses generative sandhi expansion for coalescent sandhi.

        Handles:
        - Vowel coalescence: gaṇeśa → gaṇa + īśa (e = a+i)
        - Visarga sandhi: punarjanma → punaH + janma
        - Vṛddhi: tavaiva → tava + eva

        Enumerates all sandhi-consistent splits via _split_dp, then picks
        the highest-scoring one.  split_points is always left empty here
        (reversal can change component lengths, so offsets into the
        surface are not well defined).
        """
        if len(word) < 4:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )

        # All sandhi-consistent decompositions (may include [word]).
        all_splits = self._split_dp(word)

        if not all_splits:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )

        def score_split(components):
            """Heuristic score for one decomposition; higher is better."""
            # Longer components are quadratically preferred.
            score = sum(len(c)**2 for c in components)

            # Penalize short components, heavily when invalid as stems.
            for c in components:
                if len(c) < 4:
                    if not self._is_valid_stem(c):
                        score -= 50
                    else:
                        score -= 5

            # Fragmentation penalty beyond two components.
            if len(components) > 2:
                score -= (len(components) - 2) * 30

            # The classic two-member compound gets a flat bonus.
            if len(components) == 2:
                score += 25

            # Whitelisted whole words are rewarded for staying unsplit.
            if len(components) == 1 and components[0] in self.COMMON_WORDS:
                score += 50

            # Reward each component that validates as a stem.
            valid_count = sum(1 for c in components if self._is_valid_stem(c))
            score += valid_count * 30

            if len(components) >= 2:
                left = components[0]
                right = components[-1]

                # Known compound opener.
                if left in self.COMPOUND_INITIALS:
                    score += 15

                # Known compound closer: prefix match, or a dative form
                # (-aye from -i stems, -ave from -u stems) of a final.
                for final in self.COMPOUND_FINALS:
                    if right.startswith(final) or right == final:
                        score += 25
                        break

                    if right.endswith('aye') and right[:-3] + 'i' == final:
                        score += 25
                        break
                    if right.endswith('ave') and right[:-3] + 'u' == final:
                        score += 25
                        break

                # Mild preference for balanced halves.
                if abs(len(left) - len(right)) <= 1:
                    score += 10

            # Penalize reversals that grow the text by more than one
            # character; reward exact-length reconstructions.
            total_len = sum(len(c) for c in components)
            expansion = total_len - len(word)
            if expansion > 1:
                score -= (expansion - 1) * 25
            elif expansion == 0:
                score += 20

            return score

        best_split = max(all_splits, key=score_split)

        if len(best_split) <= 1:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )

        return CompoundSplit(
            surface=word, components=best_split,
            split_points=[], is_compound=True, compound_type=None
        )
|
|
|
|
|
|
def split_multiple(self, words: List[str]) -> List[CompoundSplit]:
|
|
|
"""Split multiple words."""
|
|
|
return [self.split(w) for w in words]
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: split a handful of known compounds and show the result.
    print("Testing SamasaSplitter...")
    splitter = SamasaSplitter()

    samples = [
        "hfdpadma",
        "paramAtma",
        "mahArAja",
        "devadatta",
        "rAjakumAra",
        "sopAdhika",
    ]

    for word in samples:
        outcome = splitter.split(word)
        if outcome.is_compound:
            print(f" {word:20} → {' + '.join(outcome.components)}")
        else:
            print(f" {word:20} → (not split)")
|
|
|
|