Upload src/splitter.py with huggingface_hub
Browse files- src/splitter.py +104 -32
src/splitter.py
CHANGED
|
@@ -52,6 +52,12 @@ class SamasaSplitter:
|
|
| 52 |
"padma", "kamala", # Lotus
|
| 53 |
]
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
| 56 |
"""Initialize with optional shared analyzer."""
|
| 57 |
self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
|
|
@@ -133,14 +139,35 @@ class SamasaSplitter:
|
|
| 133 |
def _is_valid_stem(self, surface: str) -> bool:
|
| 134 |
"""
|
| 135 |
Check if a surface form is a valid stem, trying:
|
|
|
|
| 136 |
1. Direct Kosha lookup
|
| 137 |
-
2.
|
| 138 |
-
3.
|
|
|
|
| 139 |
"""
|
| 140 |
if len(surface) < 2:
|
| 141 |
return False
|
| 142 |
|
| 143 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
candidates = self._try_sandhi_reversal(surface)
|
| 145 |
for candidate in candidates:
|
| 146 |
if self.analyzer._in_kosha(candidate):
|
|
@@ -152,18 +179,18 @@ class SamasaSplitter:
|
|
| 152 |
return True
|
| 153 |
if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
|
| 154 |
return True
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
if self.analyzer._in_kosha(base):
|
| 160 |
-
return True
|
| 161 |
|
| 162 |
# Try VIBHAKTI STRIPPING (nominal case endings)
|
| 163 |
VIBHAKTI_ENDINGS = [
|
| 164 |
'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH', # Masculine a-stem
|
| 165 |
'An', 'EH', 'eBya', 'AnAm', 'ezu', # Masculine a-stem plural
|
| 166 |
'au', 'OH', 'AvyAm', # Dual
|
|
|
|
|
|
|
| 167 |
]
|
| 168 |
for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
|
| 169 |
if surface.endswith(ending) and len(surface) > len(ending) + 2:
|
|
@@ -173,6 +200,14 @@ class SamasaSplitter:
|
|
| 173 |
# Try with 'a' restoration (munipuMgavam → munipuMgava)
|
| 174 |
if self.analyzer._in_kosha(stem + 'a'):
|
| 175 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
# Try PRATYAYA STRIPPING (grammatical suffix removal)
|
| 178 |
# This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
|
|
@@ -802,38 +837,75 @@ class SamasaSplitter:
|
|
| 802 |
# 1. Penalize short components (< 3 chars) heavily
|
| 803 |
# 2. Prefer 2-component splits over 3+ components
|
| 804 |
# 3. Single long tokens get moderate penalty
|
| 805 |
-
#
|
| 806 |
def score_split(components):
|
| 807 |
-
|
|
|
|
| 808 |
|
| 809 |
-
#
|
| 810 |
-
|
| 811 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
|
| 813 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
if len(components) == 2:
|
| 815 |
-
|
| 816 |
|
| 817 |
-
#
|
| 818 |
-
|
| 819 |
-
|
|
|
|
| 820 |
|
| 821 |
-
#
|
| 822 |
-
|
| 823 |
-
|
|
|
|
| 824 |
|
| 825 |
-
#
|
| 826 |
-
if len(components)
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 830 |
|
| 831 |
-
#
|
|
|
|
| 832 |
total_len = sum(len(c) for c in components)
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
|
|
|
|
|
|
|
|
|
| 837 |
|
| 838 |
best_split = max(all_splits, key=score_split)
|
| 839 |
|
|
|
|
| 52 |
"padma", "kamala", # Lotus
|
| 53 |
]
|
| 54 |
|
| 55 |
+
# Hardcoded protection for high-frequency words that might be over-split
|
| 56 |
+
COMMON_WORDS = {
|
| 57 |
+
"namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
|
| 58 |
+
"pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya",
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
| 62 |
"""Initialize with optional shared analyzer."""
|
| 63 |
self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
|
|
|
|
| 139 |
def _is_valid_stem(self, surface: str) -> bool:
|
| 140 |
"""
|
| 141 |
Check if a surface form is a valid stem, trying:
|
| 142 |
+
0. COMMON_WORDS protection
|
| 143 |
1. Direct Kosha lookup
|
| 144 |
+
2. Visarga/Anusvara base check (rAmaH → rAma)
|
| 145 |
+
3. Sandhi reversal
|
| 146 |
+
4. Pratyaya (suffix) stripping
|
| 147 |
"""
|
| 148 |
if len(surface) < 2:
|
| 149 |
return False
|
| 150 |
|
| 151 |
+
# 0. Safety Check for Common Words (protect namaH, rAmo, etc.)
|
| 152 |
+
if surface in self.COMMON_WORDS:
|
| 153 |
+
return True
|
| 154 |
+
|
| 155 |
+
# 1. Direct Kosha Check
|
| 156 |
+
if self.analyzer._in_kosha(surface):
|
| 157 |
+
return True
|
| 158 |
+
|
| 159 |
+
# 2. Visarga/Anusvara Check (FIX for rAmaH validation)
|
| 160 |
+
# If sandhi-restored "rAmo" → "rAmaH", accept it if base "rAma" is in kosha
|
| 161 |
+
if surface.endswith('H') and len(surface) > 2:
|
| 162 |
+
base = surface[:-1]
|
| 163 |
+
if self.analyzer._in_kosha(base):
|
| 164 |
+
return True
|
| 165 |
+
if surface.endswith('M') and len(surface) > 2:
|
| 166 |
+
base = surface[:-1]
|
| 167 |
+
if self.analyzer._in_kosha(base):
|
| 168 |
+
return True
|
| 169 |
+
|
| 170 |
+
# 3. Try all Sandhi reversal candidates
|
| 171 |
candidates = self._try_sandhi_reversal(surface)
|
| 172 |
for candidate in candidates:
|
| 173 |
if self.analyzer._in_kosha(candidate):
|
|
|
|
| 179 |
return True
|
| 180 |
if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
|
| 181 |
return True
|
| 182 |
+
# Recursive visarga check for candidates too
|
| 183 |
+
if candidate.endswith('H') and len(candidate) > 2:
|
| 184 |
+
if self.analyzer._in_kosha(candidate[:-1]):
|
| 185 |
+
return True
|
|
|
|
|
|
|
| 186 |
|
| 187 |
# Try VIBHAKTI STRIPPING (nominal case endings)
|
| 188 |
VIBHAKTI_ENDINGS = [
|
| 189 |
'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH', # Masculine a-stem
|
| 190 |
'An', 'EH', 'eBya', 'AnAm', 'ezu', # Masculine a-stem plural
|
| 191 |
'au', 'OH', 'AvyAm', # Dual
|
| 192 |
+
'aye', # i-stem dative (pataye, munaye)
|
| 193 |
+
'ave', # u-stem dative (vizRave, gurave)
|
| 194 |
]
|
| 195 |
for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
|
| 196 |
if surface.endswith(ending) and len(surface) > len(ending) + 2:
|
|
|
|
| 200 |
# Try with 'a' restoration (munipuMgavam → munipuMgava)
|
| 201 |
if self.analyzer._in_kosha(stem + 'a'):
|
| 202 |
return True
|
| 203 |
+
|
| 204 |
+
# SPECIAL CASE: 'aye' ending implies 'i' stem (pataye → pati)
|
| 205 |
+
if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
|
| 206 |
+
return True
|
| 207 |
+
|
| 208 |
+
# SPECIAL CASE: 'ave' ending implies 'u' stem (gurave → guru)
|
| 209 |
+
if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
|
| 210 |
+
return True
|
| 211 |
|
| 212 |
# Try PRATYAYA STRIPPING (grammatical suffix removal)
|
| 213 |
# This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
|
|
|
|
| 837 |
# 1. Penalize short components (< 4 chars) heavily unless they are valid stems
|
| 838 |
# 2. Prefer 2-component splits over 3+ components
|
| 839 |
# 3. Single long tokens get moderate penalty
|
| 840 |
+
# V4 Scoring with Compound Pattern Recognition
|
| 841 |
def score_split(components):
    """Score one candidate split; a higher score means a better split.

    ``components`` is a sequence of component strings.  The function also
    reads two names from the enclosing scope: ``self`` (for
    ``_is_valid_stem``, ``COMMON_WORDS``, ``COMPOUND_INITIALS`` and
    ``COMPOUND_FINALS``) and ``word`` (the surface form being split, used
    for the expansion check).

    Returns an integer built from a squared-length base plus the
    penalties and bonuses itemised below.
    """
    # Validate each distinct component exactly once.  The result is needed
    # by both the short-fragment penalty and the validity bonus, so caching
    # it avoids repeating the same stem check for the same string.
    is_valid = {
        c: self._is_valid_stem(c) for c in dict.fromkeys(components)
    }
    n_parts = len(components)

    # Base: squared length favors fewer, longer components.
    score = sum(len(c) ** 2 for c in components)

    # --- PENALTIES ---
    # 1. Short fragments: heavy penalty for junk, slight one for a valid
    #    but short stem (e.g. 'ISa').
    for c in components:
        if len(c) < 4:
            score -= 5 if is_valid[c] else 50

    # 2. Fragmentation: every component beyond the second costs 30.
    if n_parts > 2:
        score -= (n_parts - 2) * 30

    # --- BONUSES ---
    # 1. Two-component bonus (optimal compound structure).
    if n_parts == 2:
        score += 25

    # 2. COMMON_WORDS protection (namaH, rAmo should stay atomic).
    if n_parts == 1 and components[0] in self.COMMON_WORDS:
        score += 50

    # 3. Validity bonus: declined words get credit via _is_valid_stem.
    score += 30 * sum(1 for c in components if is_valid[c])

    # 4. Compound pattern bonus, based on the first and last components.
    if n_parts >= 2:
        left = components[0]
        right = components[-1]

        if left in self.COMPOUND_INITIALS:
            score += 15

        # Undo the dative endings handled here so the stem can be compared
        # with the finals list: 'aye' -> i-stem (pataye -> pati),
        # 'ave' -> u-stem (gurave -> guru).
        i_stem = right[:-3] + 'i' if right.endswith('aye') else None
        u_stem = right[:-3] + 'u' if right.endswith('ave') else None

        # At most one +25 is awarded, however many finals match.
        # startswith() already covers the exact-match case.
        if any(
            right.startswith(final) or final == i_stem or final == u_stem
            for final in self.COMPOUND_FINALS
        ):
            score += 25

        # Balance bonus: outer components of near-equal length.
        if abs(len(left) - len(right)) <= 1:
            score += 10

    # --- EXPANSION ---
    # Sandhi restoration can add characters.  One extra character is
    # tolerated (e -> a+I); each further one costs 25.  An exact-length
    # split (no sandhi artifact) earns a bonus.
    expansion = sum(len(c) for c in components) - len(word)
    if expansion > 1:
        score -= (expansion - 1) * 25
    elif expansion == 0:
        score += 20

    return score
|
| 909 |
|
| 910 |
best_split = max(all_splits, key=score_split)
|
| 911 |
|