Upload src/splitter.py with huggingface_hub
Browse files- src/splitter.py +64 -119
src/splitter.py
CHANGED
|
@@ -29,17 +29,20 @@ class SamasaSplitter:
|
|
| 29 |
|
| 30 |
# Common compound final elements (uttarapada patterns)
|
| 31 |
COMPOUND_FINALS = [
|
| 32 |
-
"kara", "kAra", "kArin", "kft", "kftya",
|
| 33 |
-
"gata", "gati", "gamana",
|
| 34 |
-
"ja", "jAta", "janman",
|
| 35 |
-
"Da", "DAra", "DAraka", "DArin",
|
| 36 |
-
"maya", "mat", "vat",
|
| 37 |
-
"pati", "nATa", "ISvara", "adhipa",
|
| 38 |
-
"Atman", "rUpa", "svarUpa",
|
| 39 |
-
"pada", "pAduka",
|
| 40 |
-
"stha", "sthita", "sthAna",
|
| 41 |
-
"yukta", "hIna", "rahita",
|
| 42 |
-
"priya", "rata", "ASrita",
|
|
|
|
|
|
|
|
|
|
| 43 |
]
|
| 44 |
|
| 45 |
# Common compound first elements (pūrvapada patterns)
|
|
@@ -49,13 +52,13 @@ class SamasaSplitter:
|
|
| 49 |
"deva", "brahma", "Atma", "para", # Divine/supreme
|
| 50 |
"rAja", "mahI", "loka", # King/earth/world
|
| 51 |
"hfd", "manas", "citta", # Heart/mind
|
| 52 |
-
"padma", "kamala",
|
| 53 |
]
|
| 54 |
|
| 55 |
# Hardcoded protection for high-frequency words that might be over-split
|
| 56 |
COMMON_WORDS = {
|
| 57 |
"namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
|
| 58 |
-
"pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya",
|
| 59 |
}
|
| 60 |
|
| 61 |
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
|
@@ -613,112 +616,54 @@ class SamasaSplitter:
|
|
| 613 |
|
| 614 |
return candidates
|
| 615 |
|
| 616 |
-
def
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
if self.analyzer._in_kosha(prefix):
|
| 665 |
-
right_valid = True
|
| 666 |
-
break
|
| 667 |
-
# Sandhi normalization: if prefix ends with long vowel, try short
|
| 668 |
-
if prefix.endswith('A'):
|
| 669 |
-
normalized = prefix[:-1] + 'a'
|
| 670 |
-
if self.analyzer._in_kosha(normalized):
|
| 671 |
-
right_valid = True
|
| 672 |
-
break
|
| 673 |
-
elif prefix.endswith('I'):
|
| 674 |
-
normalized = prefix[:-1] + 'i'
|
| 675 |
-
if self.analyzer._in_kosha(normalized):
|
| 676 |
-
right_valid = True
|
| 677 |
-
break
|
| 678 |
-
elif prefix.endswith('U'):
|
| 679 |
-
normalized = prefix[:-1] + 'u'
|
| 680 |
-
if self.analyzer._in_kosha(normalized):
|
| 681 |
-
right_valid = True
|
| 682 |
-
break
|
| 683 |
-
|
| 684 |
-
# Sandhi vowel restoration for right side
|
| 685 |
-
# If left ends with long vowel & right starts with consonant,
|
| 686 |
-
# try prepending the absorbed vowel
|
| 687 |
-
if not right_valid and len(right) > 2:
|
| 688 |
-
if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
|
| 689 |
-
restored = 'A' + right
|
| 690 |
-
if self.analyzer._in_kosha(restored):
|
| 691 |
-
right_valid = True
|
| 692 |
-
elif len(restored) > 3:
|
| 693 |
-
for j in range(3, min(len(restored), 12)):
|
| 694 |
-
if self.analyzer._in_kosha(restored[:j]):
|
| 695 |
-
right_valid = True
|
| 696 |
-
break
|
| 697 |
-
elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
|
| 698 |
-
restored = 'I' + right
|
| 699 |
-
if self.analyzer._in_kosha(restored):
|
| 700 |
-
right_valid = True
|
| 701 |
-
elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
|
| 702 |
-
restored = 'U' + right
|
| 703 |
-
if self.analyzer._in_kosha(restored):
|
| 704 |
-
right_valid = True
|
| 705 |
-
|
| 706 |
-
# If components are NOT in cache, heavily penalize
|
| 707 |
-
if not left_valid: score += 10.0
|
| 708 |
-
if not right_valid: score += 10.0
|
| 709 |
-
|
| 710 |
-
# Bonus for known compound patterns
|
| 711 |
-
for final in self.COMPOUND_FINALS:
|
| 712 |
-
if right.startswith(final) or right == final:
|
| 713 |
-
score -= 2.0 # Stronger bonus
|
| 714 |
-
break
|
| 715 |
-
|
| 716 |
-
for initial in self.COMPOUND_INITIALS:
|
| 717 |
-
if left == initial or left.startswith(initial):
|
| 718 |
-
score -= 2.0 # Stronger bonus
|
| 719 |
-
break
|
| 720 |
-
|
| 721 |
-
return score
|
| 722 |
|
| 723 |
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
|
| 724 |
"""
|
|
|
|
| 29 |
|
| 30 |
# Common compound final elements (uttarapada patterns)
|
| 31 |
COMPOUND_FINALS = [
|
| 32 |
+
"kara", "kAra", "kArin", "kft", "kftya",
|
| 33 |
+
"gata", "gati", "gamana",
|
| 34 |
+
"ja", "jAta", "janman",
|
| 35 |
+
"Da", "DAra", "DAraka", "DArin",
|
| 36 |
+
"maya", "mat", "vat",
|
| 37 |
+
"pati", "nATa", "ISvara", "adhipa",
|
| 38 |
+
"Atman", "rUpa", "svarUpa",
|
| 39 |
+
"pada", "pAduka",
|
| 40 |
+
"stha", "sthita", "sthAna",
|
| 41 |
+
"yukta", "hIna", "rahita",
|
| 42 |
+
"priya", "rata", "ASrita",
|
| 43 |
+
"vid", "jYa", "vadin", "pAla",
|
| 44 |
+
"rAja", "indra", "deva", "loka",
|
| 45 |
+
"karziR", "AkarziRi","ISa", # Loving/devoted
|
| 46 |
]
|
| 47 |
|
| 48 |
# Common compound first elements (pūrvapada patterns)
|
|
|
|
| 52 |
"deva", "brahma", "Atma", "para", # Divine/supreme
|
| 53 |
"rAja", "mahI", "loka", # King/earth/world
|
| 54 |
"hfd", "manas", "citta", # Heart/mind
|
| 55 |
+
"padma", "kamala", "Ananda", "ISa", # Lotus
|
| 56 |
]
|
| 57 |
|
| 58 |
# Hardcoded protection for high-frequency words that might be over-split
|
| 59 |
COMMON_WORDS = {
|
| 60 |
"namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
|
| 61 |
+
"pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya","AkarziRi",
|
| 62 |
}
|
| 63 |
|
| 64 |
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
|
|
|
| 616 |
|
| 617 |
return candidates
|
| 618 |
|
| 619 |
+
def score_split(components):
    """Score a candidate compound split; higher is better.

    Favors fewer, longer components via a squared-length base score,
    heavily rewards components that are valid stems, and adds pattern
    bonuses for known compound initials/finals.

    NOTE(review): this is a closure — it reads ``self`` and ``word``
    from the enclosing method's scope.
    """
    n_parts = len(components)
    # Base: squared lengths favor fewer, longer components.
    total = sum(len(part) ** 2 for part in components)

    # --- PENALTIES ---
    # Short components (< 4 chars): -50 when not a valid stem,
    # -5 even when valid.
    for part in components:
        if len(part) >= 4:
            continue
        total -= 50 if not self._is_valid_stem(part) else 5

    # Every component beyond two costs 20.
    if n_parts > 2:
        total -= 20 * (n_parts - 2)

    # --- BONUSES ---

    # 1. Validity bonus (the fix). Old value: 30. New value: 100.
    #    Ensures a real split (e.g. 164 base + 200 bonus) outscores
    #    a high-base-score garbage non-split (289).
    total += 100 * sum(1 for part in components if self._is_valid_stem(part))

    # 2. Survival bonus: protects single valid words (rAmo, namaH).
    if n_parts == 1 and self._is_valid_stem(components[0]):
        total += 50

    # 3. Compound-pattern bonus, only for genuine (2+) splits.
    if n_parts >= 2:
        first = components[0]
        last = components[-1]

        if first in self.COMPOUND_INITIALS:
            total += 15

        # Check the final element both with and without its vibhakti.
        last_stem, _ = self.analyzer._extract_vibhakti(last)
        if last_stem in self.COMPOUND_FINALS or last in self.COMPOUND_FINALS:
            total += 25

        if abs(len(first) - len(last)) <= 1:
            total += 10

    # 4. Expansion penalty (RELAXED).
    #    We removed the "elif expansion == 0: score += 20" trap;
    #    only expansion beyond one restored character is penalized.
    expansion = sum(len(part) for part in components) - len(word)
    if expansion > 1:
        total -= 25 * (expansion - 1)
    return total
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
|
| 668 |
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
|
| 669 |
"""
|