ArthaLabs
/

panini-tokenizer

@@ -52,6 +52,12 @@ class SamasaSplitter:
         "padma", "kamala",                             # Lotus
     ]
     def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
         """Initialize with optional shared analyzer."""
         self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
@@ -133,14 +139,35 @@ class SamasaSplitter:
     def _is_valid_stem(self, surface: str) -> bool:
         """
         Check if a surface form is a valid stem, trying:
         1. Direct Kosha lookup
-        2. Sandhi reversal
-        3. Pratyaya (suffix) stripping
         """
         if len(surface) < 2:
             return False
-        # Try all Sandhi reversal candidates
         candidates = self._try_sandhi_reversal(surface)
         for candidate in candidates:
             if self.analyzer._in_kosha(candidate):
@@ -152,18 +179,18 @@ class SamasaSplitter:
                 return True
             if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                 return True
-        # Try VISARGA STRIPPING (vAlmIkiH → vAlmIki)
-        if surface.endswith('H') and len(surface) > 2:
-            base = surface[:-1]
-            if self.analyzer._in_kosha(base):
-                return True
         # Try VIBHAKTI STRIPPING (nominal case endings)
         VIBHAKTI_ENDINGS = [
             'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH',  # Masculine a-stem
             'An', 'EH', 'eBya', 'AnAm', 'ezu',                   # Masculine a-stem plural
             'au', 'OH', 'AvyAm',                                  # Dual
         ]
         for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
             if surface.endswith(ending) and len(surface) > len(ending) + 2:
@@ -173,6 +200,14 @@ class SamasaSplitter:
                 # Try with 'a' restoration (munipuMgavam → munipuMgava)
                 if self.analyzer._in_kosha(stem + 'a'):
                     return True
         # Try PRATYAYA STRIPPING (grammatical suffix removal)
         # This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
@@ -802,38 +837,75 @@ class SamasaSplitter:
         # 1. Penalize short components (< 3 chars) heavily
         # 2. Prefer 2-component splits over 3+ components
         # 3. Single long tokens get moderate penalty
-        # 4. Bonus for components directly in kosha (prefer mahA+rAja over maha+arAja)
         def score_split(components):
-            base_score = sum(len(c)**2 for c in components)
-            # Penalize short components (garbage like 'ma', 'at')
-            short_penalty = sum(10 for c in components if len(c) < 3)
-            base_score -= short_penalty * 5
-            # Bonus for 2-component splits (optimal granularity)
             if len(components) == 2:
-                base_score += 20
-            # Penalty for single long tokens (prefer analysis)
-            if len(components) == 1 and len(components[0]) > 6:
-                base_score -= 15
-            # Bonus for components directly in kosha (prefer clean stems)
-            kosha_bonus = sum(25 for c in components if self.analyzer._in_kosha(c))
-            base_score += kosha_bonus
-            # Prefer balanced splits (similar length components)
-            if len(components) == 2:
-                len_diff = abs(len(components[0]) - len(components[1]))
-                if len_diff <= 1:
-                    base_score += 10  # Bonus for balanced split
-            # Penalize splits with expanded length (sandhi artifacts add characters)
             total_len = sum(len(c) for c in components)
-            if total_len > len(word):
-                base_score -= (total_len - len(word)) * 10  # Penalty per extra char
-            return base_score
         best_split = max(all_splits, key=score_split)

         "padma", "kamala",                             # Lotus
     ]
+    # Hardcoded protection for high-frequency words that might be over-split
+    COMMON_WORDS = {
+        "namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
+        "pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya",
+    }
     def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
         """Initialize with optional shared analyzer."""
         self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
     def _is_valid_stem(self, surface: str) -> bool:
         """
         Check if a surface form is a valid stem, trying:
+        0. COMMON_WORDS protection
         1. Direct Kosha lookup
+        2. Visarga/Anusvara base check (rAmaH → rAma)
+        3. Sandhi reversal
+        4. Pratyaya (suffix) stripping
         """
         if len(surface) < 2:
             return False
+        # 0. Safety Check for Common Words (protect namaH, rAmo, etc.)
+        if surface in self.COMMON_WORDS:
+            return True
+        # 1. Direct Kosha Check
+        if self.analyzer._in_kosha(surface):
+            return True
+        # 2. Visarga/Anusvara Check (FIX for rAmaH validation)
+        # If sandhi-restored "rAmo" → "rAmaH", accept it if base "rAma" is in kosha
+        if surface.endswith('H') and len(surface) > 2:
+            base = surface[:-1]
+            if self.analyzer._in_kosha(base):
+                return True
+        if surface.endswith('M') and len(surface) > 2:
+            base = surface[:-1]
+            if self.analyzer._in_kosha(base):
+                return True
+        # 3. Try all Sandhi reversal candidates
         candidates = self._try_sandhi_reversal(surface)
         for candidate in candidates:
             if self.analyzer._in_kosha(candidate):
                 return True
             if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                 return True
+            # Recursive visarga check for candidates too
+            if candidate.endswith('H') and len(candidate) > 2:
+                if self.analyzer._in_kosha(candidate[:-1]):
+                    return True
         # Try VIBHAKTI STRIPPING (nominal case endings)
         VIBHAKTI_ENDINGS = [
             'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH',  # Masculine a-stem
             'An', 'EH', 'eBya', 'AnAm', 'ezu',                   # Masculine a-stem plural
             'au', 'OH', 'AvyAm',                                  # Dual
+            'aye',                                                 # i-stem dative (pataye, munaye)
+            'ave',                                                 # u-stem dative (vizRave, gurave)
         ]
         for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
             if surface.endswith(ending) and len(surface) > len(ending) + 2:
                 # Try with 'a' restoration (munipuMgavam → munipuMgava)
                 if self.analyzer._in_kosha(stem + 'a'):
                     return True
+                # SPECIAL CASE: 'aye' ending implies 'i' stem (pataye → pati)
+                if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
+                    return True
+                # SPECIAL CASE: 'ave' ending implies 'u' stem (gurave → guru)
+                if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
+                    return True
         # Try PRATYAYA STRIPPING (grammatical suffix removal)
         # This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
         # 1. Penalize short components (< 3 chars) heavily
         # 2. Prefer 2-component splits over 3+ components
         # 3. Single long tokens get moderate penalty
+        # V4 scoring: rate one candidate split (a sequence of component strings); max() below picks the highest score.
         def score_split(components):
+            # Base: sum of squared component lengths, which favors fewer, longer components.
+            score = sum(len(c)**2 for c in components)
+            # --- LENGTH / COUNT ADJUSTMENTS ---
+            # Short components (< 4 chars): -50 if _is_valid_stem rejects them, otherwise only -5.
+            for c in components:
+                if len(c) < 4:
+                    if not self._is_valid_stem(c):
+                        score -= 50  # Garbage fragment
+                    else:
+                        score -= 5   # Valid but short (e.g. 'ISa'), slight penalty
+            # Fragmentation penalty: -30 for every component beyond the second.
+            if len(components) > 2:
+                score -= (len(components) - 2) * 30  # Increased penalty
+            # Two-component bonus (+25): a two-part split is treated as the optimal compound structure.
             if len(components) == 2:
+                score += 25
+            # --- BONUSES ---
+            # COMMON_WORDS protection: an unsplit word listed in COMMON_WORDS (namaH, rAmo, ...) gets +50.
+            if len(components) == 1 and components[0] in self.COMMON_WORDS:
+                score += 50  # Strong bonus to prevent splitting
+            # Validity bonus: +30 per component accepted by _is_valid_stem (crucial for pataye/rAmo).
+            # _is_valid_stem is used so declined words get credit. NOTE(review): short components are checked a second time here.
+            valid_count = sum(1 for c in components if self._is_valid_stem(c))
+            score += valid_count * 30
+            # Compound pattern bonus (the fix for gaRapataye): only the first and last components are inspected.
+            if len(components) >= 2:
+                left = components[0]
+                right = components[-1]
+                # +15 when the first component is a known compound initial.
+                if left in self.COMPOUND_INITIALS:
+                    score += 15
+                # +25 when the last component matches a known compound final; the first match wins (break), so at most one +25.
+                # NOTE(review): `right == final` is already implied by `right.startswith(final)`.
+                for final in self.COMPOUND_FINALS:
+                    if right.startswith(final) or right == final:
+                        score += 25  # High bonus for matching pattern like 'pati'
+                        break
+                    # Vibhakti stripping: map dative -aye back to an i-stem and -ave to a u-stem (pataye -> pati) before comparing.
+                    if right.endswith('aye') and right[:-3] + 'i' == final:
+                        score += 25
+                        break
+                    if right.endswith('ave') and right[:-3] + 'u' == final:
+                        score += 25
+                        break
+                # Balance bonus: +10 when the first and last components differ in length by at most 1.
+                if abs(len(left) - len(right)) <= 1:
+                    score += 10
+            # Expansion adjustment: compares total component length with len(word); `word` comes from the enclosing scope.
+            # One extra char is tolerated for sandhi (e → a+I); each further extra char costs 25. Other non-zero differences are neutral.
             total_len = sum(len(c) for c in components)
+            expansion = total_len - len(word)
+            if expansion > 1:
+                score -= (expansion - 1) * 25  # Stronger penalty
+            elif expansion == 0:
+                score += 20  # Bonus for exact-length splits (no sandhi artifact)
+            return score
         best_split = max(all_splits, key=score_split)