ArthaLabs committed on
Commit
1f38592
·
verified ·
1 Parent(s): d3af4de

Upload src/splitter.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/splitter.py +104 -32
src/splitter.py CHANGED
@@ -52,6 +52,12 @@ class SamasaSplitter:
52
  "padma", "kamala", # Lotus
53
  ]
54
 
 
 
 
 
 
 
55
  def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
56
  """Initialize with optional shared analyzer."""
57
  self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
@@ -133,14 +139,35 @@ class SamasaSplitter:
133
  def _is_valid_stem(self, surface: str) -> bool:
134
  """
135
  Check if a surface form is a valid stem, trying:
 
136
  1. Direct Kosha lookup
137
- 2. Sandhi reversal
138
- 3. Pratyaya (suffix) stripping
 
139
  """
140
  if len(surface) < 2:
141
  return False
142
 
143
- # Try all Sandhi reversal candidates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  candidates = self._try_sandhi_reversal(surface)
145
  for candidate in candidates:
146
  if self.analyzer._in_kosha(candidate):
@@ -152,18 +179,18 @@ class SamasaSplitter:
152
  return True
153
  if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
154
  return True
155
-
156
- # Try VISARGA STRIPPING (vAlmIkiH vAlmIki)
157
- if surface.endswith('H') and len(surface) > 2:
158
- base = surface[:-1]
159
- if self.analyzer._in_kosha(base):
160
- return True
161
 
162
  # Try VIBHAKTI STRIPPING (nominal case endings)
163
  VIBHAKTI_ENDINGS = [
164
  'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH', # Masculine a-stem
165
  'An', 'EH', 'eBya', 'AnAm', 'ezu', # Masculine a-stem plural
166
  'au', 'OH', 'AvyAm', # Dual
 
 
167
  ]
168
  for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
169
  if surface.endswith(ending) and len(surface) > len(ending) + 2:
@@ -173,6 +200,14 @@ class SamasaSplitter:
173
  # Try with 'a' restoration (munipuMgavam → munipuMgava)
174
  if self.analyzer._in_kosha(stem + 'a'):
175
  return True
 
 
 
 
 
 
 
 
176
 
177
  # Try PRATYAYA STRIPPING (grammatical suffix removal)
178
  # This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
@@ -802,38 +837,75 @@ class SamasaSplitter:
802
  # 1. Penalize short components (< 3 chars) heavily
803
  # 2. Prefer 2-component splits over 3+ components
804
  # 3. Single long tokens get moderate penalty
805
- # 4. Bonus for components directly in kosha (prefer mahA+rAja over maha+arAja)
806
  def score_split(components):
807
- base_score = sum(len(c)**2 for c in components)
 
808
 
809
- # Penalize short components (garbage like 'ma', 'at')
810
- short_penalty = sum(10 for c in components if len(c) < 3)
811
- base_score -= short_penalty * 5
 
 
 
 
 
812
 
813
- # Bonus for 2-component splits (optimal granularity)
 
 
 
 
814
  if len(components) == 2:
815
- base_score += 20
816
 
817
- # Penalty for single long tokens (prefer analysis)
818
- if len(components) == 1 and len(components[0]) > 6:
819
- base_score -= 15
 
820
 
821
- # Bonus for components directly in kosha (prefer clean stems)
822
- kosha_bonus = sum(25 for c in components if self.analyzer._in_kosha(c))
823
- base_score += kosha_bonus
 
824
 
825
- # Prefer balanced splits (similar length components)
826
- if len(components) == 2:
827
- len_diff = abs(len(components[0]) - len(components[1]))
828
- if len_diff <= 1:
829
- base_score += 10 # Bonus for balanced split
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
830
 
831
- # Penalize splits with expanded length (sandhi artifacts add characters)
 
832
  total_len = sum(len(c) for c in components)
833
- if total_len > len(word):
834
- base_score -= (total_len - len(word)) * 10 # Penalty per extra char
835
-
836
- return base_score
 
 
 
837
 
838
  best_split = max(all_splits, key=score_split)
839
 
 
52
  "padma", "kamala", # Lotus
53
  ]
54
 
55
+ # Hardcoded protection for high-frequency words that might be over-split
56
+ COMMON_WORDS = {
57
+ "namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
58
+ "pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya",
59
+ }
60
+
61
  def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
62
  """Initialize with optional shared analyzer."""
63
  self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
 
139
  def _is_valid_stem(self, surface: str) -> bool:
140
  """
141
  Check if a surface form is a valid stem, trying:
142
+ 0. COMMON_WORDS protection
143
  1. Direct Kosha lookup
144
+ 2. Visarga/Anusvara base check (rAmaH → rAma)
145
+ 3. Sandhi reversal
146
+ 4. Pratyaya (suffix) stripping
147
  """
148
  if len(surface) < 2:
149
  return False
150
 
151
+ # 0. Safety Check for Common Words (protect namaH, rAmo, etc.)
152
+ if surface in self.COMMON_WORDS:
153
+ return True
154
+
155
+ # 1. Direct Kosha Check
156
+ if self.analyzer._in_kosha(surface):
157
+ return True
158
+
159
+ # 2. Visarga/Anusvara Check (FIX for rAmaH validation)
160
+ # If sandhi-restored "rAmo" → "rAmaH", accept it if base "rAma" is in kosha
161
+ if surface.endswith('H') and len(surface) > 2:
162
+ base = surface[:-1]
163
+ if self.analyzer._in_kosha(base):
164
+ return True
165
+ if surface.endswith('M') and len(surface) > 2:
166
+ base = surface[:-1]
167
+ if self.analyzer._in_kosha(base):
168
+ return True
169
+
170
+ # 3. Try all Sandhi reversal candidates
171
  candidates = self._try_sandhi_reversal(surface)
172
  for candidate in candidates:
173
  if self.analyzer._in_kosha(candidate):
 
179
  return True
180
  if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
181
  return True
182
+ # Apply the same visarga check to each sandhi-reversal candidate
183
+ if candidate.endswith('H') and len(candidate) > 2:
184
+ if self.analyzer._in_kosha(candidate[:-1]):
185
+ return True
 
 
186
 
187
  # Try VIBHAKTI STRIPPING (nominal case endings)
188
  VIBHAKTI_ENDINGS = [
189
  'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH', # Masculine a-stem
190
  'An', 'EH', 'eBya', 'AnAm', 'ezu', # Masculine a-stem plural
191
  'au', 'OH', 'AvyAm', # Dual
192
+ 'aye', # i-stem dative (pataye, munaye)
193
+ 'ave', # u-stem dative (vizRave, gurave)
194
  ]
195
  for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
196
  if surface.endswith(ending) and len(surface) > len(ending) + 2:
 
200
  # Try with 'a' restoration (munipuMgavam → munipuMgava)
201
  if self.analyzer._in_kosha(stem + 'a'):
202
  return True
203
+
204
+ # SPECIAL CASE: 'aye' ending implies 'i' stem (pataye → pati)
205
+ if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
206
+ return True
207
+
208
+ # SPECIAL CASE: 'ave' ending implies 'u' stem (gurave → guru)
209
+ if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
210
+ return True
211
 
212
  # Try PRATYAYA STRIPPING (grammatical suffix removal)
213
  # This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
 
837
  # 1. Penalize short components (< 3 chars) heavily
838
  # 2. Prefer 2-component splits over 3+ components
839
  # 3. Single long tokens get moderate penalty
840
+ # V4 Scoring with Compound Pattern Recognition
841
  def score_split(components):
842
+ # Base: Squared length favors fewer, longer components
843
+ score = sum(len(c)**2 for c in components)
844
 
845
+ # --- PENALTIES ---
846
+ # 1. Short junk penalty (unless it's a valid stem)
847
+ for c in components:
848
+ if len(c) < 4:
849
+ if not self._is_valid_stem(c):
850
+ score -= 50 # Garbage fragment
851
+ else:
852
+ score -= 5 # Valid but short (e.g. 'ISa'), slight penalty
853
 
854
+ # 2. Fragmentation penalty
855
+ if len(components) > 2:
856
+ score -= (len(components) - 2) * 30 # Increased penalty
857
+
858
+ # 3. 2-component bonus (optimal compound structure)
859
  if len(components) == 2:
860
+ score += 25
861
 
862
+ # --- BONUSES ---
863
+ # 0. COMMON_WORDS Protection (namaH, rAmo should stay atomic)
864
+ if len(components) == 1 and components[0] in self.COMMON_WORDS:
865
+ score += 50 # Strong bonus to prevent splitting
866
 
867
+ # 1. Validity Bonus (Crucial for pataye/rAmo)
868
+ # Use _is_valid_stem so declined words get credit
869
+ valid_count = sum(1 for c in components if self._is_valid_stem(c))
870
+ score += valid_count * 30
871
 
872
+ # 2. Compound Pattern Bonus (The Fix for gaRapataye)
873
+ if len(components) >= 2:
874
+ left = components[0]
875
+ right = components[-1]
876
+
877
+ # Check Left against Initials
878
+ if left in self.COMPOUND_INITIALS:
879
+ score += 15
880
+
881
+ # Check Right against Finals
882
+ # Need to extract stem to match (pataye -> pati)
883
+ for final in self.COMPOUND_FINALS:
884
+ if right.startswith(final) or right == final:
885
+ score += 25 # High bonus for matching pattern like 'pati'
886
+ break
887
+ # Try stripping vibhakti
888
+ if right.endswith('aye') and right[:-3] + 'i' == final:
889
+ score += 25
890
+ break
891
+ if right.endswith('ave') and right[:-3] + 'u' == final:
892
+ score += 25
893
+ break
894
+
895
+ # Balance bonus
896
+ if abs(len(left) - len(right)) <= 1:
897
+ score += 10
898
 
899
+ # 4. Expansion penalty (sandhi artifacts add characters)
900
+ # Allow 1 char expansion for sandhi (e → a+I), only penalize 2+ extra chars
901
  total_len = sum(len(c) for c in components)
902
+ expansion = total_len - len(word)
903
+ if expansion > 1:
904
+ score -= (expansion - 1) * 25 # Stronger penalty
905
+ elif expansion == 0:
906
+ score += 20 # Bonus for exact-length splits (no sandhi artifact)
907
+
908
+ return score
909
 
910
  best_split = max(all_splits, key=score_split)
911