ArthaLabs committed on
Commit
e1cd59b
·
verified ·
1 Parent(s): 1f38592

Upload src/splitter.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/splitter.py +64 -119
src/splitter.py CHANGED
@@ -29,17 +29,20 @@ class SamasaSplitter:
29
 
30
  # Common compound final elements (uttarapada patterns)
31
  COMPOUND_FINALS = [
32
- "kara", "kAra", "kArin", "kft", "kftya", # Doer
33
- "gata", "gati", "gamana", # Going
34
- "ja", "jAta", "janman", # Born
35
- "Da", "DAra", "DAraka", "DArin", # Holding
36
- "maya", "mat", "vat", # Having/made of
37
- "pati", "nATa", "ISvara", "adhipa", # Lord
38
- "Atman", "rUpa", "svarUpa", # Self/form
39
- "pada", "pAduka", # Foot/step
40
- "stha", "sthita", "sthAna", # Standing/place
41
- "yukta", "hIna", "rahita", # With/without
42
- "priya", "rata", "ASrita", # Loving/devoted
 
 
 
43
  ]
44
 
45
  # Common compound first elements (pūrvapada patterns)
@@ -49,13 +52,13 @@ class SamasaSplitter:
49
  "deva", "brahma", "Atma", "para", # Divine/supreme
50
  "rAja", "mahI", "loka", # King/earth/world
51
  "hfd", "manas", "citta", # Heart/mind
52
- "padma", "kamala", # Lotus
53
  ]
54
 
55
  # Hardcoded protection for high-frequency words that might be over-split
56
  COMMON_WORDS = {
57
  "namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
58
- "pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya",
59
  }
60
 
61
  def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
@@ -613,112 +616,54 @@ class SamasaSplitter:
613
 
614
  return candidates
615
 
616
- def _score_split(self, left: str, right: str) -> float:
617
- """
618
- Score a potential split point. Lower is better.
619
- Critically tuned to avoid over-segmentation like 'padma' -> 'pad' + 'ma'
620
- """
621
- score = 0.0
622
-
623
- # PENALIZE SHORT COMPONENTS
624
- # Critical tuning:
625
- # < 3 chars (1, 2) -> Heavy penalty (prevent 'ma', 'ka', 'sa')
626
- # == 3 chars -> Slight penalty (allow 'hfd', 'gam', 'vid' but prefer longer)
627
- if len(left) < 3: score += 5.0
628
- elif len(left) == 3: score += 1.0
629
-
630
- if len(right) < 3: score += 5.0
631
- elif len(right) == 3: score += 1.0
632
-
633
- # PREFER LONGER LEFT COMPONENT (Greedy Match)
634
- # Previously we subtracted total len which was constant.
635
- # Now we reward taking a bigger bite from the left.
636
- # Increased to 1.0 to strongly prefer longer valid stems and overwhelm false matches
637
- score -= len(left) * 1.0
638
-
639
- # Prefer balanced splits (secondary factor)
640
- # Reduced influence to let greedy match dominate
641
- len_diff = abs(len(left) - len(right))
642
- score += len_diff * 0.02
643
-
644
- # Verify strict Kosha existence
645
- left_valid = self.analyzer._in_kosha(left)
646
- # Sandhi normalization for left: if ends with long vowel, try short
647
- if not left_valid and left.endswith('A'):
648
- if self.analyzer._in_kosha(left[:-1] + 'a'):
649
- left_valid = True
650
- if not left_valid and left.endswith('I'):
651
- if self.analyzer._in_kosha(left[:-1] + 'i'):
652
- left_valid = True
653
- if not left_valid and left.endswith('U'):
654
- if self.analyzer._in_kosha(left[:-1] + 'u'):
655
- left_valid = True
656
-
657
- right_valid = self.analyzer._in_kosha(right)
658
-
659
- # Recursive Lookahead for Right side scoring
660
- # If right matches a prefix, consider it valid (don't penalize)
661
- if not right_valid and len(right) > 3:
662
- for j in range(3, min(len(right), 15)):
663
- prefix = right[:j]
664
- if self.analyzer._in_kosha(prefix):
665
- right_valid = True
666
- break
667
- # Sandhi normalization: if prefix ends with long vowel, try short
668
- if prefix.endswith('A'):
669
- normalized = prefix[:-1] + 'a'
670
- if self.analyzer._in_kosha(normalized):
671
- right_valid = True
672
- break
673
- elif prefix.endswith('I'):
674
- normalized = prefix[:-1] + 'i'
675
- if self.analyzer._in_kosha(normalized):
676
- right_valid = True
677
- break
678
- elif prefix.endswith('U'):
679
- normalized = prefix[:-1] + 'u'
680
- if self.analyzer._in_kosha(normalized):
681
- right_valid = True
682
- break
683
-
684
- # Sandhi vowel restoration for right side
685
- # If left ends with long vowel & right starts with consonant,
686
- # try prepending the absorbed vowel
687
- if not right_valid and len(right) > 2:
688
- if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
689
- restored = 'A' + right
690
- if self.analyzer._in_kosha(restored):
691
- right_valid = True
692
- elif len(restored) > 3:
693
- for j in range(3, min(len(restored), 12)):
694
- if self.analyzer._in_kosha(restored[:j]):
695
- right_valid = True
696
- break
697
- elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
698
- restored = 'I' + right
699
- if self.analyzer._in_kosha(restored):
700
- right_valid = True
701
- elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
702
- restored = 'U' + right
703
- if self.analyzer._in_kosha(restored):
704
- right_valid = True
705
-
706
- # If components are NOT in cache, heavily penalize
707
- if not left_valid: score += 10.0
708
- if not right_valid: score += 10.0
709
-
710
- # Bonus for known compound patterns
711
- for final in self.COMPOUND_FINALS:
712
- if right.startswith(final) or right == final:
713
- score -= 2.0 # Stronger bonus
714
- break
715
-
716
- for initial in self.COMPOUND_INITIALS:
717
- if left == initial or left.startswith(initial):
718
- score -= 2.0 # Stronger bonus
719
- break
720
-
721
- return score
722
 
723
  def split(self, word: str, max_components: int = 4) -> CompoundSplit:
724
  """
 
29
 
30
  # Common compound final elements (uttarapada patterns)
31
  COMPOUND_FINALS = [
32
+ "kara", "kAra", "kArin", "kft", "kftya",
33
+ "gata", "gati", "gamana",
34
+ "ja", "jAta", "janman",
35
+ "Da", "DAra", "DAraka", "DArin",
36
+ "maya", "mat", "vat",
37
+ "pati", "nATa", "ISvara", "adhipa",
38
+ "Atman", "rUpa", "svarUpa",
39
+ "pada", "pAduka",
40
+ "stha", "sthita", "sthAna",
41
+ "yukta", "hIna", "rahita",
42
+ "priya", "rata", "ASrita",
43
+ "vid", "jYa", "vadin", "pAla",
44
+ "rAja", "indra", "deva", "loka",
45
+ "karziR", "AkarziRi","ISa", # Loving/devoted
46
  ]
47
 
48
  # Common compound first elements (pūrvapada patterns)
 
52
  "deva", "brahma", "Atma", "para", # Divine/supreme
53
  "rAja", "mahI", "loka", # King/earth/world
54
  "hfd", "manas", "citta", # Heart/mind
55
+ "padma", "kamala", "Ananda", "ISa", # Lotus
56
  ]
57
 
58
  # Hardcoded protection for high-frequency words that might be over-split
59
  COMMON_WORDS = {
60
  "namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
61
+ "pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya","AkarziRi",
62
  }
63
 
64
  def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
 
616
 
617
  return candidates
618
 
619
+ def score_split(components):
620
+ # Base: Squared length favors fewer, longer components
621
+ score = sum(len(c)**2 for c in components)
622
+
623
+ # --- PENALTIES ---
624
+ for c in components:
625
+ if len(c) < 4:
626
+ if not self._is_valid_stem(c):
627
+ score -= 50
628
+ else:
629
+ score -= 5
630
+
631
+ if len(components) > 2:
632
+ score -= (len(components) - 2) * 20
633
+
634
+ # --- BONUSES ---
635
+
636
+ # 1. VALIDITY BONUS (THE FIX)
637
+ # Old value: 30. New value: 100.
638
+ # This ensures that 164 (split score) + 200 (bonus) > 289 (garbage score)
639
+ valid_count = sum(1 for c in components if self._is_valid_stem(c))
640
+ score += valid_count * 100
641
+
642
+ # 2. SURVIVAL BONUS (Protects rAmo, namaH)
643
+ if len(components) == 1:
644
+ if self._is_valid_stem(components[0]):
645
+ score += 50
646
+
647
+ # 3. Compound Pattern Bonus
648
+ if len(components) >= 2:
649
+ left = components[0]
650
+ right = components[-1]
651
+
652
+ if left in self.COMPOUND_INITIALS: score += 15
653
+
654
+ # Check Right Final
655
+ r_stem, _ = self.analyzer._extract_vibhakti(right)
656
+ if r_stem in self.COMPOUND_FINALS: score += 25
657
+ elif right in self.COMPOUND_FINALS: score += 25
658
+
659
+ if abs(len(left) - len(right)) <= 1: score += 10
660
+ # 4. Expansion penalty (RELAXED)
661
+ # We removed the "elif expansion == 0: score += 20" trap.
662
+ total_len = sum(len(c) for c in components)
663
+ expansion = total_len - len(word)
664
+ if expansion > 1:
665
+ score -= (expansion - 1) * 25
666
+ return score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
667
 
668
  def split(self, word: str, max_components: int = 4) -> CompoundSplit:
669
  """