Upload src/splitter.py with huggingface_hub
Browse files- src/splitter.py +64 -119
src/splitter.py
CHANGED
|
@@ -29,17 +29,20 @@ class SamasaSplitter:
|
|
| 29 |
|
| 30 |
# Common compound final elements (uttarapada patterns)
|
| 31 |
COMPOUND_FINALS = [
|
| 32 |
-
"kara", "kAra", "kArin", "kft", "kftya",
|
| 33 |
-
"gata", "gati", "gamana",
|
| 34 |
-
"ja", "jAta", "janman",
|
| 35 |
-
"Da", "DAra", "DAraka", "DArin",
|
| 36 |
-
"maya", "mat", "vat",
|
| 37 |
-
"pati", "nATa", "ISvara", "adhipa",
|
| 38 |
-
"Atman", "rUpa", "svarUpa",
|
| 39 |
-
"pada", "pAduka",
|
| 40 |
-
"stha", "sthita", "sthAna",
|
| 41 |
-
"yukta", "hIna", "rahita",
|
| 42 |
-
"priya", "rata", "ASrita",
|
|
|
|
|
|
|
|
|
|
| 43 |
]
|
| 44 |
|
| 45 |
# Common compound first elements (pūrvapada patterns)
|
|
@@ -49,13 +52,13 @@ class SamasaSplitter:
|
|
| 49 |
"deva", "brahma", "Atma", "para", # Divine/supreme
|
| 50 |
"rAja", "mahI", "loka", # King/earth/world
|
| 51 |
"hfd", "manas", "citta", # Heart/mind
|
| 52 |
-
"padma", "kamala",
|
| 53 |
]
|
| 54 |
|
| 55 |
# Hardcoded protection for high-frequency words that might be over-split
|
| 56 |
COMMON_WORDS = {
|
| 57 |
"namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
|
| 58 |
-
"pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya",
|
| 59 |
}
|
| 60 |
|
| 61 |
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
|
@@ -613,112 +616,54 @@ class SamasaSplitter:
|
|
| 613 |
|
| 614 |
return candidates
|
| 615 |
|
| 616 |
-
def
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
if self.analyzer._in_kosha(prefix):
|
| 665 |
-
right_valid = True
|
| 666 |
-
break
|
| 667 |
-
# Sandhi normalization: if prefix ends with long vowel, try short
|
| 668 |
-
if prefix.endswith('A'):
|
| 669 |
-
normalized = prefix[:-1] + 'a'
|
| 670 |
-
if self.analyzer._in_kosha(normalized):
|
| 671 |
-
right_valid = True
|
| 672 |
-
break
|
| 673 |
-
elif prefix.endswith('I'):
|
| 674 |
-
normalized = prefix[:-1] + 'i'
|
| 675 |
-
if self.analyzer._in_kosha(normalized):
|
| 676 |
-
right_valid = True
|
| 677 |
-
break
|
| 678 |
-
elif prefix.endswith('U'):
|
| 679 |
-
normalized = prefix[:-1] + 'u'
|
| 680 |
-
if self.analyzer._in_kosha(normalized):
|
| 681 |
-
right_valid = True
|
| 682 |
-
break
|
| 683 |
-
|
| 684 |
-
# Sandhi vowel restoration for right side
|
| 685 |
-
# If left ends with long vowel & right starts with consonant,
|
| 686 |
-
# try prepending the absorbed vowel
|
| 687 |
-
if not right_valid and len(right) > 2:
|
| 688 |
-
if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
|
| 689 |
-
restored = 'A' + right
|
| 690 |
-
if self.analyzer._in_kosha(restored):
|
| 691 |
-
right_valid = True
|
| 692 |
-
elif len(restored) > 3:
|
| 693 |
-
for j in range(3, min(len(restored), 12)):
|
| 694 |
-
if self.analyzer._in_kosha(restored[:j]):
|
| 695 |
-
right_valid = True
|
| 696 |
-
break
|
| 697 |
-
elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
|
| 698 |
-
restored = 'I' + right
|
| 699 |
-
if self.analyzer._in_kosha(restored):
|
| 700 |
-
right_valid = True
|
| 701 |
-
elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
|
| 702 |
-
restored = 'U' + right
|
| 703 |
-
if self.analyzer._in_kosha(restored):
|
| 704 |
-
right_valid = True
|
| 705 |
-
|
| 706 |
-
# If components are NOT in cache, heavily penalize
|
| 707 |
-
if not left_valid: score += 10.0
|
| 708 |
-
if not right_valid: score += 10.0
|
| 709 |
-
|
| 710 |
-
# Bonus for known compound patterns
|
| 711 |
-
for final in self.COMPOUND_FINALS:
|
| 712 |
-
if right.startswith(final) or right == final:
|
| 713 |
-
score -= 2.0 # Stronger bonus
|
| 714 |
-
break
|
| 715 |
-
|
| 716 |
-
for initial in self.COMPOUND_INITIALS:
|
| 717 |
-
if left == initial or left.startswith(initial):
|
| 718 |
-
score -= 2.0 # Stronger bonus
|
| 719 |
-
break
|
| 720 |
-
|
| 721 |
-
return score
|
| 722 |
|
| 723 |
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
|
| 724 |
"""
|
|
|
|
| 29 |
|
| 30 |
# Common compound final elements (uttarapada patterns)
|
| 31 |
COMPOUND_FINALS = [
|
| 32 |
+
"kara", "kAra", "kArin", "kft", "kftya",
|
| 33 |
+
"gata", "gati", "gamana",
|
| 34 |
+
"ja", "jAta", "janman",
|
| 35 |
+
"Da", "DAra", "DAraka", "DArin",
|
| 36 |
+
"maya", "mat", "vat",
|
| 37 |
+
"pati", "nATa", "ISvara", "adhipa",
|
| 38 |
+
"Atman", "rUpa", "svarUpa",
|
| 39 |
+
"pada", "pAduka",
|
| 40 |
+
"stha", "sthita", "sthAna",
|
| 41 |
+
"yukta", "hIna", "rahita",
|
| 42 |
+
"priya", "rata", "ASrita",
|
| 43 |
+
"vid", "jYa", "vadin", "pAla",
|
| 44 |
+
"rAja", "indra", "deva", "loka",
|
| 45 |
+
"karziR", "AkarziRi","ISa", # Loving/devoted
|
| 46 |
]
|
| 47 |
|
| 48 |
# Common compound first elements (pūrvapada patterns)
|
|
|
|
| 52 |
"deva", "brahma", "Atma", "para", # Divine/supreme
|
| 53 |
"rAja", "mahI", "loka", # King/earth/world
|
| 54 |
"hfd", "manas", "citta", # Heart/mind
|
| 55 |
+
"padma", "kamala", "Ananda", "ISa", # Lotus
|
| 56 |
]
|
| 57 |
|
| 58 |
# Hardcoded protection for high-frequency words that might be over-split
|
| 59 |
COMMON_WORDS = {
|
| 60 |
"namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
|
| 61 |
+
"pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya","AkarziRi",
|
| 62 |
}
|
| 63 |
|
| 64 |
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
|
|
|
| 616 |
|
| 617 |
return candidates
|
| 618 |
|
| 619 |
+
def score_split(components):
    """Score a candidate compound split; higher is better.

    Favors fewer, longer components via a squared-length base score,
    heavily rewards components that are valid stems, and adds pattern
    bonuses for known compound initials/finals.

    NOTE(review): this is a closure — it reads ``self`` and ``word``
    from the enclosing method's scope.
    """
    n_parts = len(components)
    # Base: squared lengths favor fewer, longer components.
    total = sum(len(part) ** 2 for part in components)

    # --- PENALTIES ---
    # Short components (< 4 chars): -50 when not a valid stem,
    # -5 even when valid.
    for part in components:
        if len(part) >= 4:
            continue
        total -= 50 if not self._is_valid_stem(part) else 5

    # Every component beyond two costs 20.
    if n_parts > 2:
        total -= 20 * (n_parts - 2)

    # --- BONUSES ---

    # 1. Validity bonus (the fix). Old value: 30. New value: 100.
    #    Ensures a real split (e.g. 164 base + 200 bonus) outscores
    #    a high-base-score garbage non-split (289).
    total += 100 * sum(1 for part in components if self._is_valid_stem(part))

    # 2. Survival bonus: protects single valid words (rAmo, namaH).
    if n_parts == 1 and self._is_valid_stem(components[0]):
        total += 50

    # 3. Compound-pattern bonus, only for genuine (2+) splits.
    if n_parts >= 2:
        first = components[0]
        last = components[-1]

        if first in self.COMPOUND_INITIALS:
            total += 15

        # Check the final element both with and without its vibhakti.
        last_stem, _ = self.analyzer._extract_vibhakti(last)
        if last_stem in self.COMPOUND_FINALS or last in self.COMPOUND_FINALS:
            total += 25

        if abs(len(first) - len(last)) <= 1:
            total += 10

    # 4. Expansion penalty (RELAXED).
    #    We removed the "elif expansion == 0: score += 20" trap;
    #    only expansion beyond one restored character is penalized.
    expansion = sum(len(part) for part in components) - len(word)
    if expansion > 1:
        total -= 25 * (expansion - 1)
    return total
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
|
| 668 |
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
|
| 669 |
"""
|