|
|
"""
|
|
|
Vidyut Morphological Analyzer
|
|
|
Provides deterministic morphological analysis using Vidyut Kosha.
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import json
|
|
|
from typing import Dict, List, Optional, Set
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
# Directory expected to hold the Vidyut linguistic data, one level above
# this module's package directory.
VIDYUT_DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vidyut_data")

# Optional JSON file (a list of stem strings) kept next to this module;
# loaded on top of the built-in seed stems by _load_stem_cache().
STEMS_FILE = os.path.join(os.path.dirname(__file__), "stems.json")

# Module-level stem lookup table shared by all VidyutAnalyzer instances.
# Populated once by _load_stem_cache(); membership tests are O(1).
_STEM_CACHE: set = set()

# Guard flag so the stem cache is only loaded once per process.
_STEM_CACHE_LOADED = False
|
|
|
|
|
|
def _load_stem_cache():
    """Populate the shared module-level stem cache (idempotent).

    Seeds the cache with a built-in set of common stems (SLP1
    transliteration), then merges in the optional stems.json file if it
    exists. Safe to call repeatedly; only the first call does any work.
    """
    global _STEM_CACHE, _STEM_CACHE_LOADED

    if _STEM_CACHE_LOADED:
        return

    # Built-in seed stems so the analyzer works even without stems.json.
    seed_stems = {
        "rAma", "sItA", "kfzRa", "arjuna", "deva", "brahma", "Atma", "Atman",
        "parama", "param", "para", "maha", "mahA", "rAja", "vana", "gfha",
        "hfd", "padma", "gata", "gam", "gacC", "ti", "aH", "am", "jYa",
        "bhedAbheda", "bheda", "abheda", "vibhAga", "yoga", "vicAra",
        "sopAdhika", "pratyagAtman", "pratyag", "Atman", "AbhAsa", "bhAsa",
        "kzetra", "kzetrajYa", "santoSa", "mokSa", "saMsAra", "jIva",
        "brahman", "paramAtman", "pratyaya", "pramANa", "anumAna",
        "sat", "asat", "cit", "Ananda", "satcitAnanda",
    }
    _STEM_CACHE |= seed_stems

    # Merge the on-disk stem list, if present. Load failures are
    # non-fatal by design: the seed stems above remain usable.
    if os.path.exists(STEMS_FILE):
        try:
            with open(STEMS_FILE, "r", encoding="utf-8") as handle:
                _STEM_CACHE.update(json.load(handle))
        except Exception as exc:
            print(f" VidyutAnalyzer: Stem cache load failed ({exc})")
        else:
            print(f" VidyutAnalyzer: Loaded {len(_STEM_CACHE)} stems from cache")

    _STEM_CACHE_LOADED = True
|
|
|
|
|
|
|
|
|
@dataclass
class MorphParse:
    """One candidate morphological analysis of a surface word."""

    surface: str              # the word exactly as it appeared in the text
    stem: str                 # base stem behind the surface form
    root: Optional[str]       # root stem when a suffix derivation was found
    pratyaya: Optional[str]   # name of the krt/taddhita suffix, if any
    vibhakti: Optional[str]   # case ending stripped from the surface, if any
    upasarga: Optional[str]   # verbal prefix, if any
    is_compound: bool         # True when the word was parsed as a compound
    is_verb: bool             # True for verb forms treated as atomic tokens
    derivation_depth: int     # number of derivational steps applied
    kosha_validated: bool     # True when the stem was found in the stem cache

    def token_form(self) -> str:
        """Return the canonical token form (stem without vibhakti)."""
        ending = self.vibhakti
        if ending and self.surface.endswith(ending):
            # Strip the case ending off the surface form directly.
            return self.surface[: -len(ending)]
        # No strippable ending: fall back to the stem, then the surface.
        return self.stem or self.surface
|
|
|
|
|
|
|
|
|
class VidyutAnalyzer:
    """
    Morphological analyzer using Vidyut Kosha.

    Provides deterministic disambiguation for tokenization. All of the
    heuristic tables below are in SLP1 transliteration.
    """

    # Nominal case endings as (ending, tag) pairs. Matching always tries
    # longer endings first (tables are pre-sorted in __init__), so a
    # specific ending such as "asya" wins over the generic "sya".
    VIBHAKTI_ENDINGS = [
        ("asya", "Gen.Sg"), ("Aya", "Dat.Sg"), ("At", "Abl.Sg"),
        ("ena", "Ins.Sg"), ("e", "Loc.Sg"), ("aH", "Nom.Sg"),
        ("am", "Acc.Sg"), ("O", "Nom.Du"), ("ayoH", "Gen.Du"),
        # BUG FIX: was "ABym" (typo), which could never match real SLP1
        # text; the instrumental/dative/ablative dual ending is "AByAm".
        ("AByAm", "Ins.Du"), ("AH", "Nom.Pl"),
        # NOTE(review): SLP1 "-An" is normally Acc.Pl (Gen.Pl would be
        # "-AnAm"); tag kept as-is for compatibility — confirm intent.
        ("An", "Gen.Pl"),
        ("eByo", "Dat.Pl"), ("EH", "Ins.Pl"), ("ezu", "Loc.Pl"),

        # Feminine (ā-stem) endings.
        ("AyAH", "Gen.Sg.F"), ("AyAm", "Loc.Sg.F"), ("ayA", "Ins.Sg.F"),

        # Neuter plural endings.
        ("Ani", "Nom.Pl.N"), ("AnAm", "Gen.Pl.N"),

        # Generic fallbacks. NOTE(review): the second ("ya", "Loc") entry
        # is unreachable — the first match for an ending always wins.
        ("sya", "Gen"), ("ya", "Dat"), ("ya", "Loc"),
        ("m", "Acc"), ("H", "Nom.Sg"),
    ]

    # krt (primary/deverbal) suffixes as (SLP1 ending, suffix name).
    KRT_SUFFIXES = [
        ("tvA", "ktvā"),
        ("ya", "lyap"),
        ("ta", "kta"),
        ("tavat", "ktavat"),
        ("at", "śatṛ"),
        ("Ana", "śānac"),
        ("tum", "tumun"),
        ("ti", "ktin"),
        ("ana", "lyuṭ"),
        ("aka", "ṇvul"),
        ("in", "ṇini"),
        ("tṛ", "tṛc"),
    ]

    # taddhita (secondary/denominal) suffixes as (SLP1 ending, suffix name).
    TADDHITA_SUFFIXES = [
        ("tva", "tva"),
        ("tA", "tal"),
        ("maya", "mayaṭ"),
        ("vat", "vatup"),
        ("mat", "matup"),
        ("ika", "ṭhak"),
        ("Iya", "cha"),
        ("ya", "yat"),
    ]

    # Finite and participial verb endings used by _is_verb_form.
    # NOTE(review): "te" appears twice (present middle and participle
    # sections); harmless for a membership heuristic.
    VERBAL_ENDINGS = [
        # Present-tense personal endings.
        "ti", "anti", "si", "Ta", "mi", "maH", "vas", "mas",
        "te", "ante", "se", "Atte", "e", "mahi", "vahe", "mahe",
        # Participle endings.
        "anto", "antaH", "antam", "antI", "antau",
        "ayanto", "ayantaH", "ayantam",
        "mAnaH", "mAnam", "mAnA",
        "taH", "tam", "te", "tAni",
        "tavAn", "tavatI", "tavat",
    ]

    # Verbal prefixes (upasargas), SLP1.
    UPASARGAS = [
        "pra", "parA", "apa", "sam", "anu", "ava", "nis", "nir", "dus", "dur",
        "vi", "A", "ni", "aDi", "api", "ati", "su", "ut", "ud", "aBi", "prati",
        "pari", "upa",
    ]

    def __init__(self, preload_cache: bool = True):
        """Initialize analyzer.

        Args:
            preload_cache: when True (default), load the shared stem cache
                eagerly; when False it is loaded lazily on first lookup.
                (BUG FIX: this flag was previously accepted but ignored —
                the cache was always loaded eagerly.)
        """
        # Per-instance memo of analyze() results, keyed by surface form.
        self._parse_cache: Dict[str, List[MorphParse]] = {}

        # Pre-sort the heuristic tables once (longest-first) instead of
        # re-sorting them on every method call. Python's sort is stable,
        # so equal-length entries keep their original table order.
        self._vibhakti_by_len = sorted(self.VIBHAKTI_ENDINGS, key=lambda x: -len(x[0]))
        self._verbal_by_len = sorted(self.VERBAL_ENDINGS, key=len, reverse=True)
        self._upasarga_by_len = sorted(self.UPASARGAS, key=len, reverse=True)
        self._krt_by_len = sorted(self.KRT_SUFFIXES, key=lambda x: -len(x[0]))
        self._taddhita_by_len = sorted(self.TADDHITA_SUFFIXES, key=lambda x: -len(x[0]))

        if preload_cache:
            _load_stem_cache()

    @staticmethod
    def _atomic_parse(word: str, *, is_verb: bool = False, validated: bool = False) -> MorphParse:
        """Build a parse that treats *word* as a single undivided token."""
        return MorphParse(
            surface=word, stem=word, root=None, pratyaya=None,
            vibhakti=None, upasarga=None, is_compound=False,
            is_verb=is_verb, derivation_depth=0, kosha_validated=validated,
        )

    def _in_kosha(self, word: str) -> bool:
        """Check if word exists in the stem cache (O(1) lookup)."""
        if not _STEM_CACHE_LOADED:
            # Lazy load for analyzers created with preload_cache=False.
            _load_stem_cache()
        return word in _STEM_CACHE

    def _is_verb_form(self, word: str) -> bool:
        """
        Check if word is a verb form (tiṅanta/kṛdanta) that should be atomic.

        Rule 3: Verbal forms = single token, no SP, no splitting.
        """
        for ending in self._verbal_by_len:
            # Require a reasonably long remainder so short nouns that
            # merely end like a verb are not misclassified.
            if word.endswith(ending) and len(word) > len(ending) + 2:
                remainder = word[:-len(ending)]
                if len(remainder) >= 2:
                    return True
        return False

    def _extract_vibhakti(self, word: str) -> tuple:
        """Extract vibhakti ending from a word. Returns (stem, vibhakti)."""
        for ending, _ in self._vibhakti_by_len:
            if word.endswith(ending) and len(word) > len(ending) + 1:
                stem = word[:-len(ending)]
                # The bare remainder may have lost its stem-final vowel
                # when the ending attached; try restoring common finals
                # and prefer a kosha-validated stem.
                for suffix in ["", "a", "A", "i", "I", "u", "U"]:
                    candidate = stem + suffix
                    if self._in_kosha(candidate):
                        return (candidate, ending)
                # No validated restoration — return the raw remainder.
                return (stem, ending)
        return (word, None)

    def _extract_upasarga(self, word: str) -> tuple:
        """Extract upasarga prefix. Returns (upasarga, remainder)."""
        for upa in self._upasarga_by_len:
            if word.startswith(upa) and len(word) > len(upa) + 2:
                remainder = word[len(upa):]
                # Accept when the whole remainder is a known stem...
                if self._in_kosha(remainder):
                    return (upa, remainder)
                # ...or when some reasonable prefix of it is (the
                # remainder may still carry unstripped endings).
                for j in range(3, min(len(remainder), 10)):
                    if self._in_kosha(remainder[:j]):
                        return (upa, remainder)
        return (None, word)

    def _extract_pratyaya(self, word: str) -> tuple:
        """Extract kṛt/taddhita suffix. Returns (stem, pratyaya_type)."""
        # Try krt (deverbal) suffixes first, then taddhita (denominal).
        for suffix, ptype in self._krt_by_len:
            if word.endswith(suffix) and len(word) > len(suffix) + 1:
                stem = word[:-len(suffix)]
                # Deliberately permissive: the stem need not be in the
                # kosha, just plausibly long enough.
                if self._in_kosha(stem) or len(stem) >= 2:
                    return (stem, ptype)
        for suffix, ptype in self._taddhita_by_len:
            if word.endswith(suffix) and len(word) > len(suffix) + 1:
                stem = word[:-len(suffix)]
                if self._in_kosha(stem) or len(stem) >= 2:
                    return (stem, ptype)
        return (word, None)

    def analyze(self, word: str) -> List[MorphParse]:
        """
        Analyze a word and return all possible parses.

        Parses are sorted by preference (deterministic order).
        """
        # Degenerate input: empty or single-character words are atomic.
        if not word or len(word) < 2:
            return [self._atomic_parse(word)]

        if word in self._parse_cache:
            return self._parse_cache[word]

        parses = []

        # Verb forms are atomic — return immediately without attempting
        # any nominal analyses.
        if self._is_verb_form(word):
            parses.append(self._atomic_parse(word, is_verb=True, validated=True))
            self._parse_cache[word] = parses
            return parses

        # Exact kosha hit: the word is itself a known stem.
        if self._in_kosha(word):
            parses.append(self._atomic_parse(word, validated=True))

        # Candidate: stem + case ending.
        stem, vibhakti = self._extract_vibhakti(word)
        if vibhakti:
            parses.append(MorphParse(
                surface=word, stem=stem, root=None, pratyaya=None,
                vibhakti=vibhakti, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=1, kosha_validated=self._in_kosha(stem)
            ))

        # Candidate: upasarga prefix + remainder.
        upasarga, remainder = self._extract_upasarga(word)
        if upasarga:
            parses.append(MorphParse(
                surface=word, stem=remainder, root=None, pratyaya=None,
                vibhakti=None, upasarga=upasarga, is_compound=False,
                is_verb=False, derivation_depth=1, kosha_validated=self._in_kosha(remainder)
            ))

        # Candidate: stem + krt/taddhita suffix.
        prat_stem, pratyaya = self._extract_pratyaya(word)
        if pratyaya:
            parses.append(MorphParse(
                surface=word, stem=prat_stem, root=prat_stem, pratyaya=pratyaya,
                vibhakti=None, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=1, kosha_validated=self._in_kosha(prat_stem)
            ))

        # Fallback: treat the word as an unanalyzed atomic token.
        if not parses:
            parses.append(self._atomic_parse(word))

        parses = self._disambiguate(parses)
        self._parse_cache[word] = parses
        return parses

    def _disambiguate(self, parses: List[MorphParse]) -> List[MorphParse]:
        """
        Deterministic disambiguation. NO randomness, NO frequency.

        Priority:
        1. Prefer fewer derivational splits
        2. Prefer Kosha-validated stems
        3. Prefer non-compound over compound
        """
        def sort_key(p: MorphParse) -> tuple:
            return (
                p.derivation_depth,
                0 if p.kosha_validated else 1,
                1 if p.is_compound else 0,
            )

        # sorted() is stable, so parses that tie on all three criteria
        # keep their construction order — the ranking stays deterministic.
        return sorted(parses, key=sort_key)

    def get_best_parse(self, word: str) -> MorphParse:
        """Get the single best (deterministic) parse for a word."""
        parses = self.analyze(word)
        return parses[0] if parses else self._atomic_parse(word)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: analyze a handful of sample words and report the
    # winning parse for each.
    print("Testing VidyutAnalyzer...")
    smoke_analyzer = VidyutAnalyzer(preload_cache=True)

    samples = [
        "rAmaH", "gacCati", "paramAtma", "hfdpadmagataM",
        "sopAdhika", "bhAva", "abheda", "vicAraH",
    ]

    for sample in samples:
        best = smoke_analyzer.get_best_parse(sample)
        print(f" {sample:20} → stem: {best.stem:15} vibhakti: {best.vibhakti or '-':8} kosha: {best.kosha_validated}")
|
|
|
|