Spaces:
Sleeping
Sleeping
| """ | |
| Vidyut Morphological Analyzer | |
| Provides deterministic morphological analysis using Vidyut Kosha. | |
| """ | |
| import os | |
| import json | |
| from typing import Dict, List, Optional, Set | |
| from dataclasses import dataclass | |
# --- CONFIGURATION ---
# Both paths are anchored on the directory that contains this module:
# the Vidyut data directory sits one level above it, stems.json sits beside it.
_MODULE_DIR = os.path.dirname(__file__)
VIDYUT_DATA_DIR = os.path.join(os.path.dirname(_MODULE_DIR), "vidyut_data")
STEMS_FILE = os.path.join(_MODULE_DIR, "stems.json")

# --- FAST STEM CACHE (no Kosha disk I/O during tokenization) ---
# Process-wide set of known stems plus a flag recording whether it has been
# populated yet; both are managed by _load_stem_cache().
_STEM_CACHE: Set[str] = set()
_STEM_CACHE_LOADED = False
def _load_stem_cache():
    """Populate the module-level stem cache (idempotent).

    The first call seeds ``_STEM_CACHE`` with a small built-in stem list and
    then merges in the contents of ``STEMS_FILE`` when that file exists.
    Later calls return immediately.

    ``STEMS_FILE`` is expected to hold a JSON array of stem strings (for a
    JSON object, its keys are used). Non-string array entries are skipped so
    the cache only ever contains strings. A missing file is not an error; an
    unreadable or malformed file is reported on stdout and the built-in
    stems remain available.
    """
    # Only the flag is rebound; the cache set itself is mutated in place.
    global _STEM_CACHE_LOADED
    if _STEM_CACHE_LOADED:
        return

    # Common Sanskrit stems (hardcoded for immediate use)
    common_stems = {
        # Basic nouns
        "rAma", "sItA", "kfzRa", "arjuna", "deva", "brahma", "Atma", "Atman",
        "parama", "param", "para", "maha", "mahA", "rAja", "vana", "gfha",
        "hfd", "padma", "gata", "gam", "gacC", "ti", "aH", "am", "jYa",
        # Philosophical compounds
        "bhedAbheda", "bheda", "abheda", "vibhAga", "yoga", "vicAra",
        "sopAdhika", "pratyagAtman", "pratyag", "AbhAsa", "bhAsa",
        "kzetra", "kzetrajYa", "santoSa", "mokSa", "saMsAra", "jIva",
        "brahman", "paramAtman", "pratyaya", "pramANa", "anumAna",
        # Joining elements
        "sat", "asat", "cit", "Ananda", "satcitAnanda",
        # NO CYBER-YOGI STEMS - those need to be discovered compositionally!
    }
    _STEM_CACHE.update(common_stems)

    # Load from massive stems.json if available
    try:
        with open(STEMS_FILE, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except FileNotFoundError:
        # The stems file is optional; run on the built-in stems alone.
        pass
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and invalid UTF-8.
        print(f" VidyutAnalyzer: Stem cache load failed ({e})")
    else:
        if isinstance(payload, (list, dict)):
            # Iterating a dict yields its keys; anything that is not a
            # string can never equal a word being looked up, so drop it.
            _STEM_CACHE.update(s for s in payload if isinstance(s, str))
            print(f" VidyutAnalyzer: Loaded {len(_STEM_CACHE)} stems from cache")
        else:
            e = f"expected a JSON array or object, got {type(payload).__name__}"
            print(f" VidyutAnalyzer: Stem cache load failed ({e})")

    _STEM_CACHE_LOADED = True
@dataclass
class MorphParse:
    """A single morphological parse of a word.

    Instances are built with keyword arguments for every field, so the class
    must be a dataclass: without the decorator there is no ``__init__`` and
    construction raises ``TypeError``.
    """

    surface: str                 # Original surface form
    stem: str                    # The stem/prātipadika
    root: Optional[str]          # Dhātu if applicable
    pratyaya: Optional[str]      # Suffix (kṛt/taddhita)
    vibhakti: Optional[str]      # Case ending
    upasarga: Optional[str]      # Prefix
    is_compound: bool            # Is this a samāsa?
    is_verb: bool                # Is this a tiṅanta?
    derivation_depth: int        # Number of derivational steps
    kosha_validated: bool        # Is the stem in Kosha?

    def token_form(self) -> str:
        """Return the canonical token form for this parse.

        When a case ending was recognised and the surface form really ends
        with it, the ending is cut off the *surface* string (so the result
        plus ``vibhakti`` reproduces ``surface``). Otherwise the stem is
        returned, falling back to the surface form when the stem is empty.
        """
        if self.vibhakti and self.surface.endswith(self.vibhakti):
            return self.surface[:-len(self.vibhakti)]
        return self.stem if self.stem else self.surface
class VidyutAnalyzer:
    """
    Heuristic morphological analyzer backed by the module-level stem cache.

    Provides deterministic disambiguation for tokenization: the same word
    always yields the same ordered list of parses. All matching is plain
    suffix/prefix comparison against the tables below, which are written in
    an ASCII transliteration (presumably SLP1, e.g. ``rAmaH``, ``gacCati``);
    input words must use the same scheme.
    """

    # Nominal case endings (vibhakti markers). Only the ending string is
    # used by the analyzer; the label is informational.
    VIBHAKTI_ENDINGS = [
        # Masculine a-stem
        ("asya", "Gen.Sg"), ("Aya", "Dat.Sg"), ("At", "Abl.Sg"),
        ("ena", "Ins.Sg"), ("e", "Loc.Sg"), ("aH", "Nom.Sg"),
        ("am", "Acc.Sg"), ("O", "Nom.Du"), ("ayoH", "Gen.Du"),
        ("AByAm", "Ins.Du"), ("AH", "Nom.Pl"), ("An", "Acc.Pl"),
        ("eByo", "Dat.Pl"), ("EH", "Ins.Pl"), ("ezu", "Loc.Pl"),
        # Feminine ā-stem
        ("AyAH", "Gen.Sg.F"), ("AyAm", "Loc.Sg.F"), ("ayA", "Ins.Sg.F"),
        # Neuter
        ("Ani", "Nom.Pl.N"), ("AnAm", "Gen.Pl.N"),
        # Common short
        ("sya", "Gen"), ("ya", "Dat"), ("ya", "Loc"),
        ("m", "Acc"), ("H", "Nom.Sg"),
    ]

    # Kṛt pratyayas (verbal derivatives)
    KRT_SUFFIXES = [
        ("tvA", "ktvā"),       # Absolutive
        ("ya", "lyap"),        # Absolutive with prefix
        ("ta", "kta"),         # Past passive participle
        ("tavat", "ktavat"),   # Past active participle
        ("at", "śatṛ"),        # Present participle
        ("Ana", "śānac"),      # Present participle (ātm)
        ("tum", "tumun"),      # Infinitive
        ("ti", "ktin"),        # Action noun
        ("ana", "lyuṭ"),       # Action noun
        ("aka", "ṇvul"),       # Agent noun
        ("in", "ṇini"),        # Agent noun
        ("tf", "tṛc"),         # Agent noun (ASCII "f" = ṛ, as in "kfzRa")
    ]

    # Taddhita suffixes (nominal derivatives)
    TADDHITA_SUFFIXES = [
        ("tva", "tva"),        # Abstract noun -ness
        ("tA", "tal"),         # Abstract noun -ness
        ("maya", "mayaṭ"),     # Made of
        ("vat", "vatup"),      # Having
        ("mat", "matup"),      # Having
        ("ika", "ṭhak"),       # Related to
        ("Iya", "cha"),        # Related to
        ("ya", "yat"),         # Fitness
    ]

    # Verbal form endings (tiṅanta + participles) - treat as atomic
    VERBAL_ENDINGS = [
        # Finite verb endings (tiṅanta)
        "ti", "anti", "si", "Ta", "mi", "maH", "vas", "mas",
        "te", "ante", "se", "Atte", "e", "mahi", "vahe", "mahe",
        # Participial endings (kṛdanta declined)
        "anto", "antaH", "antam", "antI", "antau",   # Present participle
        "ayanto", "ayantaH", "ayantam",              # Causative participle
        "mAnaH", "mAnam", "mAnA",                    # Present/middle participle
        "taH", "tam", "te", "tAni",  # Past participle (removed tA - causes false positive on abstract nouns)
        "tavAn", "tavatI", "tavat",                  # Past active participle
        # Removed: "ya", "yam", "yaH" - too many false positives on abstract nouns
    ]

    # Upasargas (verbal prefixes)
    UPASARGAS = [
        "pra", "parA", "apa", "sam", "anu", "ava", "nis", "nir", "dus", "dur",
        "vi", "A", "ni", "aDi", "api", "ati", "su", "ut", "ud", "aBi", "prati",
        "pari", "upa",
    ]

    # Longest-first views of the tables, computed once at class creation
    # instead of on every lookup. sorted() is stable, so entries of equal
    # length keep their table order (kṛt before taddhita in the merged list).
    # A subclass that replaces one of the public tables must rebuild the
    # matching view as well.
    _VIBHAKTI_BY_LENGTH = sorted(VIBHAKTI_ENDINGS, key=lambda pair: -len(pair[0]))
    _UPASARGAS_BY_LENGTH = sorted(UPASARGAS, key=len, reverse=True)
    _PRATYAYA_BY_LENGTH = sorted(
        KRT_SUFFIXES + TADDHITA_SUFFIXES, key=lambda pair: -len(pair[0])
    )

    def __init__(self, preload_cache: bool = True):
        """Initialize analyzer with fast stem cache.

        Args:
            preload_cache: Accepted for backward compatibility. It is not
                consulted: the stem cache is always loaded here.
        """
        # Per-instance memo: word -> ordered parse list returned by analyze().
        self._parse_cache: Dict[str, List["MorphParse"]] = {}
        # Load stem cache on init (no-op after the first load)
        _load_stem_cache()

    def _in_kosha(self, word: str) -> bool:
        """Check if word exists in stem cache (set membership lookup)."""
        return word in _STEM_CACHE

    def _is_verb_form(self, word: str) -> bool:
        """
        Check if word is a verb form (tiṅanta/kṛdanta) that should be atomic.

        Rule 3: Verbal forms = single token, no SP, no splitting.

        Purely a shape test: the word must end in one of VERBAL_ENDINGS and
        leave at least three characters in front of that ending. The stem
        cache is not consulted.
        """
        return any(
            word.endswith(ending) and len(word) > len(ending) + 2
            for ending in self.VERBAL_ENDINGS
        )

    def _extract_vibhakti(self, word: str) -> tuple:
        """Extract vibhakti ending from a word. Returns (stem, vibhakti).

        The longest ending that matches and leaves at least two characters
        wins. The bare remainder is then tried as-is and with each of the
        vowels a/A/i/I/u/U appended; the first variant found in the stem
        cache is returned as the stem. If none is known, the bare remainder
        is returned. With no matching ending the result is ``(word, None)``.
        """
        for ending, _label in self._VIBHAKTI_BY_LENGTH:
            if not word.endswith(ending) or len(word) <= len(ending) + 1:
                continue
            bare = word[:-len(ending)]
            # Validate stem exists, restoring a final vowel if needed
            for final_vowel in ("", "a", "A", "i", "I", "u", "U"):
                candidate = bare + final_vowel
                if self._in_kosha(candidate):
                    return (candidate, ending)
            # Return anyway with original stem
            return (bare, ending)
        return (word, None)

    def _extract_upasarga(self, word: str) -> tuple:
        """Extract upasarga prefix. Returns (upasarga, remainder).

        A prefix is accepted only if what follows it is a known stem, or
        begins with a known stem of 3 to 9 characters that is shorter than
        the remainder itself. This avoids false positives like
        pratyag → prati + junk. Returns ``(None, word)`` when nothing fits.
        """
        for upa in self._UPASARGAS_BY_LENGTH:
            if not word.startswith(upa) or len(word) <= len(upa) + 2:
                continue
            remainder = word[len(upa):]
            if self._in_kosha(remainder):
                return (upa, remainder)
            # Also check if remainder starts with a valid stem
            for cut in range(3, min(len(remainder), 10)):
                if self._in_kosha(remainder[:cut]):
                    return (upa, remainder)
        return (None, word)

    def _extract_pratyaya(self, word: str) -> tuple:
        """Extract kṛt/taddhita suffix. Returns (stem, pratyaya_type).

        Kṛt and taddhita suffixes are matched together, longest first, so a
        short kṛt suffix such as "ya" or "at" cannot shadow a longer
        taddhita suffix such as "maya" or "vat". On equal length the kṛt
        reading is preferred. The match must leave a stem of at least two
        characters; the stem is not checked against the stem cache here
        (callers record that separately). Returns ``(word, None)`` when no
        suffix matches.
        """
        for suffix, ptype in self._PRATYAYA_BY_LENGTH:
            if word.endswith(suffix) and len(word) > len(suffix) + 1:
                return (word[:-len(suffix)], ptype)
        return (word, None)

    @staticmethod
    def _surface_parse(word, is_verb: bool = False,
                       kosha_validated: bool = False) -> "MorphParse":
        """Build an unsegmented parse whose stem is the word itself."""
        return MorphParse(
            surface=word, stem=word, root=None, pratyaya=None,
            vibhakti=None, upasarga=None, is_compound=False,
            is_verb=is_verb, derivation_depth=0,
            kosha_validated=kosha_validated,
        )

    def _derived_parse(self, word: str, stem: str, root: Optional[str] = None,
                       pratyaya: Optional[str] = None,
                       vibhakti: Optional[str] = None,
                       upasarga: Optional[str] = None) -> "MorphParse":
        """Build a one-step segmentation parse, validating *stem* in the cache."""
        return MorphParse(
            surface=word, stem=stem, root=root, pratyaya=pratyaya,
            vibhakti=vibhakti, upasarga=upasarga, is_compound=False,
            is_verb=False, derivation_depth=1,
            kosha_validated=self._in_kosha(stem),
        )

    def analyze(self, word: str) -> List["MorphParse"]:
        """
        Analyze a word and return all possible parses.

        Parses are sorted by preference (deterministic order); see
        ``_disambiguate``. The list is never empty.

        - Empty or one-character input yields a single unvalidated surface
          parse and is not cached.
        - A word recognised by ``_is_verb_form`` yields exactly one atomic
          parse with ``is_verb=True`` (and ``kosha_validated=True``, although
          no cache lookup takes place).
        - Otherwise up to four candidates are collected: whole-word cache
          hit, vibhakti split, upasarga split and pratyaya split.

        Results are memoised per instance; the cached list object itself is
        returned, so callers should not mutate it.
        """
        if not word or len(word) < 2:
            return [self._surface_parse(word)]

        cached = self._parse_cache.get(word)
        if cached is not None:
            return cached

        # Parse 0: Verb form detection (Rule 3 - atomic verbs)
        # Check this FIRST so is_verb flag is set for downstream logic
        if self._is_verb_form(word):
            # Return early - verb forms are atomic
            parses = [self._surface_parse(word, is_verb=True, kosha_validated=True)]
            self._parse_cache[word] = parses
            return parses

        parses = []

        # Parse 1: Direct Kosha lookup (simplest)
        if self._in_kosha(word):
            parses.append(self._surface_parse(word, kosha_validated=True))

        # Parse 2: Vibhakti extraction
        stem, vibhakti = self._extract_vibhakti(word)
        if vibhakti:
            parses.append(self._derived_parse(word, stem, vibhakti=vibhakti))

        # Parse 3: Upasarga + stem
        upasarga, remainder = self._extract_upasarga(word)
        if upasarga:
            parses.append(self._derived_parse(word, remainder, upasarga=upasarga))

        # Parse 4: Pratyaya extraction (the stripped stem doubles as root)
        prat_stem, pratyaya = self._extract_pratyaya(word)
        if pratyaya:
            parses.append(self._derived_parse(
                word, prat_stem, root=prat_stem, pratyaya=pratyaya))

        # Fallback: surface form as stem
        if not parses:
            parses.append(self._surface_parse(word))

        # Sort by preference (deterministic)
        parses = self._disambiguate(parses)
        self._parse_cache[word] = parses
        return parses

    def _disambiguate(self, parses: List["MorphParse"]) -> List["MorphParse"]:
        """
        Deterministic disambiguation. NO randomness, NO frequency.

        Returns a new list ordered by, in priority:
        1. Fewer derivational splits
        2. Kosha-validated stems before unvalidated ones
        3. Non-compound before compound

        The sort is stable, so parses that tie keep their input order.
        """
        return sorted(
            parses,
            key=lambda p: (
                p.derivation_depth,
                0 if p.kosha_validated else 1,
                1 if p.is_compound else 0,
            ),
        )

    def get_best_parse(self, word: str) -> "MorphParse":
        """Get the single best (deterministic) parse for a word."""
        parses = self.analyze(word)
        if parses:
            return parses[0]
        # analyze() never returns an empty list; kept as a defensive default.
        return self._surface_parse(word)
# --- TEST ---
# Manual smoke test: prints the preferred parse for a handful of sample words
# when the module is executed directly.
if __name__ == "__main__":
    print("Testing VidyutAnalyzer...")
    demo = VidyutAnalyzer(preload_cache=True)
    samples = (
        "rAmaH", "gacCati", "paramAtma", "hfdpadmagataM",
        "sopAdhika", "bhAva", "abheda", "vicAraH",
    )
    for word in samples:
        parse = demo.get_best_parse(word)
        print(f" {word:20} → stem: {parse.stem:15} vibhakti: {parse.vibhakti or '-':8} kosha: {parse.kosha_validated}")