Upload folder using huggingface_hub
Browse files- README.md +76 -0
- special_tokens_map.json +9 -0
- src/__init__.py +19 -0
- src/analyzer.py +339 -0
- src/splitter.py +722 -0
- src/tokenizer.py +509 -0
- stems.json +0 -0
- tokenizer_config.json +14 -0
- tokenizer_hf.py +128 -0
- vocab.json +0 -0
README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Panini Tokenizer
|
| 2 |
+
|
| 3 |
+
**The first grammar-first Sanskrit tokenizer based on Pāṇinian morphological analysis.**
|
| 4 |
+
|
| 5 |
+
## 🚨 The Problem
|
| 6 |
+
|
| 7 |
+
Statistical tokenizers (BPE/WordPiece) systematically underperform on Sanskrit because they do not model **Sandhi** (phonetic fusion).
|
| 8 |
+
|
| 9 |
+
* **Standard Models (BERT/Qwen):** fracture complex words into phonetic noise (`##k`, `##z`, `##ab`).
|
| 10 |
+
* **Panini Tokenizer:** uses recursive morphological parsing to recover the original **semantic roots** (`nirapekza` + `jYAna`).
|
| 11 |
+
|
| 12 |
+
## ⚡ Key Features
|
| 13 |
+
|
| 14 |
+
* 🔤 **Vocab:** 128k dictionary-backed tokens (Monier-Williams).
|
| 15 |
+
* 🔄 **Sandhi Reversal:** Automatically splits fused compounds (e.g., `t` → `d`, `i` → `y`).
|
| 16 |
+
* 🧩 **Semantic Atomicism:** Preserves complex philosophical concepts as single tokens. This aligns token boundaries with linguistic meaning, reducing gradient noise during training.
|
| 17 |
+
* 📉 **Efficiency:** Reduces token count by **2-4x** compared to multilingual models.
|
| 18 |
+
|
| 19 |
+
## 🚀 Quick Start
|
| 20 |
+
|
| 21 |
+
No custom installation required. Use directly with Hugging Face `transformers`:
|
| 22 |
+
**Note:** The model expects **SLP1 transliteration** (e.g., `vidyA`), not Devanagari.
|
| 23 |
+
```python
|
| 24 |
+
from transformers import AutoTokenizer
|
| 25 |
+
|
| 26 |
+
# Load with trust_remote_code=True because of custom logic
|
| 27 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 28 |
+
"ArthaLabs/panini-tokenizer",
|
| 29 |
+
trust_remote_code=True
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
# Tokenize complex Sandhi compounds (SLP1 input)
|
| 33 |
+
text = "nirapekzajYAnasAkzAtkArasAmarthyam"
|
| 34 |
+
tokens = tokenizer.tokenize(text)
|
| 35 |
+
|
| 36 |
+
print(tokens)
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## 📊 Benchmarks: The "Context Dividend"
|
| 40 |
+
|
| 41 |
+
By strictly adhering to grammar, Panini Tokenizer drastically reduces sequence length, effectively **tripling the context window** for downstream tasks.
|
| 42 |
+
|
| 43 |
+
| Input Compound | **Panini (Ours)** | Google MuRIL | Qwen2 |
|
| 44 |
+
| --- | --- | --- | --- |
|
| 45 |
+
| `nirapekzajYAnasAkzAtkArasAmarthyam` | **6** | 18 | 25 |
|
| 46 |
+
| `tadekaniScitArthavyavasthApanam` | **6** | 13 | 18 |
|
| 47 |
+
| `svaprakASatvaparaprakASavyavacCedaH` | **7** | 15 | 22 |
|
| 48 |
+
| `svAtantryAbhAvasamucchinnakartRtvanirAsaH` | **8** | 19 | 25 |
|
| 49 |
+
|
| 50 |
+
### Visual Comparison
|
| 51 |
+
|
| 52 |
+
**Input:** *Independent-knowledge-direct-realization-capacity*
|
| 53 |
+
|
| 54 |
+
* **Panini:** `▁nirapekza` | `jYAna` | `sAkzAtkAra` | `sAman` | `arthy` | `am` (6 meaningful roots)
|
| 55 |
+
* **Sanskrit-BERT:** `nirape` | `##k` | `##z` | `##a` | `##jya` | `##nas`... (14 noise fragments)
|
| 56 |
+
|
| 57 |
+
## 🛠️ Technical Details
|
| 58 |
+
|
| 59 |
+
* **Architecture:** Recursive Descent Splitter + Kosha (Dictionary) Lookup.
|
| 60 |
+
* **Vocab Size:** 128,000.
|
| 61 |
+
* **Fallback:** Deterministic character-level fallback, used only when grammatical analysis fails.
|
| 62 |
+
## 📜 Citation
|
| 63 |
+
|
| 64 |
+
```bibtex
|
| 65 |
+
@misc{panini2025,
|
| 66 |
+
author = {ArthaLabs},
|
| 67 |
+
title = {Panini Tokenizer: Grammar-First Sanskrit Tokenization},
|
| 68 |
+
year = {2025},
|
| 69 |
+
publisher = {Hugging Face},
|
| 70 |
+
howpublished = {\url{https://huggingface.co/ArthaLabs/panini-tokenizer}}
|
| 71 |
+
}
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## License
|
| 75 |
+
|
| 76 |
+
Apache 2.0
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"unk_token": "<unk>",
|
| 3 |
+
"pad_token": "<pad>",
|
| 4 |
+
"bos_token": "<bos>",
|
| 5 |
+
"eos_token": "<eos>",
|
| 6 |
+
"mask_token": "<mask>",
|
| 7 |
+
"sep_token": "<sep>",
|
| 8 |
+
"cls_token": "<cls>"
|
| 9 |
+
}
|
src/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Panini Tokenizer V3
|
| 3 |
+
Morphology-aware Sanskrit tokenizer using Vidyut.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .analyzer import VidyutAnalyzer, MorphParse
|
| 7 |
+
from .splitter import SamasaSplitter, CompoundSplit
|
| 8 |
+
from .tokenizer import PaniniTokenizerV3, create_tokenizer
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"VidyutAnalyzer",
|
| 12 |
+
"MorphParse",
|
| 13 |
+
"SamasaSplitter",
|
| 14 |
+
"CompoundSplit",
|
| 15 |
+
"PaniniTokenizerV3",
|
| 16 |
+
"create_tokenizer",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
__version__ = "3.0.0"
|
src/analyzer.py
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vidyut Morphological Analyzer
|
| 3 |
+
Provides deterministic morphological analysis using Vidyut Kosha.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
from typing import Dict, List, Optional, Set
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
|
| 11 |
+
# --- CONFIGURATION ---
VIDYUT_DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vidyut_data")
STEMS_FILE = os.path.join(os.path.dirname(__file__), "stems.json")

# --- FAST STEM CACHE (no Kosha disk I/O during tokenization) ---
_STEM_CACHE: set = set()
_STEM_CACHE_LOADED = False


def _load_stem_cache():
    """Populate the module-level stem cache for O(1) lookups (idempotent).

    Seeds the cache with a small hardcoded set of common stems so the
    analyzer works immediately, then merges in the full stem list from
    ``stems.json`` when that file is present.
    """
    global _STEM_CACHE, _STEM_CACHE_LOADED
    if _STEM_CACHE_LOADED:
        return

    # Common Sanskrit stems (hardcoded for immediate use even when
    # stems.json is missing).
    seed_stems = {
        # Basic nouns
        "rAma", "sItA", "kfzRa", "arjuna", "deva", "brahma", "Atma", "Atman",
        "parama", "param", "para", "maha", "mahA", "rAja", "vana", "gfha",
        "hfd", "padma", "gata", "gam", "gacC", "ti", "aH", "am", "jYa",
        # Philosophical compounds
        "bhedAbheda", "bheda", "abheda", "vibhAga", "yoga", "vicAra",
        "sopAdhika", "pratyagAtman", "pratyag", "Atman", "AbhAsa", "bhAsa",
        "kzetra", "kzetrajYa", "santoSa", "mokSa", "saMsAra", "jIva",
        "brahman", "paramAtman", "pratyaya", "pramANa", "anumAna",
        # Joining elements
        "sat", "asat", "cit", "Ananda", "satcitAnanda",
        # Deliberately no project-specific neologisms here -- those must be
        # discovered compositionally by the splitter.
    }
    _STEM_CACHE.update(seed_stems)

    # Merge the large stem list shipped alongside this module, if available.
    if os.path.exists(STEMS_FILE):
        try:
            with open(STEMS_FILE, "r", encoding="utf-8") as handle:
                _STEM_CACHE.update(json.load(handle))
            print(f" VidyutAnalyzer: Loaded {len(_STEM_CACHE)} stems from cache")
        except Exception as e:
            # Best-effort load: the hardcoded seed stems keep things usable.
            print(f" VidyutAnalyzer: Stem cache load failed ({e})")

    _STEM_CACHE_LOADED = True
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass
class MorphParse:
    """One candidate morphological analysis of a surface word."""
    surface: str              # Original surface form as given
    stem: str                 # The stem / prātipadika
    root: Optional[str]       # Dhātu, when derivable
    pratyaya: Optional[str]   # Suffix (kṛt / taddhita)
    vibhakti: Optional[str]   # Case ending, when one was stripped
    upasarga: Optional[str]   # Verbal prefix
    is_compound: bool         # True when parsed as a samāsa
    is_verb: bool             # True when parsed as a tiṅanta
    derivation_depth: int     # Number of derivational steps applied
    kosha_validated: bool     # True when the stem was found in the Kosha

    def token_form(self) -> str:
        """Return the canonical token form (surface stripped of its vibhakti).

        Falls back to the stem (or, if the stem is empty, the raw
        surface) when no vibhakti was identified.
        """
        ending = self.vibhakti
        if ending and self.surface.endswith(ending):
            return self.surface[: -len(ending)]
        return self.stem or self.surface
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class VidyutAnalyzer:
    """
    Morphological analyzer using Vidyut Kosha.
    Provides deterministic disambiguation for tokenization.

    Stem membership checks go through the module-level stem cache
    (populated by ``_load_stem_cache``); no disk I/O happens per word.
    """

    # Nominal case endings (vibhakti markers) as (ending, gloss) pairs.
    # Only the ending string is used for stripping; the gloss is
    # informational. NOTE(review): "ABym" looks like a typo for "AByAm"
    # (Ins.Du); ("An", "Gen.Pl") is conventionally Acc.Pl; and ("ya", ...)
    # appears twice with different glosses -- confirm against a
    # declension table.
    VIBHAKTI_ENDINGS = [
        # Masculine a-stem
        ("asya", "Gen.Sg"), ("Aya", "Dat.Sg"), ("At", "Abl.Sg"),
        ("ena", "Ins.Sg"), ("e", "Loc.Sg"), ("aH", "Nom.Sg"),
        ("am", "Acc.Sg"), ("O", "Nom.Du"), ("ayoH", "Gen.Du"),
        ("ABym", "Ins.Du"), ("AH", "Nom.Pl"), ("An", "Gen.Pl"),
        ("eByo", "Dat.Pl"), ("EH", "Ins.Pl"), ("ezu", "Loc.Pl"),
        # Feminine ā-stem
        ("AyAH", "Gen.Sg.F"), ("AyAm", "Loc.Sg.F"), ("ayA", "Ins.Sg.F"),
        # Neuter
        ("Ani", "Nom.Pl.N"), ("AnAm", "Gen.Pl.N"),
        # Common short
        ("sya", "Gen"), ("ya", "Dat"), ("ya", "Loc"),
        ("m", "Acc"), ("H", "Nom.Sg"),
    ]

    # Kṛt pratyayas (verbal derivatives) as (suffix, traditional name).
    # NOTE(review): "tṛ" is IAST while the rest are SLP1; it can never
    # match SLP1 input (SLP1 would be "tf") -- verify intended spelling.
    KRT_SUFFIXES = [
        ("tvA", "ktvā"),       # Absolutive
        ("ya", "lyap"),        # Absolutive with prefix
        ("ta", "kta"),         # Past passive participle
        ("tavat", "ktavat"),   # Past active participle
        ("at", "śatṛ"),        # Present participle
        ("Ana", "śānac"),      # Present participle (ātm)
        ("tum", "tumun"),      # Infinitive
        ("ti", "ktin"),        # Action noun
        ("ana", "lyuṭ"),       # Action noun
        ("aka", "ṇvul"),       # Agent noun
        ("in", "ṇini"),        # Agent noun
        ("tṛ", "tṛc"),         # Agent noun
    ]

    # Taddhita suffixes (nominal derivatives).
    TADDHITA_SUFFIXES = [
        ("tva", "tva"),        # Abstract noun -ness
        ("tA", "tal"),         # Abstract noun -ness
        ("maya", "mayaṭ"),     # Made of
        ("vat", "vatup"),      # Having
        ("mat", "matup"),      # Having
        ("ika", "ṭhak"),       # Related to
        ("Iya", "cha"),        # Related to
        ("ya", "yat"),         # Fitness
    ]

    # Verbal form endings (tiṅanta + declined participles). A word ending
    # in one of these (with enough preceding material) is treated as
    # atomic: one token, no splitting.
    VERBAL_ENDINGS = [
        # Finite verb endings (tiṅanta)
        "ti", "anti", "si", "Ta", "mi", "maH", "vas", "mas",
        "te", "ante", "se", "Atte", "e", "mahi", "vahe", "mahe",
        # Participial endings (kṛdanta declined)
        "anto", "antaH", "antam", "antI", "antau",  # Present participle
        "ayanto", "ayantaH", "ayantam",  # Causative participle
        "mAnaH", "mAnam", "mAnA",  # Present/middle participle
        "taH", "tam", "te", "tAni",  # Past participle (removed "tA" - caused false positives on abstract nouns)
        "tavAn", "tavatI", "tavat",  # Past active participle
        # Removed: "ya", "yam", "yaH" - too many false positives on abstract nouns
    ]

    # Upasargas (verbal prefixes); matched longest-first in _extract_upasarga.
    UPASARGAS = [
        "pra", "parA", "apa", "sam", "anu", "ava", "nis", "nir", "dus", "dur",
        "vi", "A", "ni", "aDi", "api", "ati", "su", "ut", "ud", "aBi", "prati",
        "pari", "upa",
    ]

    def __init__(self, preload_cache: bool = True):
        """Initialize analyzer with fast stem cache.

        Args:
            preload_cache: Accepted for API compatibility but currently
                ignored -- the (idempotent, module-global) stem cache is
                always loaded here. NOTE(review): honoring False would
                leave _in_kosha empty unless another instance loaded the
                cache first; confirm before changing.
        """
        # Per-instance memo of analyze() results.
        self._parse_cache: Dict[str, List[MorphParse]] = {}

        # Load stem cache on init (no-op after the first call process-wide).
        _load_stem_cache()

    def _in_kosha(self, word: str) -> bool:
        """Check if word exists in stem cache (O(1) set lookup)."""
        return word in _STEM_CACHE

    def _is_verb_form(self, word: str) -> bool:
        """
        Check if word is a verb form (tiṅanta/kṛdanta) that should be atomic.
        Rule 3: Verbal forms = single token, no SP, no splitting.
        """
        # Sort by length (longest first) to avoid partial matches.
        for ending in sorted(self.VERBAL_ENDINGS, key=len, reverse=True):
            if word.endswith(ending) and len(word) > len(ending) + 2:
                # Check if the remainder looks like a valid root/stem.
                remainder = word[:-len(ending)]
                # Simple heuristic: remainder of >= 2 chars suggests a verb
                # form. (Always true here given the length guard above.)
                if len(remainder) >= 2:
                    return True
        return False

    def _extract_vibhakti(self, word: str) -> tuple:
        """Extract vibhakti ending from a word. Returns (stem, vibhakti).

        Endings are tried longest-first; on a match, the Kosha is probed
        with the bare stem and a few thematic vowels appended, to recover
        the citation form. Returns (word, None) when nothing matches.
        """
        for ending, _ in sorted(self.VIBHAKTI_ENDINGS, key=lambda x: -len(x[0])):
            if word.endswith(ending) and len(word) > len(ending) + 1:
                stem = word[:-len(ending)]
                # Validate stem: the bare stem or stem + thematic vowel may
                # be the dictionary form.
                for suffix in ["", "a", "A", "i", "I", "u", "U"]:
                    test = stem + suffix
                    if self._in_kosha(test):
                        return (test, ending)
                # No Kosha hit: return the raw stripped stem anyway.
                return (stem, ending)
        return (word, None)

    def _extract_upasarga(self, word: str) -> tuple:
        """Extract upasarga prefix. Returns (upasarga, remainder).

        Only accepts a prefix when the remainder (or one of its leading
        substrings of length 3..9) is a known stem -- this avoids false
        positives like pratyag -> prati + junk. Returns (None, word) when
        no prefix qualifies.
        """
        for upa in sorted(self.UPASARGAS, key=len, reverse=True):
            if word.startswith(upa) and len(word) > len(upa) + 2:
                remainder = word[len(upa):]
                # Strengthened validation: require Kosha match or valid prefix.
                # Avoids false positives like pratyag → prati + junk.
                if self._in_kosha(remainder):
                    return (upa, remainder)
                # Also check if remainder starts with a valid stem.
                for j in range(3, min(len(remainder), 10)):
                    if self._in_kosha(remainder[:j]):
                        return (upa, remainder)
        return (None, word)

    def _extract_pratyaya(self, word: str) -> tuple:
        """Extract kṛt/taddhita suffix. Returns (stem, pratyaya_type).

        Kṛt suffixes are tried before taddhita, each set longest-first.
        NOTE(review): the `len(stem) >= 2` fallback accepts stems that are
        NOT in the Kosha, so this match is deliberately permissive.
        Returns (word, None) when no suffix applies.
        """
        # Try kṛt (verbal derivative) suffixes first.
        for suffix, ptype in sorted(self.KRT_SUFFIXES, key=lambda x: -len(x[0])):
            if word.endswith(suffix) and len(word) > len(suffix) + 1:
                stem = word[:-len(suffix)]
                if self._in_kosha(stem) or len(stem) >= 2:
                    return (stem, ptype)

        # Then taddhita (nominal derivative) suffixes.
        for suffix, ptype in sorted(self.TADDHITA_SUFFIXES, key=lambda x: -len(x[0])):
            if word.endswith(suffix) and len(word) > len(suffix) + 1:
                stem = word[:-len(suffix)]
                if self._in_kosha(stem) or len(stem) >= 2:
                    return (stem, ptype)

        return (word, None)

    def analyze(self, word: str) -> List[MorphParse]:
        """
        Analyze a word and return all possible parses.
        Parses are sorted by preference (deterministic order); results
        are memoized per instance in self._parse_cache.
        """
        # Degenerate input: return a single pass-through parse.
        if not word or len(word) < 2:
            return [MorphParse(
                surface=word, stem=word, root=None, pratyaya=None,
                vibhakti=None, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=0, kosha_validated=False
            )]

        if word in self._parse_cache:
            return self._parse_cache[word]

        parses = []

        # Parse 0: Verb form detection (Rule 3 - atomic verbs).
        # Checked FIRST so is_verb is set for downstream logic and the
        # word short-circuits all further analysis.
        if self._is_verb_form(word):
            parses.append(MorphParse(
                surface=word, stem=word, root=None, pratyaya=None,
                vibhakti=None, upasarga=None, is_compound=False,
                is_verb=True, derivation_depth=0, kosha_validated=True
            ))
            # Return early - verb forms are atomic.
            self._parse_cache[word] = parses
            return parses

        # Parse 1: Direct Kosha lookup (simplest, derivation depth 0).
        if self._in_kosha(word):
            parses.append(MorphParse(
                surface=word, stem=word, root=None, pratyaya=None,
                vibhakti=None, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=0, kosha_validated=True
            ))

        # Parse 2: Vibhakti (case ending) extraction.
        stem, vibhakti = self._extract_vibhakti(word)
        if vibhakti:
            parses.append(MorphParse(
                surface=word, stem=stem, root=None, pratyaya=None,
                vibhakti=vibhakti, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=1, kosha_validated=self._in_kosha(stem)
            ))

        # Parse 3: Upasarga (prefix) + stem.
        upasarga, remainder = self._extract_upasarga(word)
        if upasarga:
            parses.append(MorphParse(
                surface=word, stem=remainder, root=None, pratyaya=None,
                vibhakti=None, upasarga=upasarga, is_compound=False,
                is_verb=False, derivation_depth=1, kosha_validated=self._in_kosha(remainder)
            ))

        # Parse 4: Pratyaya (derivational suffix) extraction.
        prat_stem, pratyaya = self._extract_pratyaya(word)
        if pratyaya:
            parses.append(MorphParse(
                surface=word, stem=prat_stem, root=prat_stem, pratyaya=pratyaya,
                vibhakti=None, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=1, kosha_validated=self._in_kosha(prat_stem)
            ))

        # Fallback: surface form as stem (never return an empty list).
        if not parses:
            parses.append(MorphParse(
                surface=word, stem=word, root=None, pratyaya=None,
                vibhakti=None, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=0, kosha_validated=False
            ))

        # Sort by preference (deterministic).
        parses = self._disambiguate(parses)

        self._parse_cache[word] = parses
        return parses

    def _disambiguate(self, parses: List[MorphParse]) -> List[MorphParse]:
        """
        Deterministic disambiguation. NO randomness, NO frequency.

        Priority:
        1. Prefer fewer derivational splits
        2. Prefer Kosha-validated stems
        3. Prefer non-compound over compound
        """
        def sort_key(p: MorphParse) -> tuple:
            return (
                p.derivation_depth,             # Fewer splits first
                0 if p.kosha_validated else 1,  # Kosha-validated first
                1 if p.is_compound else 0,      # Non-compound first
            )

        # sorted() is stable, so ties keep their insertion order
        # (direct lookup before vibhakti before upasarga before pratyaya).
        return sorted(parses, key=sort_key)

    def get_best_parse(self, word: str) -> MorphParse:
        """Get the single best (deterministic) parse for a word."""
        parses = self.analyze(word)
        # analyze() always returns at least one parse; the fallback below
        # is defensive.
        return parses[0] if parses else MorphParse(
            surface=word, stem=word, root=None, pratyaya=None,
            vibhakti=None, upasarga=None, is_compound=False,
            is_verb=False, derivation_depth=0, kosha_validated=False
        )
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
# --- TEST ---
def _demo() -> None:
    """Smoke-test the analyzer on a few representative words."""
    print("Testing VidyutAnalyzer...")
    analyzer = VidyutAnalyzer(preload_cache=True)

    test_words = [
        "rAmaH", "gacCati", "paramAtma", "hfdpadmagataM",
        "sopAdhika", "bhAva", "abheda", "vicAraH",
    ]

    for word in test_words:
        parse = analyzer.get_best_parse(word)
        print(f" {word:20} → stem: {parse.stem:15} vibhakti: {parse.vibhakti or '-':8} kosha: {parse.kosha_validated}")


if __name__ == "__main__":
    _demo()
|
src/splitter.py
ADDED
|
@@ -0,0 +1,722 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Samāsa (Compound) Splitter
|
| 3 |
+
Detects and splits Sanskrit compound words at their boundaries.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Tuple, Optional
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
# Import analyzer for Kosha access
|
| 10 |
+
from .analyzer import VidyutAnalyzer, MorphParse
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class CompoundSplit:
    """Result of compound (samāsa) splitting."""
    surface: str                  # Original compound as given
    components: List[str]         # Split components, in order
    split_points: List[int]       # Character positions of the splits
    is_compound: bool             # False when no valid split was found
    compound_type: Optional[str]  # tatpuruṣa, dvandva, bahuvrīhi, etc. (if known)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class SamasaSplitter:
|
| 24 |
+
"""
|
| 25 |
+
Splits Sanskrit compound words (samāsa) at their boundaries.
|
| 26 |
+
Uses Kosha lookups to validate potential split points.
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
# Common compound final elements (uttarapada patterns)
|
| 30 |
+
COMPOUND_FINALS = [
|
| 31 |
+
"kara", "kAra", "kArin", "kft", "kftya", # Doer
|
| 32 |
+
"gata", "gati", "gamana", # Going
|
| 33 |
+
"ja", "jAta", "janman", # Born
|
| 34 |
+
"Da", "DAra", "DAraka", "DArin", # Holding
|
| 35 |
+
"maya", "mat", "vat", # Having/made of
|
| 36 |
+
"pati", "nATa", "ISvara", "adhipa", # Lord
|
| 37 |
+
"Atman", "rUpa", "svarUpa", # Self/form
|
| 38 |
+
"pada", "pAduka", # Foot/step
|
| 39 |
+
"stha", "sthita", "sthAna", # Standing/place
|
| 40 |
+
"yukta", "hIna", "rahita", # With/without
|
| 41 |
+
"priya", "rata", "ASrita", # Loving/devoted
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
# Common compound first elements (pūrvapada patterns)
|
| 45 |
+
COMPOUND_INITIALS = [
|
| 46 |
+
"mahA", "ati", "su", "dur", "sat", "a", "an", # Prefixes
|
| 47 |
+
"sarva", "viSva", "eka", "bahu", # All/one/many
|
| 48 |
+
"deva", "brahma", "Atma", "para", # Divine/supreme
|
| 49 |
+
"rAja", "mahI", "loka", # King/earth/world
|
| 50 |
+
"hfd", "manas", "citta", # Heart/mind
|
| 51 |
+
"padma", "kamala", # Lotus
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
|
| 55 |
+
"""Initialize with optional shared analyzer."""
|
| 56 |
+
self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
|
| 57 |
+
|
| 58 |
+
# Sandhi reversal rules: (surface_ending, possible_original_endings)
|
| 59 |
+
# These are common consonant/vowel Sandhi transformations to reverse
|
| 60 |
+
SANDHI_REVERSIONS = {
|
| 61 |
+
# Consonant Sandhi (final consonant before vowel)
|
| 62 |
+
'd': ['t', 'd'], # vidyud -> vidyut
|
| 63 |
+
'g': ['k', 'g'], # vAg -> vAk
|
| 64 |
+
'b': ['p', 'b'], # ap -> ab (water)
|
| 65 |
+
'D': ['T', 'D'], #
|
| 66 |
+
'j': ['c', 'j'], #
|
| 67 |
+
'z': ['s', 'z'], #
|
| 68 |
+
# Vowel Sandhi (vowel combinations)
|
| 69 |
+
'A': ['a', 'A'], # a+a -> A
|
| 70 |
+
'I': ['i', 'I'], # i+i -> I
|
| 71 |
+
'U': ['u', 'U'], # u+u -> U
|
| 72 |
+
'e': ['a', 'i'], # a+i -> e
|
| 73 |
+
'o': ['a', 'u'], # a+u -> o
|
| 74 |
+
'ai': ['a', 'e'], # a+e -> ai
|
| 75 |
+
'au': ['a', 'o'], # a+o -> au
|
| 76 |
+
# Consonant clusters
|
| 77 |
+
'cC': ['t', 'c'], # t+c -> cC
|
| 78 |
+
'jj': ['d', 'j'], # d+j -> jj
|
| 79 |
+
'DD': ['D', 'D'], #
|
| 80 |
+
# Visarga Sandhi
|
| 81 |
+
'o': ['aH'], # aH + vowel -> o
|
| 82 |
+
'ar': ['aH'], # aH + r -> ar
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
def _try_sandhi_reversal(self, surface: str, min_stem_len: int = 3) -> List[str]:
|
| 86 |
+
"""
|
| 87 |
+
Try to recover original stems from Sandhi-modified surface forms.
|
| 88 |
+
Returns list of possible original forms, ordered by likelihood.
|
| 89 |
+
"""
|
| 90 |
+
candidates = [surface] # Original form is always a candidate
|
| 91 |
+
|
| 92 |
+
# TRANSLITERATION NORMALIZATION (lowercase digraph → SLP1 single char)
|
| 93 |
+
# This handles: bh→B, dh→D, gh→G, ph→P, th→T, kh→K, ch→C, jh→J
|
| 94 |
+
TRANSLIT_MAP = [
|
| 95 |
+
('bh', 'B'), ('dh', 'D'), ('gh', 'G'), ('ph', 'P'),
|
| 96 |
+
('th', 'T'), ('kh', 'K'), ('ch', 'C'), ('jh', 'J'),
|
| 97 |
+
('Th', 'W'), ('Dh', 'Q'), # Retroflex aspirates
|
| 98 |
+
]
|
| 99 |
+
normalized = surface
|
| 100 |
+
for digraph, single in TRANSLIT_MAP:
|
| 101 |
+
normalized = normalized.replace(digraph, single)
|
| 102 |
+
if normalized != surface:
|
| 103 |
+
candidates.append(normalized)
|
| 104 |
+
|
| 105 |
+
# Try consonant Sandhi at word boundary (last char)
|
| 106 |
+
for form in [surface, normalized]:
|
| 107 |
+
if len(form) >= min_stem_len and form[-1] in self.SANDHI_REVERSIONS:
|
| 108 |
+
for original in self.SANDHI_REVERSIONS[form[-1]]:
|
| 109 |
+
candidate = form[:-1] + original
|
| 110 |
+
if candidate not in candidates:
|
| 111 |
+
candidates.append(candidate)
|
| 112 |
+
|
| 113 |
+
# Try internal Sandhi (for compound-internal changes)
|
| 114 |
+
# e.g., buddhy -> buddhi (y often represents elided i)
|
| 115 |
+
for form in [surface, normalized]:
|
| 116 |
+
if form.endswith('y') and len(form) >= min_stem_len:
|
| 117 |
+
candidates.append(form[:-1] + 'i') # Try y -> i
|
| 118 |
+
if form.endswith('v') and len(form) >= min_stem_len:
|
| 119 |
+
candidates.append(form[:-1] + 'u') # Try v -> u
|
| 120 |
+
|
| 121 |
+
# Remove duplicates while preserving order
|
| 122 |
+
seen = set()
|
| 123 |
+
unique = []
|
| 124 |
+
for c in candidates:
|
| 125 |
+
if c not in seen:
|
| 126 |
+
seen.add(c)
|
| 127 |
+
unique.append(c)
|
| 128 |
+
|
| 129 |
+
return unique
|
| 130 |
+
|
| 131 |
+
def _is_valid_stem(self, surface: str) -> bool:
    """
    Decide whether *surface* can be accepted as a valid stem.

    Acceptance paths, tried in order:
    1. Direct Kosha lookup of any Sandhi-reversal candidate
    2. Long→short final-vowel adjustment (A→a, I→i, U→u) on a candidate
    3. Pratyaya (grammatical suffix) stripping — Panini's kRt/taddhita
       system — followed by Kosha lookup of the remaining root
    """
    if len(surface) < 2:
        return False

    long_to_short = (('A', 'a'), ('I', 'i'), ('U', 'u'))

    # Paths 1 & 2: every Sandhi-reversal candidate, with vowel adjustment.
    for cand in self._try_sandhi_reversal(surface):
        if self.analyzer._in_kosha(cand):
            return True
        for long_v, short_v in long_to_short:
            if cand.endswith(long_v) and self.analyzer._in_kosha(cand[:-1] + short_v):
                return True

    # Path 3: strip a known pratyaya and validate the remaining root.
    # Each entry is (suffix, minimum root length after stripping).
    pratyayas = (
        ('ana', 3),  # lyuT: action noun (karaNa from kR)
        ('Ana', 3),  # śānac: present participle
        ('tva', 3),  # tva: abstract noun (devatva from deva)
        ('tA', 2),   # tal: abstract noun (sundaratA)
        ('ya', 2),   # yat: fitness/gerundive
        ('ta', 2),   # kta: past participle
        ('ti', 2),   # ktin: action noun
        ('in', 2),   # ṇini: possessor
        ('ika', 3),  # ṭhak: related to
        ('Iya', 3),  # cha: related to
    )
    for suffix, min_root in pratyayas:
        if not surface.endswith(suffix):
            continue
        if len(surface) <= len(suffix) + min_root:
            continue
        root = surface[:-len(suffix)]
        # Root as-is, then via Sandhi reversal.
        if self.analyzer._in_kosha(root):
            return True
        if any(self.analyzer._in_kosha(r) for r in self._try_sandhi_reversal(root)):
            return True

    return False
|
| 181 |
+
|
| 182 |
+
def _count_kosha_heads(self, surface: str, min_head_len: int = 5) -> int:
    """
    FIX 2: Count how many valid kosha stems ("heads") exist inside a
    long string. Used to detect mega-tokens that swallowed multiple
    stems and therefore must be split.
    """
    whole_is_stem = 1 if self._is_valid_stem(surface) else 0

    # Too short to hold two heads: the answer is just the whole-word check.
    if len(surface) < min_head_len * 2:
        return whole_is_stem

    heads = 0
    pos = 0
    scan_limit = len(surface) - min_head_len + 1
    while pos < scan_limit:
        matched_end = None
        # Greedy: try the longest candidate (up to 15 chars) starting at pos.
        for end in range(min(len(surface), pos + 15), pos + min_head_len - 1, -1):
            piece = surface[pos:end]
            if len(piece) >= min_head_len and self._is_valid_stem(piece):
                matched_end = end
                break
        if matched_end is None:
            pos += 1  # no head here; slide one character forward
        else:
            heads += 1
            pos = matched_end  # skip past the matched head
    # A word that is itself a stem counts as at least one head.
    return max(heads, whole_is_stem)
|
| 203 |
+
|
| 204 |
+
def _is_krdanta(self, surface: str) -> bool:
    """
    FIX 3: Recognize kṛdanta (verbal derivative) forms so they can be
    kept as atomic units instead of being split further.

    A form counts as a kṛdanta when it ends in one of the participial /
    agentive suffixes below AND the remaining root, after Sandhi
    reversal, is found in the Kosha.
    """
    # (suffix, minimum root length) pairs; longer/more specific first.
    krdanta_suffixes = (
        ('mAna', 4),   # Present participle (ātmanepada)
        ('Ana', 3),    # Present participle
        ('tavat', 5),  # Past active participle
        ('ta', 2),     # Past passive participle (kta)
        ('in', 2),     # Agent noun (ṇini)
        ('aka', 3),    # Agent noun (ṇvul)
        ('tR', 2),     # Agent noun (tṛc)
    )

    for suffix, min_root in krdanta_suffixes:
        if not (surface.endswith(suffix) and len(surface) > len(suffix) + min_root):
            continue
        root = surface[:-len(suffix)]
        # A plausible verbal root should survive Sandhi reversal into the Kosha.
        if any(self.analyzer._in_kosha(c) for c in self._try_sandhi_reversal(root)):
            return True
    return False
|
| 232 |
+
|
| 233 |
+
def _recursive_split(self, word: str, memo: Optional[dict] = None) -> List[str]:
    """
    Recursively split a compound into maximal valid components.

    IMPROVED ALGORITHM with three fixes:
    1. FIX 1: Derivational spine continuation - keep collapsing if stem+suffix both valid
    2. FIX 2: Multi-head splitting - if token has multiple kosha heads, force split
    3. FIX 3: Kṛdanta recognition - keep participles as atomic units

    Uses memoization (the shared ``memo`` dict, keyed by substring) to
    avoid exponential blowup on repeated right-hand remainders.

    Returns the chosen list of components; ``[word]`` when no split wins.
    """
    if memo is None:
        memo = {}

    if word in memo:
        return memo[word]

    # FIX 3: If it's a recognized kṛdanta (verbal derivative), keep it atomic
    if self._is_krdanta(word) and self._is_valid_stem(word):
        memo[word] = [word]
        return [word]

    # FIX 2: Force split if token is long and contains multiple kosha heads
    MAX_TOKEN_LEN = 15  # Tokens longer than this that have multiple heads must split
    if len(word) > MAX_TOKEN_LEN:
        head_count = self._count_kosha_heads(word)
        if head_count > 1:
            # Don't return early - we MUST try to split this
            pass  # Continue to splitting logic
        else:
            # Single head or no heads - if valid, keep it
            if self._is_valid_stem(word):
                memo[word] = [word]
                return [word]
    else:
        # Base case: if word itself is valid AND not too long, return it
        if self._is_valid_stem(word):
            memo[word] = [word]
            return [word]

    # Base case: too short to split
    if len(word) < 4:
        memo[word] = [word]
        return [word]

    best_parse = [word]  # Default: no split
    best_score = -1000  # Start negative to ensure any valid split wins

    min_len = 3  # Minimum 3 chars to prevent rA, nA splits

    # Try all split points
    for i in range(min_len, len(word) - min_len + 1):
        left = word[:i]
        right = word[i:]

        # Check if left is valid (with Sandhi reversal)
        if self._is_valid_stem(left):
            # FIX 1: Derivational spine continuation
            # If left is a valid stem, check if left+next_suffix also forms a valid stem
            # This prevents over-splitting inside known words like bhAvanA
            spine_continued = False
            for ext_len in range(3, min(len(right) + 1, 8)):  # Try extending by 3-7 chars
                extended = left + right[:ext_len]
                if self._is_valid_stem(extended):
                    # The spine continues! Don't split here, try a longer left
                    spine_continued = True
                    break

            # Only split if spine doesn't continue OR if we're at a very long boundary
            if spine_continued and len(left) < 10:
                continue  # Skip this split point, try longer

            # Recursively split the right side (memo is shared down the recursion)
            right_parse = self._recursive_split(right, memo)

            # Count valid components in this parse
            full_parse = [left] + right_parse
            valid_count = sum(1 for comp in full_parse if self._is_valid_stem(comp))

            # IMPROVED SCORING:
            # 1. Reward valid components heavily
            # 2. PENALIZE many components (prefer fewer, longer splits)
            # 3. PENALIZE short components (< 5 chars)
            # 4. REWARD if components are known kosha stems (not just valid via suffix)
            num_components = len(full_parse)
            avg_len = sum(len(c) for c in full_parse) / num_components
            short_penalty = sum(1 for c in full_parse if len(c) < 5)

            # Bonus for components that are DIRECTLY in kosha (not via suffix stripping)
            direct_kosha_bonus = sum(10 for c in full_parse
                                     if self.analyzer._in_kosha(c) or
                                     any(self.analyzer._in_kosha(x) for x in self._try_sandhi_reversal(c)))

            # Score formula: favor valid + long + few components + direct kosha
            score = (valid_count * 100  # Valid components matter most
                     - num_components * 15  # Penalize many splits (reduced from 20)
                     + avg_len * 5  # Reward longer components
                     - short_penalty * 40  # Penalize short fragments (reduced from 50)
                     + direct_kosha_bonus)  # Bonus for direct kosha stems

            if score > best_score:
                best_score = score
                best_parse = full_parse

    memo[word] = best_parse
    return best_parse
|
| 339 |
+
|
| 340 |
+
def _longest_left_split(self, word: str) -> Optional[Tuple[str, str]]:
    """
    Find the longest valid left stem greedily WITH SANDHI REVERSAL.

    For unknown prefixes, tries consonant/vowel Sandhi reversions:
    - vidyud -> vidyut (d -> t before vowel)
    - buddhy -> buddhi (y -> i for elided vowel)

    Returns a ``(left, right)`` pair for the longest left stem whose
    remainder can also be validated, or ``None`` when no boundary works.
    """
    min_len = 3  # Minimum valid stem length

    # Scan from longest left to shortest (greedy: first hit wins)
    for i in range(len(word) - min_len, min_len - 1, -1):
        left = word[:i]
        right = word[i:]

        # Try ALL Sandhi reversal candidates for left
        left_valid = False
        left_candidates = self._try_sandhi_reversal(left)
        for candidate in left_candidates:
            if self.analyzer._in_kosha(candidate):
                left_valid = True
                break
            # Also try with long→short final-vowel adjustments
            if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                left_valid = True
                break
            if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
                left_valid = True
                break
            if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                left_valid = True
                break

        if left_valid and len(right) >= min_len:
            # Check if right is valid using Sandhi reversal
            right_valid = False
            right_candidates = self._try_sandhi_reversal(right)
            for candidate in right_candidates:
                if self.analyzer._in_kosha(candidate):
                    right_valid = True
                    break
                # Try with vowel adjustments
                if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                    right_valid = True
                    break

            # Try lookahead on right (for compound remainders: right may itself
            # be a compound whose first member is a known stem)
            if not right_valid:
                for j in range(min_len, min(len(right), 15)):
                    prefix = right[:j]
                    # Try all Sandhi reversals on the prefix
                    prefix_candidates = self._try_sandhi_reversal(prefix)
                    for candidate in prefix_candidates:
                        if self.analyzer._in_kosha(candidate):
                            right_valid = True
                            break
                        if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                            right_valid = True
                            break
                    if right_valid:
                        break

            # Sandhi restoration: if left ended with long vowel, the vowel may
            # have absorbed the initial vowel of right; try re-prefixing it
            if not right_valid and left.endswith('A') and right[0] not in 'aAiIuUeEoO':
                restored = 'A' + right
                restored_candidates = self._try_sandhi_reversal(restored)
                for candidate in restored_candidates:
                    if self.analyzer._in_kosha(candidate):
                        right_valid = True
                        break
                if not right_valid:
                    # Lookahead on the restored form as a last resort
                    for j in range(min_len, min(len(restored), 12)):
                        if self.analyzer._in_kosha(restored[:j]):
                            right_valid = True
                            break

            if right_valid:
                return (left, right)

    return None
|
| 420 |
+
|
| 421 |
+
def _find_split_candidates(self, word: str) -> List[int]:
    """
    Find potential split points based on stem cache validation.

    Scans every boundary position ``i`` and records it when BOTH halves
    can be validated against the Kosha — directly, via vowel
    addition/normalization, ending stripping, Sandhi vowel restoration,
    recursive lookahead, or known compound initials/finals.

    Returns:
        A list of split indices (character positions inside *word*).
    """
    candidates = []
    min_component = 2  # Minimum component length
    # PERF: hoist the attribute lookup out of the hot per-position loop.
    in_kosha = self.analyzer._in_kosha

    # Endings to strip when validating the right half
    ENDINGS = ["M", "H", "aM", "am", "aH", "At", "ena", "Aya", "asya",
               "e", "O", "AnAm", "A", "I", "U", "AN", "An", "i"]
    # PERF FIX: sort once here — previously re-sorted on every loop
    # iteration. Longest-first so e.g. "AnAm" is tried before "A".
    endings_by_len = sorted(ENDINGS, key=len, reverse=True)

    for i in range(min_component, len(word) - min_component + 1):
        left = word[:i]
        right = word[i:]

        # Check left side (try as-is, then with vowel additions/normalization)
        left_valid = in_kosha(left)
        if not left_valid:
            for suffix in ["a", "A", "i", "I", "u", "U"]:
                if in_kosha(left + suffix):
                    left_valid = True
                    break
        # Sandhi reversal: if left ends with long vowel, try normalizing
        if not left_valid and left.endswith('A'):
            if in_kosha(left[:-1] + 'a'):
                left_valid = True
        if not left_valid and left.endswith('I'):
            if in_kosha(left[:-1] + 'i'):
                left_valid = True
        if not left_valid and left.endswith('U'):
            if in_kosha(left[:-1] + 'u'):
                left_valid = True

        # Check right side (try as-is, strip endings, add vowels)
        right_valid = in_kosha(right)
        if not right_valid:
            # Try stripping endings (longest first)
            for ending in endings_by_len:
                if right.endswith(ending) and len(right) > len(ending) + 1:
                    stripped = right[:-len(ending)]
                    if in_kosha(stripped):
                        right_valid = True
                        break
                    # Also try with vowel additions
                    for suffix in ["a", "A"]:
                        if in_kosha(stripped + suffix):
                            right_valid = True
                            break
                    if right_valid:
                        break

        if not right_valid:
            # Try vowel additions
            for suffix in ["a", "A", "i", "I"]:
                if in_kosha(right + suffix):
                    right_valid = True
                    break

        # Sandhi reversal for right side: if left ends with long vowel,
        # the vowel may have absorbed initial vowel of right.
        # Try restoring: AtmA|bhAsa -> check A+bhAsa = AbhAsa
        if not right_valid and len(right) > 2:
            # Check if left ends with long vowel that could have eaten something
            if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
                # Right starts with consonant - maybe initial A was eaten
                restored = 'A' + right
                if in_kosha(restored):
                    right_valid = True
                elif len(restored) > 3:
                    # Try lookahead on restored
                    for j in range(3, min(len(restored), 12)):
                        if in_kosha(restored[:j]):
                            right_valid = True
                            break
            elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
                restored = 'I' + right
                if in_kosha(restored):
                    right_valid = True
            elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
                restored = 'U' + right
                if in_kosha(restored):
                    right_valid = True

        # Also check if right itself starts a sub-compound (Recursive Lookahead)
        if not right_valid and len(right) > 3:
            # Try to find ANY valid item at start of right
            # Check prefixes of length 3 to 14
            for j in range(3, min(len(right), 15)):
                prefix = right[:j]
                if in_kosha(prefix):
                    right_valid = True
                    break
                # Sandhi normalization: if prefix ends with long vowel, try short
                # AtmA -> Atma, prAtI -> prAti, etc.
                if prefix.endswith('A'):
                    normalized = prefix[:-1] + 'a'
                    if in_kosha(normalized):
                        right_valid = True
                        break
                elif prefix.endswith('I'):
                    normalized = prefix[:-1] + 'i'
                    if in_kosha(normalized):
                        right_valid = True
                        break
                elif prefix.endswith('U'):
                    normalized = prefix[:-1] + 'u'
                    if in_kosha(normalized):
                        right_valid = True
                        break

        # If still not found, check known initials
        if not right_valid:
            for initial in self.COMPOUND_INITIALS + list(self.COMPOUND_FINALS):
                if right.startswith(initial) and len(initial) >= 2:
                    right_valid = True
                    break

        # DEBUG
        # if "sopAdhika" in word:
        #     print(f"Check {left} | {right} -> L:{left_valid} R:{right_valid}")

        if left_valid and right_valid:
            candidates.append(i)

    return candidates
|
| 544 |
+
|
| 545 |
+
def _score_split(self, left: str, right: str) -> float:
    """
    Score a potential split point (lower is better).

    Critically tuned to avoid over-segmentation such as
    'padma' -> 'pad' + 'ma': very short fragments are penalized, long
    left stems are rewarded (greedy match), and components missing from
    the Kosha are penalized heavily.
    """
    in_kosha = self.analyzer._in_kosha
    score = 0.0

    # PENALIZE SHORT COMPONENTS
    # 1-2 chars -> heavy penalty (prevent 'ma', 'ka', 'sa');
    # exactly 3 -> slight penalty (allow 'hfd', 'gam', 'vid' but prefer longer).
    for piece in (left, right):
        if len(piece) < 3:
            score += 5.0
        elif len(piece) == 3:
            score += 1.0

    # PREFER LONGER LEFT COMPONENT (greedy match): reward taking a
    # bigger bite from the left so valid long stems overwhelm false matches.
    score -= len(left) * 1.0

    # Mild secondary preference for balanced splits.
    score += abs(len(left) - len(right)) * 0.02

    # --- Strict Kosha validation of the left component ---
    left_valid = in_kosha(left)
    # Long→short final-vowel normalization (Sandhi).
    if not left_valid and left.endswith('A'):
        left_valid = in_kosha(left[:-1] + 'a')
    if not left_valid and left.endswith('I'):
        left_valid = in_kosha(left[:-1] + 'i')
    if not left_valid and left.endswith('U'):
        left_valid = in_kosha(left[:-1] + 'u')

    # --- Kosha validation of the right component ---
    right_valid = in_kosha(right)

    # Recursive lookahead: if some prefix of right is a known stem,
    # treat right as valid (it may itself begin a sub-compound).
    if not right_valid and len(right) > 3:
        for j in range(3, min(len(right), 15)):
            prefix = right[:j]
            if in_kosha(prefix):
                right_valid = True
                break
            # Long-vowel normalization on the prefix (AtmA -> Atma, ...).
            # A string ends with at most one of these, so plain ifs suffice.
            if prefix.endswith('A') and in_kosha(prefix[:-1] + 'a'):
                right_valid = True
                break
            if prefix.endswith('I') and in_kosha(prefix[:-1] + 'i'):
                right_valid = True
                break
            if prefix.endswith('U') and in_kosha(prefix[:-1] + 'u'):
                right_valid = True
                break

    # Sandhi vowel restoration: a long final vowel on the left may have
    # absorbed the initial vowel of the right member — try re-prefixing it.
    if not right_valid and len(right) > 2:
        if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
            restored = 'A' + right
            if in_kosha(restored):
                right_valid = True
            elif len(restored) > 3:
                for j in range(3, min(len(restored), 12)):
                    if in_kosha(restored[:j]):
                        right_valid = True
                        break
        elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
            if in_kosha('I' + right):
                right_valid = True
        elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
            if in_kosha('U' + right):
                right_valid = True

    # Components absent from the Kosha are heavily penalized.
    if not left_valid:
        score += 10.0
    if not right_valid:
        score += 10.0

    # Bonus for known compound-final / compound-initial patterns.
    if any(right.startswith(final) or right == final
           for final in self.COMPOUND_FINALS):
        score -= 2.0  # Stronger bonus
    if any(left == initial or left.startswith(initial)
           for initial in self.COMPOUND_INITIALS):
        score -= 2.0  # Stronger bonus

    return score
|
| 651 |
+
|
| 652 |
+
def split(self, word: str, max_components: int = 4) -> CompoundSplit:
    """
    Split a compound word into its components.

    Uses the recursive, Kosha-validated splitting algorithm and returns
    the word unsplit when it is too short, already a known (lexicalized)
    stem, or no valid decomposition is found.

    NOTE(review): ``max_components`` is accepted for API compatibility
    but is not currently enforced by the recursive splitter.
    """
    def _unsplit() -> CompoundSplit:
        # Single-token result: the word is treated as non-compound.
        return CompoundSplit(
            surface=word, components=[word],
            split_points=[], is_compound=False, compound_type=None
        )

    # Too short to be a samāsa.
    if len(word) < 4:
        return _unsplit()

    # KEY FIX: lexicalized words (already in the Kosha) are never broken
    # up — this protects forms like 'paramAtma', 'kzetrajYa', 'sopAdhika'.
    if self.analyzer._in_kosha(word):
        return _unsplit()

    # RECURSIVE COMPOSITIONAL algorithm: tries all split points,
    # recursively parses right sides, and keeps the parse with the most
    # valid components.
    components = self._recursive_split(word)
    if len(components) <= 1:
        return _unsplit()

    # Derive the character offsets of boundaries between components.
    split_points = []
    offset = 0
    for part in components[:-1]:
        offset += len(part)
        split_points.append(offset)

    return CompoundSplit(
        surface=word, components=components,
        split_points=split_points, is_compound=True,
        compound_type=None  # We don't classify samāsa types
    )
|
| 697 |
+
|
| 698 |
+
def split_multiple(self, words: List[str]) -> List[CompoundSplit]:
    """Split every word in *words*, preserving input order."""
    return list(map(self.split, words))
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
# --- TEST ---
# Smoke test: exercise the splitter on a handful of SLP1-encoded compounds
# and print how (or whether) each one is decomposed.
if __name__ == "__main__":
    print("Testing SamasaSplitter...")
    splitter = SamasaSplitter()

    # SLP1 transliteration. Includes lexicalized forms (e.g. 'paramAtma',
    # 'sopAdhika') that should NOT be split when present in the Kosha.
    test_compounds = [
        "hfdpadma",
        "paramAtma",
        "mahArAja",
        "devadatta",
        "rAjakumAra",
        "sopAdhika",
    ]

    for word in test_compounds:
        result = splitter.split(word)
        if result.is_compound:
            print(f" {word:20} → {' + '.join(result.components)}")
        else:
            print(f" {word:20} → (not split)")
|
src/tokenizer.py
ADDED
|
@@ -0,0 +1,509 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Panini Tokenizer V3 - Morphology-Aware Sanskrit Tokenizer
|
| 3 |
+
HuggingFace PreTrainedTokenizer compatible.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from typing import Dict, List, Optional, Tuple, Union
|
| 9 |
+
from collections import OrderedDict
|
| 10 |
+
|
| 11 |
+
# HuggingFace imports
|
| 12 |
+
try:
|
| 13 |
+
from transformers import PreTrainedTokenizer
|
| 14 |
+
from transformers.tokenization_utils_base import AddedToken
|
| 15 |
+
HAS_TRANSFORMERS = True
|
| 16 |
+
except ImportError:
|
| 17 |
+
HAS_TRANSFORMERS = False
|
| 18 |
+
PreTrainedTokenizer = object # Fallback
|
| 19 |
+
|
| 20 |
+
from .analyzer import VidyutAnalyzer, MorphParse
|
| 21 |
+
from .splitter import SamasaSplitter, CompoundSplit
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class PaniniTokenizerV3(PreTrainedTokenizer if HAS_TRANSFORMERS else object):
    """
    Morphology-aware Sanskrit tokenizer using Vidyut.

    Falls back to a plain ``object`` base when the `transformers`
    package is not installed, so the core pipeline stays importable.

    Pipeline:
    1. Vidyut analysis → extract morphological structure
    2. Compound splitting → split at samāsa boundaries
    3. Vibhakti separation → separate inflection from stem
    4. Dynamic vocab → Kosha-backed vocabulary
    """

    # Special tokens / HuggingFace tokenizer class attributes:
    # file name(s) the tokenizer saves/loads its vocabulary under.
    vocab_files_names = {"vocab_file": "vocab.json"}
    # Model inputs this tokenizer produces (standard encoder inputs).
    model_input_names = ["input_ids", "attention_mask"]
|
| 38 |
+
|
| 39 |
+
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: str = "<unk>",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        pad_token: str = "<pad>",
        sep_token: str = "<sep>",
        cls_token: str = "<cls>",
        mask_token: str = "<mask>",
        add_prefix_space: bool = True,
        freeze_vocab: bool = False,
        **kwargs
    ):
        """
        Create the tokenizer.

        Args:
            vocab_file: Path to a ``vocab.json``. When missing or absent on
                disk, an initial vocabulary of special tokens plus common
                morphemes is built instead.
            unk_token .. mask_token: Special-token strings, forwarded to the
                HuggingFace base class when `transformers` is available.
            add_prefix_space: Whether tokens carry a leading "▁" space marker.
            freeze_vocab: When True, prevents vocab growth (vocab explosion)
                during training.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        # Initialize special tokens
        self.add_prefix_space = add_prefix_space
        self.freeze_vocab = freeze_vocab  # Prevent vocab explosion during training

        # Core components: morphological analyzer + compound splitter
        self.analyzer = VidyutAnalyzer(preload_cache=True)
        self.splitter = SamasaSplitter(self.analyzer)

        # Vocabulary: token -> id and its inverse
        self._vocab: Dict[str, int] = {}
        self._id_to_token: Dict[int, str] = {}

        # Load or build vocab BEFORE super().__init__ — the HF base class
        # may query the vocabulary during its own initialization.
        if vocab_file and os.path.exists(vocab_file):
            self._load_vocab(vocab_file)
        else:
            self._build_initial_vocab()

        # Call parent init if using transformers; otherwise the special-token
        # arguments are intentionally not stored (bare-object fallback).
        if HAS_TRANSFORMERS:
            super().__init__(
                unk_token=unk_token,
                bos_token=bos_token,
                eos_token=eos_token,
                pad_token=pad_token,
                sep_token=sep_token,
                cls_token=cls_token,
                mask_token=mask_token,
                add_prefix_space=add_prefix_space,
                **kwargs
            )
|
| 84 |
+
|
| 85 |
+
def _build_initial_vocab(self):
|
| 86 |
+
"""Build initial vocabulary with special tokens and common morphemes."""
|
| 87 |
+
# Special tokens first (IDs 0-7)
|
| 88 |
+
special = ["<unk>", "<s>", "</s>", "<pad>", "<sep>", "<cls>", "<mask>", "▁"]
|
| 89 |
+
for i, tok in enumerate(special):
|
| 90 |
+
self._vocab[tok] = i
|
| 91 |
+
self._id_to_token[i] = tok
|
| 92 |
+
|
| 93 |
+
# Common vibhakti endings
|
| 94 |
+
vibhaktis = [
|
| 95 |
+
"H", "m", "am", "At", "Aya", "asya", "e", "O", "ayoH",
|
| 96 |
+
"AH", "An", "eByo", "EH", "ezu", "ena", "ABym",
|
| 97 |
+
"A", "AyAH", "AyAm", "ayA", "Ani", "AnAm",
|
| 98 |
+
"sya", "ya", "aH", "iH", "uH",
|
| 99 |
+
]
|
| 100 |
+
|
| 101 |
+
# Common pratyayas
|
| 102 |
+
pratyayas = [
|
| 103 |
+
"tvA", "ya", "ta", "tavat", "at", "Ana", "tum",
|
| 104 |
+
"ti", "ana", "aka", "in", "tf", "tva", "tA",
|
| 105 |
+
"maya", "vat", "mat", "ika", "Iya",
|
| 106 |
+
]
|
| 107 |
+
|
| 108 |
+
# Common upasargas
|
| 109 |
+
upasargas = [
|
| 110 |
+
"pra", "parA", "apa", "sam", "anu", "ava", "nis", "nir",
|
| 111 |
+
"vi", "A", "ni", "aDi", "api", "ati", "su", "ut", "ud",
|
| 112 |
+
"aBi", "prati", "pari", "upa", "dur", "dus",
|
| 113 |
+
]
|
| 114 |
+
|
| 115 |
+
# Add morphemes to vocab
|
| 116 |
+
next_id = len(self._vocab)
|
| 117 |
+
for morpheme_list in [vibhaktis, pratyayas, upasargas]:
|
| 118 |
+
for m in morpheme_list:
|
| 119 |
+
if m not in self._vocab:
|
| 120 |
+
self._vocab[m] = next_id
|
| 121 |
+
self._id_to_token[next_id] = m
|
| 122 |
+
next_id += 1
|
| 123 |
+
# Also add with space prefix
|
| 124 |
+
spaced = "▁" + m
|
| 125 |
+
if spaced not in self._vocab:
|
| 126 |
+
self._vocab[spaced] = next_id
|
| 127 |
+
self._id_to_token[next_id] = spaced
|
| 128 |
+
next_id += 1
|
| 129 |
+
|
| 130 |
+
print(f" PaniniTokenizerV3: Initial vocab size = {len(self._vocab)}")
|
| 131 |
+
|
| 132 |
+
def _load_vocab(self, vocab_file: str):
|
| 133 |
+
"""Load vocabulary from JSON file."""
|
| 134 |
+
with open(vocab_file, "r", encoding="utf-8") as f:
|
| 135 |
+
self._vocab = json.load(f)
|
| 136 |
+
self._id_to_token = {v: k for k, v in self._vocab.items()}
|
| 137 |
+
print(f" PaniniTokenizerV3: Loaded vocab size = {len(self._vocab)}")
|
| 138 |
+
|
| 139 |
+
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
| 140 |
+
"""Save vocabulary to directory."""
|
| 141 |
+
if not os.path.isdir(save_directory):
|
| 142 |
+
os.makedirs(save_directory, exist_ok=True)
|
| 143 |
+
|
| 144 |
+
vocab_file = os.path.join(
|
| 145 |
+
save_directory,
|
| 146 |
+
(filename_prefix + "-" if filename_prefix else "") + "vocab.json"
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
with open(vocab_file, "w", encoding="utf-8") as f:
|
| 150 |
+
json.dump(self._vocab, f, ensure_ascii=False, indent=2)
|
| 151 |
+
|
| 152 |
+
return (vocab_file,)
|
| 153 |
+
|
| 154 |
+
def save_pretrained(self, save_directory: str, **kwargs):
|
| 155 |
+
"""
|
| 156 |
+
Save the tokenizer to a directory (HuggingFace compatible).
|
| 157 |
+
Creates: vocab.json, tokenizer_config.json, special_tokens_map.json
|
| 158 |
+
"""
|
| 159 |
+
os.makedirs(save_directory, exist_ok=True)
|
| 160 |
+
|
| 161 |
+
# 1. Save vocabulary
|
| 162 |
+
vocab_file = os.path.join(save_directory, "vocab.json")
|
| 163 |
+
with open(vocab_file, "w", encoding="utf-8") as f:
|
| 164 |
+
json.dump(self._vocab, f, ensure_ascii=False, indent=2)
|
| 165 |
+
|
| 166 |
+
# 2. Save tokenizer config
|
| 167 |
+
config = {
|
| 168 |
+
"tokenizer_class": "PaniniTokenizerV3",
|
| 169 |
+
"vocab_size": len(self._vocab),
|
| 170 |
+
"unk_token": "<unk>",
|
| 171 |
+
"bos_token": "<s>",
|
| 172 |
+
"eos_token": "</s>",
|
| 173 |
+
"pad_token": "<pad>",
|
| 174 |
+
"sep_token": "<sep>",
|
| 175 |
+
"cls_token": "<cls>",
|
| 176 |
+
"mask_token": "<mask>",
|
| 177 |
+
"add_prefix_space": self.add_prefix_space,
|
| 178 |
+
"freeze_vocab": self.freeze_vocab,
|
| 179 |
+
}
|
| 180 |
+
config_file = os.path.join(save_directory, "tokenizer_config.json")
|
| 181 |
+
with open(config_file, "w", encoding="utf-8") as f:
|
| 182 |
+
json.dump(config, f, ensure_ascii=False, indent=2)
|
| 183 |
+
|
| 184 |
+
# 3. Save special tokens map
|
| 185 |
+
special_tokens = {
|
| 186 |
+
"unk_token": "<unk>",
|
| 187 |
+
"bos_token": "<s>",
|
| 188 |
+
"eos_token": "</s>",
|
| 189 |
+
"pad_token": "<pad>",
|
| 190 |
+
"sep_token": "<sep>",
|
| 191 |
+
"cls_token": "<cls>",
|
| 192 |
+
"mask_token": "<mask>",
|
| 193 |
+
}
|
| 194 |
+
special_file = os.path.join(save_directory, "special_tokens_map.json")
|
| 195 |
+
with open(special_file, "w", encoding="utf-8") as f:
|
| 196 |
+
json.dump(special_tokens, f, ensure_ascii=False, indent=2)
|
| 197 |
+
|
| 198 |
+
print(f"✅ Saved PaniniTokenizerV3 to {save_directory}/")
|
| 199 |
+
print(f" vocab.json: {len(self._vocab)} tokens")
|
| 200 |
+
return save_directory
|
| 201 |
+
|
| 202 |
+
@classmethod
|
| 203 |
+
def from_pretrained(cls, pretrained_path: str, **kwargs):
|
| 204 |
+
"""
|
| 205 |
+
Load a tokenizer from a directory (HuggingFace compatible).
|
| 206 |
+
"""
|
| 207 |
+
vocab_file = os.path.join(pretrained_path, "vocab.json")
|
| 208 |
+
config_file = os.path.join(pretrained_path, "tokenizer_config.json")
|
| 209 |
+
|
| 210 |
+
# Load config if exists
|
| 211 |
+
config = {}
|
| 212 |
+
if os.path.exists(config_file):
|
| 213 |
+
with open(config_file, "r", encoding="utf-8") as f:
|
| 214 |
+
config = json.load(f)
|
| 215 |
+
|
| 216 |
+
# Create tokenizer
|
| 217 |
+
tokenizer = cls(
|
| 218 |
+
vocab_file=vocab_file,
|
| 219 |
+
freeze_vocab=config.get("freeze_vocab", True),
|
| 220 |
+
add_prefix_space=config.get("add_prefix_space", True),
|
| 221 |
+
**kwargs
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
print(f"✅ Loaded PaniniTokenizerV3 from {pretrained_path}/")
|
| 225 |
+
print(f" vocab.json: {len(tokenizer._vocab)} tokens")
|
| 226 |
+
return tokenizer
|
| 227 |
+
|
| 228 |
+
@property
|
| 229 |
+
def vocab_size(self) -> int:
|
| 230 |
+
return len(self._vocab)
|
| 231 |
+
|
| 232 |
+
def get_vocab(self) -> Dict[str, int]:
|
| 233 |
+
return dict(self._vocab)
|
| 234 |
+
|
| 235 |
+
def _add_to_vocab(self, token: str) -> int:
|
| 236 |
+
"""Dynamically add a token to vocabulary."""
|
| 237 |
+
if token in self._vocab:
|
| 238 |
+
return self._vocab[token]
|
| 239 |
+
|
| 240 |
+
new_id = len(self._vocab)
|
| 241 |
+
self._vocab[token] = new_id
|
| 242 |
+
self._id_to_token[new_id] = token
|
| 243 |
+
return new_id
|
| 244 |
+
|
| 245 |
+
def _convert_token_to_id(self, token: str) -> int:
|
| 246 |
+
"""Convert token to ID, adding to vocab if needed (dynamic vocab)."""
|
| 247 |
+
if token in self._vocab:
|
| 248 |
+
return self._vocab[token]
|
| 249 |
+
|
| 250 |
+
# Freeze mode: return unk_id for unknown tokens (prevents vocab explosion)
|
| 251 |
+
if self.freeze_vocab:
|
| 252 |
+
return self._vocab.get("<unk>", 0)
|
| 253 |
+
|
| 254 |
+
# Dynamic vocab: add new tokens
|
| 255 |
+
return self._add_to_vocab(token)
|
| 256 |
+
|
| 257 |
+
def _convert_id_to_token(self, index: int) -> str:
|
| 258 |
+
"""Convert ID to token."""
|
| 259 |
+
return self._id_to_token.get(index, self.unk_token)
|
| 260 |
+
|
| 261 |
+
    def _tokenize_word(self, word: str) -> List[str]:
        """
        Tokenize a single word using morphological analysis.

        New Grammar-Safe Pipeline (Rule A, B, C):
        1. Parse with Vidyut (Collapse spines)
        2. Iterative Samasa Splitting
        3. No SP fallback for valid stems

        Returns a list of SLP1 tokens; the first carries the "▁" word-start
        marker. NOTE(review): semantics of analyzer/splitter helpers
        (_is_verb_form, get_best_parse, _in_kosha, _is_valid_stem, split) are
        defined in sibling modules and assumed here — confirm against
        src/analyzer.py and src/splitter.py.
        """
        # Empty input yields no tokens.
        if not word:
            return []

        # Rule 3: Verbal forms (tiṅanta/kṛdanta) are atomic.
        # If word ends with verbal suffix, emit as single token without splitting.
        if self.analyzer._is_verb_form(word):
            return ["▁" + word]

        # Step 1: Get morphological parse (Derivational Collapse).
        # token_form() is presumably the stem stripped of inflection — verify.
        parse = self.analyzer.get_best_parse(word)
        stem = parse.token_form()

        # Rule A: If stem is valid in Kosha, DO NOT SPLIT further with SP.
        # Step 2: Iterative Samasa Splitting (Rule B) — split the *stem*
        # recursively until a fixed point.

        final_tokens = []

        # Working list of compound components, refined pass by pass.
        current_components = [stem]

        # Helper: merge adjacent tokens that over-split into a known compound.
        def merge_known_compounds(parts):
            """Merge adjacent parts that together form a known compound."""
            merged = []
            i = 0
            while i < len(parts):
                if i + 1 < len(parts):
                    # Try merging with Sandhi normalization.
                    left = parts[i]
                    right = parts[i + 1]
                    # Handle vowel Sandhi: pratyag + AtmA → pratyagAtman.
                    # SLP1 'A' (long ā) at a boundary is shortened to 'a'.
                    if left.endswith('A'):
                        candidate = left[:-1] + 'a' + right  # AtmA → Atma + next
                    else:
                        candidate = left + right

                    # NOTE(review): 'candidates' is built but never consulted —
                    # only 'candidate' is checked below. Dead code; the
                    # long-vowel-retained variant is effectively unused.
                    candidates = [candidate]
                    if left.endswith('A') and not right.startswith(('a', 'A', 'i', 'I', 'u', 'U', 'e', 'E', 'o', 'O')):
                        # Right starts with consonant but might have lost initial vowel.
                        candidates.append(left + 'A' + right)  # pratyagA + bhAsa

                    # Accept the merge only when the dictionary confirms it.
                    if self.analyzer._in_kosha(candidate):
                        merged.append(candidate)
                        i += 2
                        continue

                    # Try with an -an (Atman-type) stem ending instead.
                    atman_candidate = left[:-1] + 'an' if left.endswith('A') else left + 'an'
                    # NOTE(review): 'atman_full' is computed but never used.
                    if right.endswith('A'):
                        atman_full = atman_candidate + right[:-1] + 'a'
                    else:
                        atman_full = atman_candidate
                    # Length guard avoids spurious short -an stems.
                    if len(atman_candidate) > 3 and self.analyzer._in_kosha(atman_candidate):
                        merged.append(atman_candidate)
                        # Still need to process right separately.
                        merged.append(right)
                        i += 2
                        continue
                merged.append(parts[i])
                i += 1
            return merged

        # Iterative split/merge until a fixed point (or the pass budget runs out).
        MAX_PASSES = 6  # Increased for deep compounds
        for _ in range(MAX_PASSES):
            new_components = []
            changed = False

            # Split pass: break each component at samāsa boundaries.
            for comp in current_components:
                split_res = self.splitter.split(comp)
                if split_res.is_compound and len(split_res.components) > 1:
                    new_components.extend(split_res.components)
                    changed = True
                else:
                    # Sandhi restoration retry: if starts with consonant, NO
                    # split found, AND token is NOT valid (an OOV leftover
                    # from a previous split), try prepending 'A' (initial
                    # vowel eaten in Sandhi).
                    # Uses _is_valid_stem (includes pratyaya stripping), not
                    # just _in_kosha, as the guard.
                    if (len(comp) > 3 and
                        comp[0] not in 'aAiIuUeEoO' and
                        not self.splitter._is_valid_stem(comp)):  # Guard: only for truly invalid OOV
                        restored = 'A' + comp
                        restored_res = self.splitter.split(restored)
                        if restored_res.is_compound and len(restored_res.components) > 1:
                            # Map result back: first component keeps A prefix.
                            new_components.extend(restored_res.components)
                            changed = True
                            continue
                    new_components.append(comp)

            # Merge pass: merge adjacent tokens that form known compounds.
            merged_components = merge_known_compounds(new_components)
            if len(merged_components) != len(new_components):
                changed = True

            if not changed:
                break
            current_components = merged_components

        # Emit tokens; only the first component gets the word-start marker.
        for i, comp in enumerate(current_components):
            # Rule A Violation Check: if 'comp' is in Kosha, use it AS IS.
            # Only fall back to char/subword if it's garbage.
            prefix = "▁" if i == 0 else ""

            # NOTE(review): both branches below are currently identical —
            # the OOV branch is a placeholder for a future subword fallback.
            if self.analyzer._in_kosha(comp):
                # Valid stem -> Atomic Token.
                final_tokens.append(prefix + comp)
            else:
                # OOV -> Only then maybe SP (but here we just keep as is for now).
                final_tokens.append(prefix + comp)

        # Append vibhakti if separated (only if not already present on the
        # last token — avoids double-appending the case ending).
        if parse.vibhakti and final_tokens:
            last_token = final_tokens[-1].lstrip('▁')
            # Guard: don't double-append if last token already ends with vibhakti.
            if not last_token.endswith(parse.vibhakti):
                final_tokens.append(parse.vibhakti)

        return final_tokens
def tokenize(self, text: str, **kwargs) -> List[str]:
|
| 402 |
+
"""
|
| 403 |
+
Tokenize text into morphological tokens.
|
| 404 |
+
|
| 405 |
+
This is the main entry point for tokenization.
|
| 406 |
+
"""
|
| 407 |
+
if not text:
|
| 408 |
+
return []
|
| 409 |
+
|
| 410 |
+
# Split on whitespace
|
| 411 |
+
words = text.split()
|
| 412 |
+
|
| 413 |
+
all_tokens = []
|
| 414 |
+
for i, word in enumerate(words):
|
| 415 |
+
word_tokens = self._tokenize_word(word)
|
| 416 |
+
all_tokens.extend(word_tokens)
|
| 417 |
+
|
| 418 |
+
return all_tokens
|
| 419 |
+
|
| 420 |
+
def _encode_impl(self, text: str) -> List[int]:
|
| 421 |
+
"""Internal encode implementation."""
|
| 422 |
+
tokens = self.tokenize(text)
|
| 423 |
+
return [self._convert_token_to_id(t) for t in tokens]
|
| 424 |
+
|
| 425 |
+
def encode(
|
| 426 |
+
self,
|
| 427 |
+
text: Union[str, List[str]],
|
| 428 |
+
add_special_tokens: bool = True,
|
| 429 |
+
**kwargs
|
| 430 |
+
) -> List[int]:
|
| 431 |
+
"""Encode text to token IDs."""
|
| 432 |
+
if isinstance(text, list):
|
| 433 |
+
text = " ".join(text)
|
| 434 |
+
|
| 435 |
+
ids = self._encode_impl(text)
|
| 436 |
+
|
| 437 |
+
if add_special_tokens:
|
| 438 |
+
bos_id = self._vocab.get("<s>", 1)
|
| 439 |
+
eos_id = self._vocab.get("</s>", 2)
|
| 440 |
+
ids = [bos_id] + ids + [eos_id]
|
| 441 |
+
|
| 442 |
+
return ids
|
| 443 |
+
|
| 444 |
+
def decode(
|
| 445 |
+
self,
|
| 446 |
+
token_ids: List[int],
|
| 447 |
+
skip_special_tokens: bool = True,
|
| 448 |
+
**kwargs
|
| 449 |
+
) -> str:
|
| 450 |
+
"""Decode token IDs back to text."""
|
| 451 |
+
special_ids = {0, 1, 2, 3, 4, 5, 6} # Special token IDs
|
| 452 |
+
|
| 453 |
+
tokens = []
|
| 454 |
+
for tid in token_ids:
|
| 455 |
+
if skip_special_tokens and tid in special_ids:
|
| 456 |
+
continue
|
| 457 |
+
token = self._convert_id_to_token(tid)
|
| 458 |
+
tokens.append(token)
|
| 459 |
+
|
| 460 |
+
# Join tokens, handling space prefix
|
| 461 |
+
text = ""
|
| 462 |
+
for t in tokens:
|
| 463 |
+
if t.startswith("▁"):
|
| 464 |
+
text += " " + t[1:]
|
| 465 |
+
else:
|
| 466 |
+
text += t
|
| 467 |
+
|
| 468 |
+
return text.strip()
|
| 469 |
+
|
| 470 |
+
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
| 471 |
+
"""Convert token list back to string."""
|
| 472 |
+
text = ""
|
| 473 |
+
for t in tokens:
|
| 474 |
+
if t.startswith("▁"):
|
| 475 |
+
text += " " + t[1:]
|
| 476 |
+
else:
|
| 477 |
+
text += t
|
| 478 |
+
return text.strip()
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
# --- CONVENIENCE FUNCTION ---
|
| 482 |
+
def create_tokenizer(vocab_path: Optional[str] = None) -> PaniniTokenizerV3:
    """Convenience factory: build a PaniniTokenizerV3, optionally from a vocab file."""
    tokenizer = PaniniTokenizerV3(vocab_file=vocab_path)
    return tokenizer
+
|
| 487 |
+
# --- TEST ---
|
| 488 |
+
if __name__ == "__main__":
    # Smoke test: a simple sentence, an inflected compound, and a deep
    # samāsa chain — prints tokens, ids, and the round-tripped text.
    banner = "=" * 60
    print("\n" + banner)
    print(" Testing PaniniTokenizerV3")
    print(banner)

    tokenizer = PaniniTokenizerV3()

    test_cases = [
        "rAmaH gacCati",
        "hfdpadmagataM paramAtma",
        "sopAdhikapratyagAtmAbhAsabhedAbhedavicAraH",
    ]

    for text in test_cases:
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.encode(text, add_special_tokens=False)
        decoded = tokenizer.decode(ids)

        print(f"\n Input: {text}")
        print(f" Tokens: {tokens}")
        print(f" IDs: {ids[:10]}..." if len(ids) > 10 else f" IDs: {ids}")
        print(f" Decoded: {decoded}")
stems.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tokenizer_class": "PaniniTokenizer",
|
| 3 |
+
"auto_map": {
|
| 4 |
+
"AutoTokenizer": "tokenizer_hf.PaniniTokenizerHF"
|
| 5 |
+
},
|
| 6 |
+
"model_type": "panini_morphological",
|
| 7 |
+
"vocab_size": 128000,
|
| 8 |
+
"unk_token": "<unk>",
|
| 9 |
+
"pad_token": "<pad>",
|
| 10 |
+
"bos_token": "<bos>",
|
| 11 |
+
"eos_token": "<eos>",
|
| 12 |
+
"version": "1.0",
|
| 13 |
+
"release_name": "panini-tokenizer"
|
| 14 |
+
}
|
tokenizer_hf.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HuggingFace-compatible wrapper for PaniniTokenizer.
|
| 3 |
+
|
| 4 |
+
This file enables:
|
| 5 |
+
tokenizer = AutoTokenizer.from_pretrained("ArthaLabs/panini-tokenizer", trust_remote_code=True)
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
from typing import List, Optional, Union
|
| 11 |
+
from transformers import PreTrainedTokenizer
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class PaniniTokenizerHF(PreTrainedTokenizer):
    """
    HuggingFace-compatible Panini Tokenizer.

    A grammar-first Sanskrit tokenizer based on Pāṇinian morphological analysis.
    Uses Monier-Williams dictionary stems and Sandhi reversal for tokenization.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        bos_token: str = "<bos>",
        eos_token: str = "<eos>",
        **kwargs
    ):
        # All attributes read by _tokenize/_convert_token_to_id must exist
        # BEFORE super().__init__(): the transformers base class may invoke
        # tokenization while registering special tokens.
        # FIX: _splitter/_stems were previously assigned *after* the super()
        # call, so any tokenization triggered during parent init crashed with
        # AttributeError inside _load_splitter().
        self._vocab = {}
        self._id_to_token = {}
        self._splitter = None  # lazily loaded morphological splitter
        self._stems = None

        if vocab_file and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
            self._id_to_token = {v: k for k, v in self._vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

    def _load_splitter(self):
        """Lazy-load the morphological splitter from the bundled src/ directory."""
        if self._splitter is None:
            # Make the repo's src/ importable next to this file.
            import sys
            src_dir = os.path.join(os.path.dirname(__file__), "src")
            if src_dir not in sys.path:
                sys.path.insert(0, src_dir)

            try:
                from splitter import SamasaSplitter
                # NOTE(review): src/tokenizer.py constructs
                # SamasaSplitter(analyzer) — confirm the no-arg form is valid.
                self._splitter = SamasaSplitter()
            except ImportError:
                # Degrade gracefully to whitespace tokenization.
                self._splitter = None

    @property
    def vocab_size(self) -> int:
        """Number of entries in the loaded vocabulary."""
        return len(self._vocab)

    def get_vocab(self):
        """Return a shallow copy of the token -> id mapping."""
        return self._vocab.copy()

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize using morphological analysis (whitespace fallback)."""
        self._load_splitter()

        tokens = []
        for word in text.split():
            # FIX: every word starts with the "▁" word-boundary marker.
            # Previously only the first word was prefixed
            # (`"▁" if i == 0 or not tokens else ""`), so
            # convert_tokens_to_string lost all spaces between words.
            if self._splitter:
                split_result = self._splitter.split(word)
                if split_result.is_compound and len(split_result.components) > 1:
                    # Only the first compound component carries the marker.
                    for j, comp in enumerate(split_result.components):
                        tokens.append(("▁" + comp) if j == 0 else comp)
                    continue
            # Non-compound word, or no splitter available.
            tokens.append("▁" + word)

        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id; unknown tokens fall back to the unk id (0)."""
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its token; unknown ids yield the unk token."""
        return self._id_to_token.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert tokens back to string; "▁" becomes a leading space."""
        text = ""
        for token in tokens:
            if token.startswith("▁"):
                text += " " + token[1:]
            else:
                text += token
        return text.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        """Save vocabulary to file; returns a 1-tuple of the written path."""
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|