"""
Panini Tokenizer - Interactive Demo

HuggingFace Space for comparing Panini Tokenizer against SOTA models.

ArthaLabs 2025
"""

import os
import sys

import gradio as gr
from transformers import AutoTokenizer

# Make the bundled tokenizer sources in src/ importable.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.join(BASE_DIR, "src")
sys.path.insert(0, SRC_DIR)

STEMS_PATH = os.path.join(BASE_DIR, "stems.json")

PANINI_AVAILABLE = False
PANINI_SPLITTER = None

try:
    # Point the analyzer at the bundled stem lexicon and force a cache reload.
    import analyzer
    analyzer.STEMS_FILE = STEMS_PATH
    analyzer._STEM_CACHE_LOADED = False

    from splitter import SamasaSplitter
    PANINI_SPLITTER = SamasaSplitter()
    PANINI_AVAILABLE = True
    print("✅ Panini Tokenizer loaded successfully")
except Exception as e:
    print(f"❌ Panini Tokenizer not available: {e}")
    import traceback
    traceback.print_exc()
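
# Optional sanity check -- a sketch, assuming the split_v4() interface used in
# tokenize_with_panini() below (a result object with .is_compound and .components):
# if PANINI_AVAILABLE:
#     result = PANINI_SPLITTER.split_v4("kurukzetre")
#     print(result.is_compound, result.components)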

TOKENIZERS = {}

# Display name -> HuggingFace model id for each baseline tokenizer.
BASELINE_MODELS = {
    "Sanskrit-BERT": "Matej/bert-base-buddhist-sanskrit",
    "MuRIL (Google)": "google/muril-base-cased",
    "Ansh-256k (Indic)": "LingoIITGN/Ansh-256k",
    "Sanskrit-Qwen2": "diabolic6045/Sanskrit-English-qwen2-tokenizer",
}


def load_tokenizers():
    """Load the baseline tokenizers for comparison, skipping any that fail."""
    for name, model_id in BASELINE_MODELS.items():
        try:
            TOKENIZERS[name] = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=True
            )
            print(f"✅ {name} loaded")
        except Exception as e:
            print(f"{name} failed: {e}")


load_tokenizers()


def tokenize_with_panini(text: str) -> list:
    """Tokenize text using the Panini Tokenizer."""
    if not PANINI_AVAILABLE or PANINI_SPLITTER is None:
        return ["[Panini not available]"]

    try:
        tokens = []
        words = text.split()

        for i, word in enumerate(words):
            # Only the first word carries the SentencePiece-style "▁" start marker.
            prefix = "▁" if i == 0 else ""
            split_result = PANINI_SPLITTER.split_v4(word)

            if split_result.is_compound and len(split_result.components) > 1:
                # One token per compound component; the prefix goes on the first.
                for j, comp in enumerate(split_result.components):
                    tokens.append(prefix + comp if j == 0 else comp)
            else:
                tokens.append(prefix + word)

        return tokens
    except Exception as e:
        return [f"[Error: {e}]"]


def tokenize_text(text: str):
    """Tokenize text with all tokenizers and return a Markdown comparison."""
    if not text.strip():
        return "Please enter some Sanskrit text (SLP1 transliteration)"

    results = []

    # Panini goes first so the comparison below can treat results[0] as ours.
    panini_tokens = tokenize_with_panini(text)
    results.append({
        "name": "🏆 Panini (Ours)",
        "count": len(panini_tokens),
        "tokens": panini_tokens,
        "is_panini": True,
    })

    for name, tok in TOKENIZERS.items():
        try:
            tokens = tok.tokenize(text)
            results.append({
                "name": name,
                "count": len(tokens),
                "tokens": tokens,
                "is_panini": False,
            })
        except Exception as e:
            results.append({
                "name": name,
                "count": "Error",
                "tokens": [str(e)[:30]],
                "is_panini": False,
            })

    md = "## 📊 Tokenization Results\n\n"

    # Compression ratio: average baseline token count divided by Panini's count.
    panini_count = results[0]["count"] if isinstance(results[0]["count"], int) else 0
    other_counts = [r["count"] for r in results[1:] if isinstance(r["count"], int)]
    if other_counts and panini_count > 0:
        avg_other = sum(other_counts) / len(other_counts)
        compression = avg_other / panini_count
        md += f"**Compression:** Panini uses **{compression:.1f}x fewer tokens** than average\n\n"

    md += "---\n\n"

    for r in results:
        # Bold the token count on our row so it stands out.
        if r["is_panini"]:
            md += f"### {r['name']} — **{r['count']} tokens**\n"
        else:
            md += f"### {r['name']} — {r['count']} tokens\n"

        # Show at most the first 10 tokens, truncated to keep the line short.
        tokens_str = " | ".join(r["tokens"][:10])
        if len(tokens_str) > 80:
            tokens_str = tokens_str[:80] + "..."
        elif len(r["tokens"]) > 10:
            tokens_str += " ..."

        md += f"```\n{tokens_str}\n```\n\n"

    return md


def get_examples():
    """Return example inputs (SLP1 transliteration)."""
    return [
        ["nirapekzajYAnasAkzAtkArasAmarthyam"],
        ["tadekaniScitArthavyavasthApanam"],
        ["svaprakASatvaparaprakASavyavacCedaH"],
        ["rAmo gacCati"],
        ["dharme kzetre kurukzetre"],
        ["parasparApekzApratiyogitvanirUpaNam"],
    ]


with gr.Blocks(
    title="Panini Tokenizer - ArthaLabs",
    theme=gr.themes.Soft(),
    css="""
    .container { max-width: 900px; margin: auto; }
    .title { text-align: center; }
    """,
) as demo:

    gr.Markdown(
        """
# 🔤 Panini Tokenizer
### Grammar-First Sanskrit Tokenization by ArthaLabs

Compare our morphology-based tokenizer against state-of-the-art multilingual models.

**Input Format:** SLP1 transliteration (e.g., `rAmo gacCati`, not `रामो गच्छति`)
"""
    )

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Sanskrit Text (SLP1)",
                placeholder="Enter Sanskrit text in SLP1 transliteration...",
                lines=2,
                value="nirapekzajYAnasAkzAtkArasAmarthyam",
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("🔍 Tokenize", variant="primary", size="lg")

    output = gr.Markdown(label="Results")

    gr.Examples(
        examples=get_examples(),
        inputs=text_input,
        label="Example Inputs (click to try)",
    )

    # Run tokenization on button click or when the user presses Enter.
    submit_btn.click(fn=tokenize_text, inputs=text_input, outputs=output)
    text_input.submit(fn=tokenize_text, inputs=text_input, outputs=output)

    gr.Markdown(
        """
---
### About

**Panini Tokenizer** uses recursive morphological analysis based on Pāṇinian grammar rules,
not statistical BPE. This results in:

- ✅ **2-4x fewer tokens** for complex compounds
- ✅ **Semantically meaningful** token boundaries
- ✅ **No arbitrary subword fragments** like `##k`, `##z`, `##ab`

[📖 Model Card](https://huggingface.co/ArthaLabs/panini-tokenizer) |
[📊 Full Benchmarks](https://huggingface.co/ArthaLabs/panini-tokenizer/blob/main/BENCHMARKS.md)

---
*© 2025 ArthaLabs - Apache 2.0 License*
"""
    )
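
    # For illustration only (hypothetical outputs): a WordPiece baseline might
    # split "kurukzetre" into pieces like ["kuru", "##kz", "##etre"], whereas
    # the Panini splitter targets morpheme boundaries such as ["kuru", "kzetre"].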


if __name__ == "__main__":
    demo.launch()