# ArthaLabs's picture
# Upload folder using huggingface_hub
# 77111fb verified
"""
Panini Tokenizer - Interactive Demo
HuggingFace Space for comparing Panini Tokenizer against SOTA models.
ArthaLabs 2025
"""
import gradio as gr
from transformers import AutoTokenizer
import sys
import os
# Get the base directory (where app.py is located)
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.join(BASE_DIR, "src")
# Add src to path for Panini Tokenizer
sys.path.insert(0, SRC_DIR)
# Set the STEMS_FILE path BEFORE importing analyzer
# This patches the module-level variable
import json  # NOTE(review): json appears unused in this file — confirm before removing
STEMS_PATH = os.path.join(BASE_DIR, "stems.json")
# Try to import Panini Tokenizer components.
# These two globals are consumed by tokenize_with_panini(); they stay
# False/None if the local packages fail to import, so the rest of the demo
# still works (comparison models only).
PANINI_AVAILABLE = False
PANINI_SPLITTER = None
try:
    # Patch the analyzer module's STEMS_FILE path so it reads the bundled
    # stems.json next to app.py rather than whatever default it ships with.
    import analyzer
    analyzer.STEMS_FILE = STEMS_PATH
    analyzer._STEM_CACHE_LOADED = False  # Force reload with correct path
    from splitter import SamasaSplitter
    PANINI_SPLITTER = SamasaSplitter()
    PANINI_AVAILABLE = True
    print(f"✅ Panini Tokenizer loaded successfully")
except Exception as e:
    # Best-effort import: log the failure loudly but keep the app running.
    print(f"❌ Panini Tokenizer not available: {e}")
    import traceback
    traceback.print_exc()
# Load comparison tokenizers
# Display name -> loaded tokenizer; populated by load_tokenizers().
TOKENIZERS = {}

# (display name, HF Hub repo id, short name used in log messages).
_COMPARISON_MODELS = [
    ("Sanskrit-BERT", "Matej/bert-base-buddhist-sanskrit", "Sanskrit-BERT"),
    ("MuRIL (Google)", "google/muril-base-cased", "MuRIL"),
    ("Ansh-256k (Indic)", "LingoIITGN/Ansh-256k", "Ansh-256k"),
    ("Sanskrit-Qwen2", "diabolic6045/Sanskrit-English-qwen2-tokenizer", "Sanskrit-Qwen2"),
]


def load_tokenizers():
    """Load all tokenizers for comparison.

    Populates the module-level ``TOKENIZERS`` dict in place. Loading is
    best-effort: a download/instantiation failure for one model is logged
    and never prevents the remaining models from loading.
    """
    for display_name, repo_id, short_name in _COMPARISON_MODELS:
        try:
            TOKENIZERS[display_name] = AutoTokenizer.from_pretrained(
                repo_id, trust_remote_code=True
            )
            print(f"✅ {short_name} loaded")
        except Exception as e:
            print(f"{short_name} failed: {e}")


# Initialize tokenizers at import time so the UI is ready immediately.
load_tokenizers()
def tokenize_with_panini(text: str) -> list:
    """Tokenize SLP1 text with the Panini morphological splitter.

    Splits on whitespace, runs each word through the compound splitter, and
    flattens the components into one token list. Only the very first word
    carries the SentencePiece-style "▁" marker. Returns a one-element
    placeholder list when the splitter is unavailable or raises.
    """
    if not PANINI_AVAILABLE or PANINI_SPLITTER is None:
        return ["[Panini not available]"]
    try:
        output = []
        for idx, word in enumerate(text.split()):
            marker = "▁" if idx == 0 else ""
            result = PANINI_SPLITTER.split_v4(word)  # V1.5: Uses sandhi expansion
            if result.is_compound and len(result.components) > 1:
                # Attach the word marker to the first component only.
                parts = list(result.components)
                parts[0] = marker + parts[0]
                output.extend(parts)
            else:
                output.append(marker + word)
        return output
    except Exception as exc:
        return [f"[Error: {exc}]"]
def tokenize_text(text: str):
    """Tokenize text with all tokenizers and return a markdown comparison.

    Args:
        text: Sanskrit input in SLP1 transliteration.

    Returns:
        Markdown with one "card" per tokenizer, led by a compression summary
        (average token count of the other models divided by Panini's count).
    """
    if not text.strip():
        return "Please enter some Sanskrit text (SLP1 transliteration)"
    results = []
    # Panini goes first so results[0] is always ours (used by the summary).
    panini_tokens = tokenize_with_panini(text)
    results.append({
        "name": "🏆 Panini (Ours)",
        "count": len(panini_tokens),
        "tokens": panini_tokens,
        "is_panini": True,
    })
    # Other tokenizers: one model failing must not hide the rest.
    for name, tok in TOKENIZERS.items():
        try:
            tokens = tok.tokenize(text)
            results.append({
                "name": name,
                "count": len(tokens),
                "tokens": tokens,
                "is_panini": False,
            })
        except Exception as e:
            results.append({
                "name": name,
                "count": "Error",
                "tokens": [str(e)[:30]],
                "is_panini": False,
            })
    # Build card-style output (handles overflow better than a table)
    md = "## 📊 Tokenization Results\n\n"
    # Summary bar: skip it when Panini failed or no other model loaded.
    panini_count = results[0]['count'] if isinstance(results[0]['count'], int) else 0
    other_counts = [r['count'] for r in results[1:] if isinstance(r['count'], int)]
    if other_counts and panini_count > 0:
        avg_other = sum(other_counts) / len(other_counts)
        compression = avg_other / panini_count
        md += f"**Compression:** Panini uses **{compression:.1f}x fewer tokens** than average\n\n"
    md += "---\n\n"
    # Each tokenizer as a card
    for r in results:
        if r['is_panini']:
            md += f"### {r['name']} — **{r['count']} tokens**\n"
        else:
            # BUG FIX: name and count were concatenated with no separator
            # (e.g. "### MuRIL (Google)5 tokens"); use the same em-dash as
            # the Panini card above.
            md += f"### {r['name']} — {r['count']} tokens\n"
        # Show at most 10 tokens, truncated to ~80 characters.
        tokens_str = " | ".join(r['tokens'][:10])
        if len(tokens_str) > 80:
            tokens_str = tokens_str[:80] + "..."
        elif len(r['tokens']) > 10:
            tokens_str += " ..."
        md += f"```\n{tokens_str}\n```\n\n"
    return md
def get_examples():
    """Return the clickable example inputs, one single-item list per row."""
    samples = (
        "nirapekzajYAnasAkzAtkArasAmarthyam",
        "tadekaniScitArthavyavasthApanam",
        "svaprakASatvaparaprakASavyavacCedaH",
        "rAmo gacCati",
        "dharme kzetre kurukzetre",
        "parasparApekzApratiyogitvanirUpaNam",
    )
    # gr.Examples expects a list of rows, each a list of component values.
    return [[s] for s in samples]
# ---------------------------------------------------------------------------
# Build Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="Panini Tokenizer - ArthaLabs",
    theme=gr.themes.Soft(),
    # Injected verbatim into the page; keeps the layout narrow and centered.
    css="""
    .container { max-width: 900px; margin: auto; }
    .title { text-align: center; }
    """
) as demo:
    # Header: title plus a note that input must be SLP1, not Devanagari.
    gr.Markdown(
        """
        # 🔤 Panini Tokenizer
        ### Grammar-First Sanskrit Tokenization by ArthaLabs
        Compare our morphology-based tokenizer against state-of-the-art multilingual models.
        **Input Format:** SLP1 transliteration (e.g., `rAmo gacCati` not `रामो गच्छति`)
        """
    )
    # Input row: wide textbox (scale=3) next to a narrow submit button.
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Sanskrit Text (SLP1)",
                placeholder="Enter Sanskrit text in SLP1 transliteration...",
                lines=2,
                value="nirapekzajYAnasAkzAtkArasAmarthyam"
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("🔍 Tokenize", variant="primary", size="lg")
    # Markdown area that receives the comparison cards from tokenize_text().
    output = gr.Markdown(label="Results")
    # Clickable example inputs rendered below the form.
    gr.Examples(
        examples=get_examples(),
        inputs=text_input,
        label="Example Inputs (click to try)"
    )
    # Both clicking the button and pressing Enter in the textbox tokenize.
    submit_btn.click(
        fn=tokenize_text,
        inputs=text_input,
        outputs=output
    )
    text_input.submit(
        fn=tokenize_text,
        inputs=text_input,
        outputs=output
    )
    # Footer: about blurb, model-card/benchmark links, license notice.
    gr.Markdown(
        """
        ---
        ### About
        **Panini Tokenizer** uses recursive morphological analysis based on Pāṇinian grammar rules,
        not statistical BPE. This results in:
        - ✅ **2-4x fewer tokens** for complex compounds
        - ✅ **Semantically meaningful** token boundaries
        - ✅ **No arbitrary byte-level splits** like `##k`, `##z`, `##ab`
        [📖 Model Card](https://huggingface.co/ArthaLabs/panini-tokenizer) |
        [📊 Full Benchmarks](https://huggingface.co/ArthaLabs/panini-tokenizer/blob/main/BENCHMARKS.md)
        ---
        *© 2025 ArthaLabs - Apache 2.0 License*
        """
    )

if __name__ == "__main__":
    demo.launch()