"""

Panini Tokenizer - Interactive Demo

HuggingFace Space for comparing Panini Tokenizer against SOTA models.



ArthaLabs 2025

"""

import gradio as gr
from transformers import AutoTokenizer
import sys
import os

# Get the base directory (where app.py is located)
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.join(BASE_DIR, "src")

# Add src to path for Panini Tokenizer
sys.path.insert(0, SRC_DIR)

# Resolve the stems.json path BEFORE importing analyzer; the module-level
# STEMS_FILE variable is patched to this path right after import.
STEMS_PATH = os.path.join(BASE_DIR, "stems.json")

# Try to import Panini Tokenizer components
PANINI_AVAILABLE = False
PANINI_SPLITTER = None

try:
    # Patch the analyzer module's STEMS_FILE path
    import analyzer
    analyzer.STEMS_FILE = STEMS_PATH
    analyzer._STEM_CACHE_LOADED = False  # Force reload with correct path
    
    from splitter import SamasaSplitter
    PANINI_SPLITTER = SamasaSplitter()
    PANINI_AVAILABLE = True
    print(f"✅ Panini Tokenizer loaded successfully")
except Exception as e:
    print(f"❌ Panini Tokenizer not available: {e}")
    import traceback
    traceback.print_exc()
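
# The patch above works because analyzer is assumed to read STEMS_FILE lazily,
# on first use rather than at import time (hence resetting _STEM_CACHE_LOADED).
# A minimal sketch of that assumed layout inside analyzer.py (hypothetical;
# the real module may differ in detail):
#
#   STEMS_FILE = "stems.json"        # module-level default, relative path
#   _STEM_CACHE_LOADED = False
#
#   def _ensure_stems_loaded():      # hypothetical helper name
#       global _STEM_CACHE_LOADED
#       if not _STEM_CACHE_LOADED:
#           with open(STEMS_FILE) as f:   # sees the patched path at call time
#               ...                       # populate the stem cache
#           _STEM_CACHE_LOADED = True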

# Load comparison tokenizers
TOKENIZERS = {}

def load_tokenizers():
    """Load all tokenizers for comparison."""
    global TOKENIZERS
    
    # Sanskrit-BERT (Buddhist Sanskrit)
    try:
        TOKENIZERS["Sanskrit-BERT"] = AutoTokenizer.from_pretrained(
            "Matej/bert-base-buddhist-sanskrit", trust_remote_code=True
        )
        print("✅ Sanskrit-BERT loaded")
    except Exception as e:
        print(f"Sanskrit-BERT failed: {e}")
    
    # MuRIL (Google)
    try:
        TOKENIZERS["MuRIL (Google)"] = AutoTokenizer.from_pretrained(
            "google/muril-base-cased", trust_remote_code=True
        )
        print("✅ MuRIL loaded")
    except Exception as e:
        print(f"MuRIL failed: {e}")
    
    # Ansh-256k (22 Indic Languages)
    try:
        TOKENIZERS["Ansh-256k (Indic)"] = AutoTokenizer.from_pretrained(
            "LingoIITGN/Ansh-256k", trust_remote_code=True
        )
        print("✅ Ansh-256k loaded")
    except Exception as e:
        print(f"Ansh-256k failed: {e}")
    
    # Sanskrit-Qwen2 Tokenizer
    try:
        TOKENIZERS["Sanskrit-Qwen2"] = AutoTokenizer.from_pretrained(
            "diabolic6045/Sanskrit-English-qwen2-tokenizer", trust_remote_code=True
        )
        print("✅ Sanskrit-Qwen2 loaded")
    except Exception as e:
        print(f"Sanskrit-Qwen2 failed: {e}")

# Initialize tokenizers
load_tokenizers()

def tokenize_with_panini(text: str) -> list:
    """Tokenize using Panini Tokenizer."""
    if not PANINI_AVAILABLE or PANINI_SPLITTER is None:
        return ["[Panini not available]"]
    
    try:
        tokens = []
        words = text.split()
        
        for i, word in enumerate(words):
            prefix = "▁" if i == 0 else ""
            split_result = PANINI_SPLITTER.split_v4(word)  # V1.5: Uses sandhi expansion
            
            if split_result.is_compound and len(split_result.components) > 1:
                for j, comp in enumerate(split_result.components):
                    if j == 0:
                        tokens.append(prefix + comp)
                    else:
                        tokens.append(comp)
            else:
                tokens.append(prefix + word)
        
        return tokens
    except Exception as e:
        return [f"[Error: {e}]"]

def tokenize_text(text: str):
    """Tokenize text with all tokenizers and return comparison."""
    if not text.strip():
        return "Please enter some Sanskrit text (SLP1 transliteration)"
    
    results = []
    
    # Panini Tokenizer
    panini_tokens = tokenize_with_panini(text)
    results.append({
        "name": "🏆 Panini (Ours)",
        "count": len(panini_tokens),
        "tokens": panini_tokens,
        "is_panini": True
    })
    
    # Other tokenizers
    for name, tok in TOKENIZERS.items():
        try:
            tokens = tok.tokenize(text)
            results.append({
                "name": name,
                "count": len(tokens),
                "tokens": tokens,
                "is_panini": False
            })
        except Exception as e:
            results.append({
                "name": name,
                "count": "Error",
                "tokens": [str(e)[:30]],
                "is_panini": False
            })
    
    # Build card-style output (long token lists wrap better in cards than in a table)
    md = "## 📊 Tokenization Results\n\n"
    
    # Summary bar
    panini_count = results[0]['count'] if isinstance(results[0]['count'], int) else 0
    other_counts = [r['count'] for r in results[1:] if isinstance(r['count'], int)]
    if other_counts and panini_count > 0:
        avg_other = sum(other_counts) / len(other_counts)
        compression = avg_other / panini_count
        md += f"**Compression:** Panini uses **{compression:.1f}x fewer tokens** than average\n\n"
    
    md += "---\n\n"
    
    # Each tokenizer as a card
    for r in results:
        if r['is_panini']:
            md += f"### {r['name']} — **{r['count']} tokens**\n"
        else:
            md += f"### {r['name']}{r['count']} tokens\n"
        
        # Truncate tokens display to ~80 chars
        tokens_str = " | ".join(r['tokens'][:10])
        if len(tokens_str) > 80:
            tokens_str = tokens_str[:80] + "..."
        elif len(r['tokens']) > 10:
            tokens_str += " ..."
        
        md += f"```\n{tokens_str}\n```\n\n"
    
    return md
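
# Each tokenizer renders as one card in the returned markdown, roughly:
#
#   ### 🏆 Panini (Ours) — N tokens
#   ```
#   tok1 | tok2 | tok3 ...
#   ```
#
# (N and the token strings depend entirely on the input text.)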

def get_examples():
    """Return example inputs."""
    return [
        ["nirapekzajYAnasAkzAtkArasAmarthyam"],
        ["tadekaniScitArthavyavasthApanam"],
        ["svaprakASatvaparaprakASavyavacCedaH"],
        ["rAmo gacCati"],
        ["dharme kzetre kurukzetre"],
        ["parasparApekzApratiyogitvanirUpaNam"],
    ]

# Build Gradio Interface
with gr.Blocks(
    title="Panini Tokenizer - ArthaLabs",
    theme=gr.themes.Soft(),
    css="""

    .container { max-width: 900px; margin: auto; }

    .title { text-align: center; }

    """
) as demo:
    
    gr.Markdown(
        """
        # 🔤 Panini Tokenizer
        ### Grammar-First Sanskrit Tokenization by ArthaLabs

        Compare our morphology-based tokenizer against state-of-the-art multilingual models.

        **Input Format:** SLP1 transliteration (e.g., `rAmo gacCati` not `रामो गच्छति`)
        """
    )
    
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Sanskrit Text (SLP1)",
                placeholder="Enter Sanskrit text in SLP1 transliteration...",
                lines=2,
                value="nirapekzajYAnasAkzAtkArasAmarthyam"
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("🔍 Tokenize", variant="primary", size="lg")
    
    output = gr.Markdown(label="Results")
    
    gr.Examples(
        examples=get_examples(),
        inputs=text_input,
        label="Example Inputs (click to try)"
    )
    
    submit_btn.click(
        fn=tokenize_text,
        inputs=text_input,
        outputs=output
    )
    
    text_input.submit(
        fn=tokenize_text,
        inputs=text_input,
        outputs=output
    )
    
    gr.Markdown(
        """
        ---
        ### About

        **Panini Tokenizer** uses recursive morphological analysis based on Pāṇinian grammar rules,
        not statistical BPE. This results in:

        - ✅ **2-4x fewer tokens** for complex compounds
        - ✅ **Semantically meaningful** token boundaries
        - ✅ **No arbitrary byte-level splits** like `##k`, `##z`, `##ab`

        [📖 Model Card](https://huggingface.co/ArthaLabs/panini-tokenizer) |
        [📊 Full Benchmarks](https://huggingface.co/ArthaLabs/panini-tokenizer/blob/main/BENCHMARKS.md)

        ---
        *© 2025 ArthaLabs - Apache 2.0 License*
        """
    )

if __name__ == "__main__":
    demo.launch()