""" Panini Tokenizer - Interactive Demo HuggingFace Space for comparing Panini Tokenizer against SOTA models. ArthaLabs 2025 """ import gradio as gr from transformers import AutoTokenizer import sys import os # Get the base directory (where app.py is located) BASE_DIR = os.path.dirname(os.path.abspath(__file__)) SRC_DIR = os.path.join(BASE_DIR, "src") # Add src to path for Panini Tokenizer sys.path.insert(0, SRC_DIR) # Set the STEMS_FILE path BEFORE importing analyzer # This patches the module-level variable import json STEMS_PATH = os.path.join(BASE_DIR, "stems.json") # Try to import Panini Tokenizer components PANINI_AVAILABLE = False PANINI_SPLITTER = None try: # Patch the analyzer module's STEMS_FILE path import analyzer analyzer.STEMS_FILE = STEMS_PATH analyzer._STEM_CACHE_LOADED = False # Force reload with correct path from splitter import SamasaSplitter PANINI_SPLITTER = SamasaSplitter() PANINI_AVAILABLE = True print(f"✅ Panini Tokenizer loaded successfully") except Exception as e: print(f"❌ Panini Tokenizer not available: {e}") import traceback traceback.print_exc() # Load comparison tokenizers TOKENIZERS = {} def load_tokenizers(): """Load all tokenizers for comparison.""" global TOKENIZERS # Sanskrit-BERT (Buddhist Sanskrit) try: TOKENIZERS["Sanskrit-BERT"] = AutoTokenizer.from_pretrained( "Matej/bert-base-buddhist-sanskrit", trust_remote_code=True ) print("✅ Sanskrit-BERT loaded") except Exception as e: print(f"Sanskrit-BERT failed: {e}") # MuRIL (Google) try: TOKENIZERS["MuRIL (Google)"] = AutoTokenizer.from_pretrained( "google/muril-base-cased", trust_remote_code=True ) print("✅ MuRIL loaded") except Exception as e: print(f"MuRIL failed: {e}") # Ansh-256k (22 Indic Languages) try: TOKENIZERS["Ansh-256k (Indic)"] = AutoTokenizer.from_pretrained( "LingoIITGN/Ansh-256k", trust_remote_code=True ) print("✅ Ansh-256k loaded") except Exception as e: print(f"Ansh-256k failed: {e}") # Sanskrit-Qwen2 Tokenizer try: TOKENIZERS["Sanskrit-Qwen2"] = AutoTokenizer.from_pretrained( "diabolic6045/Sanskrit-English-qwen2-tokenizer", trust_remote_code=True ) print("✅ Sanskrit-Qwen2 loaded") except Exception as e: print(f"Sanskrit-Qwen2 failed: {e}") # Initialize tokenizers load_tokenizers() def tokenize_with_panini(text: str) -> list: """Tokenize using Panini Tokenizer.""" if not PANINI_AVAILABLE or PANINI_SPLITTER is None: return ["[Panini not available]"] try: tokens = [] words = text.split() for i, word in enumerate(words): prefix = "▁" if i == 0 else "" split_result = PANINI_SPLITTER.split_v4(word) # V1.5: Uses sandhi expansion if split_result.is_compound and len(split_result.components) > 1: for j, comp in enumerate(split_result.components): if j == 0: tokens.append(prefix + comp) else: tokens.append(comp) else: tokens.append(prefix + word) return tokens except Exception as e: return [f"[Error: {e}]"] def tokenize_text(text: str): """Tokenize text with all tokenizers and return comparison.""" if not text.strip(): return "Please enter some Sanskrit text (SLP1 transliteration)" results = [] # Panini Tokenizer panini_tokens = tokenize_with_panini(text) results.append({ "name": "🏆 Panini (Ours)", "count": len(panini_tokens), "tokens": panini_tokens, "is_panini": True }) # Other tokenizers for name, tok in TOKENIZERS.items(): try: tokens = tok.tokenize(text) results.append({ "name": name, "count": len(tokens), "tokens": tokens, "is_panini": False }) except Exception as e: results.append({ "name": name, "count": "Error", "tokens": [str(e)[:30]], "is_panini": False }) # Build card-style output (handles overflow better) md = "## 📊 Tokenization Results\n\n" # Summary bar panini_count = results[0]['count'] if isinstance(results[0]['count'], int) else 0 other_counts = [r['count'] for r in results[1:] if isinstance(r['count'], int)] if other_counts and panini_count > 0: avg_other = sum(other_counts) / len(other_counts) compression = avg_other / panini_count md += f"**Compression:** Panini uses **{compression:.1f}x fewer tokens** than average\n\n" md += "---\n\n" # Each tokenizer as a card for r in results: if r['is_panini']: md += f"### {r['name']} — **{r['count']} tokens**\n" else: md += f"### {r['name']} — {r['count']} tokens\n" # Truncate tokens display to ~60 chars tokens_str = " | ".join(r['tokens'][:10]) if len(tokens_str) > 80: tokens_str = tokens_str[:80] + "..." elif len(r['tokens']) > 10: tokens_str += " ..." md += f"```\n{tokens_str}\n```\n\n" return md def get_examples(): """Return example inputs.""" return [ ["nirapekzajYAnasAkzAtkArasAmarthyam"], ["tadekaniScitArthavyavasthApanam"], ["svaprakASatvaparaprakASavyavacCedaH"], ["rAmo gacCati"], ["dharme kzetre kurukzetre"], ["parasparApekzApratiyogitvanirUpaNam"], ] # Build Gradio Interface with gr.Blocks( title="Panini Tokenizer - ArthaLabs", theme=gr.themes.Soft(), css=""" .container { max-width: 900px; margin: auto; } .title { text-align: center; } """ ) as demo: gr.Markdown( """ # 🔤 Panini Tokenizer ### Grammar-First Sanskrit Tokenization by ArthaLabs Compare our morphology-based tokenizer against state-of-the-art multilingual models. **Input Format:** SLP1 transliteration (e.g., `rAmo gacCati` not `रामो गच्छति`) """ ) with gr.Row(): with gr.Column(scale=3): text_input = gr.Textbox( label="Sanskrit Text (SLP1)", placeholder="Enter Sanskrit text in SLP1 transliteration...", lines=2, value="nirapekzajYAnasAkzAtkArasAmarthyam" ) with gr.Column(scale=1): submit_btn = gr.Button("🔍 Tokenize", variant="primary", size="lg") output = gr.Markdown(label="Results") gr.Examples( examples=get_examples(), inputs=text_input, label="Example Inputs (click to try)" ) submit_btn.click( fn=tokenize_text, inputs=text_input, outputs=output ) text_input.submit( fn=tokenize_text, inputs=text_input, outputs=output ) gr.Markdown( """ --- ### About **Panini Tokenizer** uses recursive morphological analysis based on Pāṇinian grammar rules, not statistical BPE. This results in: - ✅ **2-4x fewer tokens** for complex compounds - ✅ **Semantically meaningful** token boundaries - ✅ **No arbitrary byte-level splits** like `##k`, `##z`, `##ab` [📖 Model Card](https://huggingface.co/ArthaLabs/panini-tokenizer) | [📊 Full Benchmarks](https://huggingface.co/ArthaLabs/panini-tokenizer/blob/main/BENCHMARKS.md) --- *© 2025 ArthaLabs - Apache 2.0 License* """ ) if __name__ == "__main__": demo.launch()