""" Gradio app for Marathi BPE Tokenizer — redesigned UI with hover tooltips and smooth animations. Usage: python app.py """ from typing import Tuple, List, Dict import re import gradio as gr from tokenizer import MarathiBPETokenizer # type: ignore # Accent palette for token chips ACCENTS = [ "#1FB6FF", # azure "#00D4B8", # teal "#FFB86B", # amber "#FF6B6B", # coral "#A78BFA", # violet "#FFD166", # yellow "#8ED1FC", # light sky "#6CE0B6", # mint ] def _token_text(tokenizer: MarathiBPETokenizer, tid: int) -> str: """Resolve token id to readable text.""" try: if hasattr(tokenizer, "decode"): out = tokenizer.decode([tid]) if out: return out except Exception: pass if isinstance(getattr(tokenizer, "id_to_token", None), dict): return tokenizer.id_to_token.get(tid, f"<{tid}>") vocab = getattr(tokenizer, "vocab", None) if isinstance(vocab, dict): if tid in vocab: return vocab[tid] for k, v in vocab.items(): if v == tid: return k return f"<{tid}>" def tokenize_and_visualize(text: str, tokenizer: MarathiBPETokenizer) -> Tuple[str, str, str]: """Return (visual_html, count_card_html, token_ids_table_html).""" if not text or not text.strip(): placeholder = ( "
" "Enter Marathi text and click Analyze.
" ) return placeholder, "
Token count will appear here
", placeholder try: token_ids: List[int] = tokenizer.encode(text) except Exception: token_ids = [] for part in text.split(): try: token_ids.extend(tokenizer.encode(part)) except Exception: continue tid_to_color: Dict[int, str] = {} unique_tids: List[int] = [] for tid in token_ids: if tid not in tid_to_color: tid_to_color[tid] = ACCENTS[len(unique_tids) % len(ACCENTS)] unique_tids.append(tid) vis_outer = [ '
' ] pattern = getattr(tokenizer, "pattern", r"\S+") chunks = re.findall(pattern, text) token_idx = 0 token_rows = [] for chunk in chunks: if hasattr(tokenizer, "_apply_bpe"): try: chunk_tids = tokenizer._apply_bpe(chunk) except Exception: chunk_tids = tokenizer.encode(chunk) if hasattr(tokenizer, "encode") else [] else: try: chunk_tids = tokenizer.encode(chunk) except Exception: chunk_tids = [] for tid in chunk_tids: token_text = _token_text(tokenizer, tid) color = tid_to_color.get(tid, ACCENTS[0]) token_rows.append((token_idx, tid, token_text, color)) # Each chip has data attributes for JS tooltip vis_outer.append( f'' f'{token_text}' ) token_idx += 1 vis_outer.append("
") visual_html = "".join(vis_outer) count_html = ( '
' f'
{len(token_ids)}
' f'
Total tokens • {len(unique_tids)} unique
' "
" ) table_parts = [ '
', '', '' '' '' '', "" ] for idx, tid, ttext, color in token_rows: table_parts.append( '' f'' f'' f'' f'' "" ) table_parts.extend(["
IdxToken IDTokenColor
{idx}{tid}{ttext!r}
"]) token_ids_html = "".join(table_parts) return visual_html, count_html, token_ids_html def create_app(tokenizer: MarathiBPETokenizer) -> gr.Blocks: """Build Gradio Blocks UI with refined styling, hover animation, and tooltips.""" css = """ /* Force light theme override on Hugging Face Spaces */ html, body, .gradio-container { background: #F8FBFF !important; color: #0B2540 !important; } /* Optional: reset the dark HuggingFace container styles */ body, .main, .app, #root { background: #F8FBFF !important; } :root{ --panel-bg:#0b2540; --tile-azure:#083E8C; --muted-text:#9CA3AF; --header-grey:#374151; } body { background: linear-gradient(180deg,#061328 0%, #071627 100%); font-family:Inter, "Noto Sans Devanagari", Arial, sans-serif; } #header { margin-bottom:14px; } .app-title { color: var(--header-grey); font-weight:700; font-size:20px; margin:0; } .app-sub { color: var(--muted-text); margin:4px 0 0 0; } /* ✅ Token chip hover + tooltip */ .token-chip { position: relative; z-index: 1; transition: all 0.25s ease-out; cursor: pointer; } .token-chip:hover { transform: translateY(-8px); z-index: 100; box-shadow: 0 24px 48px rgba(3,12,26,0.45) !important; } .tooltip { position: fixed; background: rgba(0,0,0,0.8); color: #fff; padding: 8px 12px; border-radius: 8px; font-size: 13px; font-family: Menlo, monospace; pointer-events: none; opacity: 0; transition: opacity 0.15s ease; z-index: 9999; max-width: 260px; white-space: pre-wrap; } .gr-examples, .gr-examples td, .gr-examples th { background: transparent !important; color: #E6EEF7 !important; } .gradio-tooltip { color:#081026 !important; background:#F3F7FB !important; } .gr-row { gap:18px; } .muted { color: var(--muted-text); font-size:13px; } """ js = """ """ with gr.Blocks(css=css) as demo: gr.HTML('
') # Global tooltip container gr.HTML(js) # Inject JS handlers with gr.Row(elem_id="header"): with gr.Column(scale=1): gr.Markdown( "

Marathi BPE Tokenizer

" "
Enterprise token inspection & visualization
" ) with gr.Row(): with gr.Column(scale=1): input_text = gr.Textbox( label="Input Text", placeholder="नमस्ते, मी एक मराठी टोकनायझर आहे", lines=6 ) analyze_btn = gr.Button("Analyze", variant="primary") gr.Markdown("
Sample inputs
") gr.Examples( examples=[ ["नमस्ते, मी एक मराठी टोकनायझर आहे."], ["क्रिकेट - लहान मुले बागेत क्रिकेट खेळत आहेत."], ["गाडी हळूहू चालवा किंवा आपल्याला अपघात होऊ शकतो."], ["सचिन तेंडुलकर हा आमचा अव्वल क्रिकेटपटू आहे."], ], inputs=[input_text], ) with gr.Column(scale=1): visual_out = gr.HTML("
Token visualization will appear here
") count_out = gr.HTML("
Token count will appear here
") table_out = gr.HTML("
Token details will appear here
") def _process(text: str): return tokenize_and_visualize(text or "", tokenizer) analyze_btn.click(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out]) input_text.submit(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out]) return demo def main(): tokenizer = MarathiBPETokenizer() try: tokenizer.load_vocab("model/vocab.json") print("✓ Loaded vocabulary successfully") except FileNotFoundError: print("ERROR: Vocabulary file not found at 'model/vocab.json'") print("Run: python train.py to train and save the tokenizer.") return demo = create_app(tokenizer) demo.launch() if __name__ == "__main__": main()