"""
Gradio app for Marathi BPE Tokenizer — redesigned UI with hover tooltips and smooth animations.
Usage: python app.py
"""
from typing import Tuple, List, Dict
import html
import re

import gradio as gr

from tokenizer import MarathiBPETokenizer  # type: ignore
# Accent palette for token chips.
# Colours are assigned round-robin per *unique* token id in
# tokenize_and_visualize, so repeated tokens always share a colour.
ACCENTS = [
    "#1FB6FF",  # azure
    "#00D4B8",  # teal
    "#FFB86B",  # amber
    "#FF6B6B",  # coral
    "#A78BFA",  # violet
    "#FFD166",  # yellow
    "#8ED1FC",  # light sky
    "#6CE0B6",  # mint
]
def _token_text(tokenizer: MarathiBPETokenizer, tid: int) -> str:
"""Resolve token id to readable text."""
try:
if hasattr(tokenizer, "decode"):
out = tokenizer.decode([tid])
if out:
return out
except Exception:
pass
if isinstance(getattr(tokenizer, "id_to_token", None), dict):
return tokenizer.id_to_token.get(tid, f"<{tid}>")
vocab = getattr(tokenizer, "vocab", None)
if isinstance(vocab, dict):
if tid in vocab:
return vocab[tid]
for k, v in vocab.items():
if v == tid:
return k
return f"<{tid}>"
def tokenize_and_visualize(text: str, tokenizer: MarathiBPETokenizer) -> Tuple[str, str, str]:
    """Tokenize *text* and render the result as three HTML fragments.

    Returns:
        A 3-tuple of HTML strings:
          1. coloured token chips (with ``data-*`` attributes feeding the
             hover tooltip JS injected by ``create_app``),
          2. a summary card with total / unique token counts,
          3. a detail table (index, token id, token text, colour swatch).

    NOTE(review): the HTML fragments in the original were corrupted during
    extraction (tags stripped out of string literals, leaving unterminated
    strings). The markup below is reconstructed to be valid and consistent
    with the CSS classes defined in create_app (.token-chip, .tooltip,
    .muted) — confirm it matches the intended visual design.
    """
    if not text or not text.strip():
        placeholder = (
            '<div class="muted" style="padding:18px;text-align:center;">'
            "Enter Marathi text and click Analyze."
            "</div>"
        )
        count_placeholder = '<div class="muted">Token count will appear here</div>'
        return placeholder, count_placeholder, placeholder

    # Encode the whole text first; on failure fall back to best-effort
    # per-word encoding so one bad span does not blank the entire result.
    try:
        token_ids: List[int] = tokenizer.encode(text)
    except Exception:
        token_ids = []
        for part in text.split():
            try:
                token_ids.extend(tokenizer.encode(part))
            except Exception:
                continue

    # Stable colour per unique token id, cycling through the accent palette.
    tid_to_color: Dict[int, str] = {}
    unique_tids: List[int] = []
    for tid in token_ids:
        if tid not in tid_to_color:
            tid_to_color[tid] = ACCENTS[len(unique_tids) % len(ACCENTS)]
            unique_tids.append(tid)

    vis_parts = [
        '<div style="display:flex;flex-wrap:wrap;gap:6px;padding:14px;'
        'background:var(--panel-bg);border-radius:10px;">'
    ]
    # Split the text the same way the tokenizer does so chips line up with
    # the encoded ids; \S+ is the fallback when no pattern is exposed.
    pattern = getattr(tokenizer, "pattern", r"\S+")
    chunks = re.findall(pattern, text)
    token_idx = 0
    token_rows = []
    for chunk in chunks:
        if hasattr(tokenizer, "_apply_bpe"):
            try:
                chunk_tids = tokenizer._apply_bpe(chunk)
            except Exception:
                chunk_tids = tokenizer.encode(chunk) if hasattr(tokenizer, "encode") else []
        else:
            try:
                chunk_tids = tokenizer.encode(chunk)
            except Exception:
                chunk_tids = []
        for tid in chunk_tids:
            token_text = _token_text(tokenizer, tid)
            color = tid_to_color.get(tid, ACCENTS[0])
            token_rows.append((token_idx, tid, token_text, color))
            # Escape token text: it is user-derived and goes into markup.
            safe_text = html.escape(token_text, quote=True)
            # data-* attributes are read by the tooltip JS in create_app.
            vis_parts.append(
                f'<span class="token-chip" data-idx="{token_idx}" data-id="{tid}" '
                f'data-text="{safe_text}" style="background:{color};color:#06121f;'
                f'padding:4px 10px;border-radius:8px;font-weight:600;">'
                f"{safe_text}</span>"
            )
            token_idx += 1
    vis_parts.append("</div>")
    visual_html = "".join(vis_parts)

    count_html = (
        '<div style="background:var(--tile-azure);border-radius:10px;'
        'padding:16px;text-align:center;">'
        f'<div style="font-size:32px;font-weight:700;color:#fff;">{len(token_ids)}</div>'
        f'<div class="muted">Total tokens &bull; {len(unique_tids)} unique</div>'
        "</div>"
    )

    table_parts = [
        '<div style="max-height:320px;overflow:auto;">',
        '<table style="width:100%;border-collapse:collapse;">',
        "<thead><tr>"
        '<th style="text-align:left;padding:6px;">Idx</th>'
        '<th style="text-align:left;padding:6px;">Token ID</th>'
        '<th style="text-align:left;padding:6px;">Token</th>'
        '<th style="text-align:left;padding:6px;">Color</th>'
        "</tr></thead>",
        "<tbody>",
    ]
    for idx, tid, ttext, color in token_rows:
        table_parts.append(
            "<tr>"
            f'<td style="padding:6px;">{idx}</td>'
            f'<td style="padding:6px;">{tid}</td>'
            f'<td style="padding:6px;">{html.escape(repr(ttext))}</td>'
            f'<td style="padding:6px;"><span style="display:inline-block;'
            f'width:18px;height:18px;border-radius:4px;background:{color};"></span></td>'
            "</tr>"
        )
    table_parts.append("</tbody></table></div>")
    token_ids_html = "".join(table_parts)
    return visual_html, count_html, token_ids_html
def create_app(tokenizer: MarathiBPETokenizer) -> gr.Blocks:
    """Build the Gradio Blocks UI with styling, hover animation, and tooltips.

    NOTE(review): several HTML/Markdown string literals in the original were
    corrupted (tags stripped, leaving unterminated strings) and the tooltip
    JS string was empty. Both are reconstructed below to match the CSS that
    survived intact (.token-chip, .tooltip, .muted, .app-title) — confirm
    against the intended UI.
    """
    css = """
    /* Force light theme override on Hugging Face Spaces */
    html, body, .gradio-container {
        background: #F8FBFF !important;
        color: #0B2540 !important;
    }
    /* Optional: reset the dark HuggingFace container styles */
    body, .main, .app, #root {
        background: #F8FBFF !important;
    }
    :root{
        --panel-bg:#0b2540;
        --tile-azure:#083E8C;
        --muted-text:#9CA3AF;
        --header-grey:#374151;
    }
    body { background: linear-gradient(180deg,#061328 0%, #071627 100%); font-family:Inter, "Noto Sans Devanagari", Arial, sans-serif; }
    #header { margin-bottom:14px; }
    .app-title { color: var(--header-grey); font-weight:700; font-size:20px; margin:0; }
    .app-sub { color: var(--muted-text); margin:4px 0 0 0; }
    /* Token chip hover + tooltip */
    .token-chip {
        position: relative;
        z-index: 1;
        transition: all 0.25s ease-out;
        cursor: pointer;
    }
    .token-chip:hover {
        transform: translateY(-8px);
        z-index: 100;
        box-shadow: 0 24px 48px rgba(3,12,26,0.45) !important;
    }
    .tooltip {
        position: fixed;
        background: rgba(0,0,0,0.8);
        color: #fff;
        padding: 8px 12px;
        border-radius: 8px;
        font-size: 13px;
        font-family: Menlo, monospace;
        pointer-events: none;
        opacity: 0;
        transition: opacity 0.15s ease;
        z-index: 9999;
        max-width: 260px;
        white-space: pre-wrap;
    }
    .gr-examples, .gr-examples td, .gr-examples th { background: transparent !important; color: #E6EEF7 !important; }
    .gradio-tooltip { color:#081026 !important; background:#F3F7FB !important; }
    .gr-row { gap:18px; }
    .muted { color: var(--muted-text); font-size:13px; }
    """
    # Tooltip handlers: follow the cursor and show the chip's data-* payload.
    js = """
<script>
(function () {
  const tip = () => document.getElementById("token-tooltip");
  document.addEventListener("mouseover", (e) => {
    const chip = e.target.closest ? e.target.closest(".token-chip") : null;
    const t = tip();
    if (!chip || !t) return;
    t.textContent = "#" + chip.dataset.idx + "  id=" + chip.dataset.id + "\\n" + chip.dataset.text;
    t.style.opacity = "1";
  });
  document.addEventListener("mousemove", (e) => {
    const t = tip();
    if (t && t.style.opacity === "1") {
      t.style.left = (e.clientX + 14) + "px";
      t.style.top = (e.clientY + 14) + "px";
    }
  });
  document.addEventListener("mouseout", (e) => {
    if (e.target.closest && e.target.closest(".token-chip")) {
      const t = tip();
      if (t) t.style.opacity = "0";
    }
  });
})();
</script>
"""
    with gr.Blocks(css=css) as demo:
        gr.HTML('<div id="token-tooltip" class="tooltip"></div>')  # Global tooltip container
        gr.HTML(js)  # Inject JS handlers
        with gr.Row(elem_id="header"):
            with gr.Column(scale=1):
                gr.Markdown(
                    '<h1 class="app-title">Marathi BPE Tokenizer</h1>'
                    '<p class="app-sub">Enterprise token inspection &amp; visualization</p>'
                )
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="Input Text",
                    placeholder="नमस्ते, मी एक मराठी टोकनायझर आहे",
                    lines=6,
                )
                analyze_btn = gr.Button("Analyze", variant="primary")
                gr.Markdown("#### Sample inputs")
                gr.Examples(
                    examples=[
                        ["नमस्ते, मी एक मराठी टोकनायझर आहे."],
                        ["क्रिकेट - लहान मुले बागेत क्रिकेट खेळत आहेत."],
                        ["गाडी हळूहू चालवा किंवा आपल्याला अपघात होऊ शकतो."],
                        ["सचिन तेंडुलकर हा आमचा अव्वल क्रिकेटपटू आहे."],
                    ],
                    inputs=[input_text],
                )
            with gr.Column(scale=1):
                visual_out = gr.HTML('<div class="muted">Token visualization will appear here</div>')
                count_out = gr.HTML('<div class="muted">Token count will appear here</div>')
                table_out = gr.HTML('<div class="muted">Token details will appear here</div>')

        def _process(text: str):
            # Bind the tokenizer into the Gradio callback.
            return tokenize_and_visualize(text or "", tokenizer)

        analyze_btn.click(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out])
        input_text.submit(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out])
    return demo
def main():
    """Entry point: load the trained vocabulary, build the UI, and serve it."""
    tokenizer = MarathiBPETokenizer()
    try:
        tokenizer.load_vocab("model/vocab.json")
    except FileNotFoundError:
        # No trained model on disk — tell the user how to produce one and bail.
        print("ERROR: Vocabulary file not found at 'model/vocab.json'")
        print("Run: python train.py to train and save the tokenizer.")
        return
    print("✓ Loaded vocabulary successfully")
    create_app(tokenizer).launch()


if __name__ == "__main__":
    main()