sidharthg's picture
Update app.py
6d95635 verified
"""
Gradio app for Marathi BPE Tokenizer — redesigned UI with hover tooltips and smooth animations.
Usage: python app.py
"""
from typing import Tuple, List, Dict
import re
import gradio as gr
from tokenizer import MarathiBPETokenizer # type: ignore
# Accent palette for token chips
ACCENTS = [
"#1FB6FF", # azure
"#00D4B8", # teal
"#FFB86B", # amber
"#FF6B6B", # coral
"#A78BFA", # violet
"#FFD166", # yellow
"#8ED1FC", # light sky
"#6CE0B6", # mint
]
def _token_text(tokenizer: MarathiBPETokenizer, tid: int) -> str:
"""Resolve token id to readable text."""
try:
if hasattr(tokenizer, "decode"):
out = tokenizer.decode([tid])
if out:
return out
except Exception:
pass
if isinstance(getattr(tokenizer, "id_to_token", None), dict):
return tokenizer.id_to_token.get(tid, f"<{tid}>")
vocab = getattr(tokenizer, "vocab", None)
if isinstance(vocab, dict):
if tid in vocab:
return vocab[tid]
for k, v in vocab.items():
if v == tid:
return k
return f"<{tid}>"
def tokenize_and_visualize(text: str, tokenizer: MarathiBPETokenizer) -> Tuple[str, str, str]:
"""Return (visual_html, count_card_html, token_ids_table_html)."""
if not text or not text.strip():
placeholder = (
"<div style='color:#9CA3AF; font-size:15px; padding:12px;'>"
"Enter Marathi text and click Analyze.</div>"
)
return placeholder, "<div style='color:#9CA3AF;'>Token count will appear here</div>", placeholder
try:
token_ids: List[int] = tokenizer.encode(text)
except Exception:
token_ids = []
for part in text.split():
try:
token_ids.extend(tokenizer.encode(part))
except Exception:
continue
tid_to_color: Dict[int, str] = {}
unique_tids: List[int] = []
for tid in token_ids:
if tid not in tid_to_color:
tid_to_color[tid] = ACCENTS[len(unique_tids) % len(ACCENTS)]
unique_tids.append(tid)
vis_outer = [
'<div style="position:relative; padding:18px; border-radius:12px; background:linear-gradient(180deg,#063b66 0%,#0a2b48 100%);'
'color:#F8FAFC; font-family:Inter, \'Noto Sans Devanagari\', Arial, sans-serif; font-size:18px; line-height:2;">'
]
pattern = getattr(tokenizer, "pattern", r"\S+")
chunks = re.findall(pattern, text)
token_idx = 0
token_rows = []
for chunk in chunks:
if hasattr(tokenizer, "_apply_bpe"):
try:
chunk_tids = tokenizer._apply_bpe(chunk)
except Exception:
chunk_tids = tokenizer.encode(chunk) if hasattr(tokenizer, "encode") else []
else:
try:
chunk_tids = tokenizer.encode(chunk)
except Exception:
chunk_tids = []
for tid in chunk_tids:
token_text = _token_text(tokenizer, tid)
color = tid_to_color.get(tid, ACCENTS[0])
token_rows.append((token_idx, tid, token_text, color))
# Each chip has data attributes for JS tooltip
vis_outer.append(
f'<span class="token-chip" data-idx="{token_idx}" '
f'data-tid="{tid}" data-text="{token_text}" '
f'style="background:{color}; color:#fff; padding:8px 12px; margin:6px 6px 6px 0; '
f'border-radius:10px; display:inline-block; font-weight:600; cursor:pointer; '
f'box-shadow:0 4px 12px rgba(3,12,26,0.25); text-shadow:0 1px 2px rgba(0,0,0,0.25);">'
f'{token_text}</span>'
)
token_idx += 1
vis_outer.append("</div>")
visual_html = "".join(vis_outer)
count_html = (
'<div style="padding:14px; border-radius:10px; background:linear-gradient(180deg,#f8fbff 0%,#eaf3ff 100%);'
'color:#0b2540; text-align:center; font-family:Inter, Arial, sans-serif;">'
f'<div style="font-size:28px; font-weight:700;">{len(token_ids)}</div>'
f'<div style="color:#567096; margin-top:6px;">Total tokens • {len(unique_tids)} unique</div>'
"</div>"
)
table_parts = [
'<div style="padding:12px; border-radius:10px; background:#083E8C; color:#FFFFFF; max-height:420px; overflow:auto;">',
'<table style="width:100%; border-collapse:collapse; font-family:Menlo, Monaco, monospace; font-size:13px;">',
'<thead><tr style="text-align:left;"><th style="padding:8px 10px;">Idx</th>'
'<th style="padding:8px 10px;">Token ID</th>'
'<th style="padding:8px 10px;">Token</th>'
'<th style="padding:8px 10px;">Color</th></tr></thead>',
"<tbody>"
]
for idx, tid, ttext, color in token_rows:
table_parts.append(
'<tr style="border-bottom:1px solid rgba(255,255,255,0.05);">'
f'<td style="padding:8px 10px; color:#C9D6E6;">{idx}</td>'
f'<td style="padding:8px 10px; font-weight:700; color:#FFFFFF;">{tid}</td>'
f'<td style="padding:8px 10px; color:#FFFFFF;">{ttext!r}</td>'
f'<td style="padding:8px 10px;"><span style="display:inline-block; background:{color}; '
f'padding:6px 14px; border-radius:8px; box-shadow:0 6px 14px rgba(3,12,26,0.28);"></span></td>'
"</tr>"
)
table_parts.extend(["</tbody></table></div>"])
token_ids_html = "".join(table_parts)
return visual_html, count_html, token_ids_html
def create_app(tokenizer: MarathiBPETokenizer) -> gr.Blocks:
"""Build Gradio Blocks UI with refined styling, hover animation, and tooltips."""
css = """
/* Force light theme override on Hugging Face Spaces */
html, body, .gradio-container {
background: #F8FBFF !important;
color: #0B2540 !important;
}
/* Optional: reset the dark HuggingFace container styles */
body, .main, .app, #root {
background: #F8FBFF !important;
}
:root{
--panel-bg:#0b2540;
--tile-azure:#083E8C;
--muted-text:#9CA3AF;
--header-grey:#374151;
}
body { background: linear-gradient(180deg,#061328 0%, #071627 100%); font-family:Inter, "Noto Sans Devanagari", Arial, sans-serif; }
#header { margin-bottom:14px; }
.app-title { color: var(--header-grey); font-weight:700; font-size:20px; margin:0; }
.app-sub { color: var(--muted-text); margin:4px 0 0 0; }
/* ✅ Token chip hover + tooltip */
.token-chip {
position: relative;
z-index: 1;
transition: all 0.25s ease-out;
cursor: pointer;
}
.token-chip:hover {
transform: translateY(-8px);
z-index: 100;
box-shadow: 0 24px 48px rgba(3,12,26,0.45) !important;
}
.tooltip {
position: fixed;
background: rgba(0,0,0,0.8);
color: #fff;
padding: 8px 12px;
border-radius: 8px;
font-size: 13px;
font-family: Menlo, monospace;
pointer-events: none;
opacity: 0;
transition: opacity 0.15s ease;
z-index: 9999;
max-width: 260px;
white-space: pre-wrap;
}
.gr-examples, .gr-examples td, .gr-examples th { background: transparent !important; color: #E6EEF7 !important; }
.gradio-tooltip { color:#081026 !important; background:#F3F7FB !important; }
.gr-row { gap:18px; }
.muted { color: var(--muted-text); font-size:13px; }
"""
js = """
<script>
document.addEventListener("mouseover", function(e) {
const tooltip = document.getElementById("token-tooltip");
const chip = e.target.closest(".token-chip");
if (!tooltip || !chip) return;
const idx = chip.dataset.idx;
const tid = chip.dataset.tid;
const text = chip.dataset.text;
tooltip.innerHTML = `<b>Token #${idx}</b><br>ID: ${tid}<br>Text: ${text}`;
tooltip.style.opacity = 1;
});
document.addEventListener("mousemove", function(e) {
const tooltip = document.getElementById("token-tooltip");
if (!tooltip || tooltip.style.opacity === "0") return;
tooltip.style.left = e.pageX + 12 + "px";
tooltip.style.top = e.pageY + 12 + "px";
});
document.addEventListener("mouseout", function(e) {
const chip = e.target.closest(".token-chip");
const tooltip = document.getElementById("token-tooltip");
if (tooltip && chip) {
tooltip.style.opacity = 0;
}
});
</script>
"""
with gr.Blocks(css=css) as demo:
gr.HTML('<div id="token-tooltip" class="tooltip"></div>') # Global tooltip container
gr.HTML(js) # Inject JS handlers
with gr.Row(elem_id="header"):
with gr.Column(scale=1):
gr.Markdown(
"<div><h1 class='app-title'>Marathi BPE Tokenizer</h1>"
"<div class='app-sub'>Enterprise token inspection & visualization</div></div>"
)
with gr.Row():
with gr.Column(scale=1):
input_text = gr.Textbox(
label="Input Text",
placeholder="नमस्ते, मी एक मराठी टोकनायझर आहे",
lines=6
)
analyze_btn = gr.Button("Analyze", variant="primary")
gr.Markdown("<div class='muted' style='margin-top:8px;'>Sample inputs</div>")
gr.Examples(
examples=[
["नमस्ते, मी एक मराठी टोकनायझर आहे."],
["क्रिकेट - लहान मुले बागेत क्रिकेट खेळत आहेत."],
["गाडी हळूहू चालवा किंवा आपल्याला अपघात होऊ शकतो."],
["सचिन तेंडुलकर हा आमचा अव्वल क्रिकेटपटू आहे."],
],
inputs=[input_text],
)
with gr.Column(scale=1):
visual_out = gr.HTML("<div class='muted'>Token visualization will appear here</div>")
count_out = gr.HTML("<div class='muted'>Token count will appear here</div>")
table_out = gr.HTML("<div class='muted'>Token details will appear here</div>")
def _process(text: str):
return tokenize_and_visualize(text or "", tokenizer)
analyze_btn.click(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out])
input_text.submit(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out])
return demo
def main():
tokenizer = MarathiBPETokenizer()
try:
tokenizer.load_vocab("model/vocab.json")
print("✓ Loaded vocabulary successfully")
except FileNotFoundError:
print("ERROR: Vocabulary file not found at 'model/vocab.json'")
print("Run: python train.py to train and save the tokenizer.")
return
demo = create_app(tokenizer)
demo.launch()
if __name__ == "__main__":
main()