Spaces:

sidharthg
/

marathi-bpe-tokenizer

Sleeping

File size: 11,303 Bytes

"""
Gradio app for Marathi BPE Tokenizer — redesigned UI with hover tooltips and smooth animations.
Usage: python app.py
"""

from typing import Tuple, List, Dict
import re
import gradio as gr

from tokenizer import MarathiBPETokenizer  # type: ignore

# Accent palette for token chips
ACCENTS = [
    "#1FB6FF",  # azure
    "#00D4B8",  # teal
    "#FFB86B",  # amber
    "#FF6B6B",  # coral
    "#A78BFA",  # violet
    "#FFD166",  # yellow
    "#8ED1FC",  # light sky
    "#6CE0B6",  # mint
]


def _token_text(tokenizer: MarathiBPETokenizer, tid: int) -> str:
    """Resolve token id to readable text."""
    try:
        if hasattr(tokenizer, "decode"):
            out = tokenizer.decode([tid])
            if out:
                return out
    except Exception:
        pass

    if isinstance(getattr(tokenizer, "id_to_token", None), dict):
        return tokenizer.id_to_token.get(tid, f"<{tid}>")

    vocab = getattr(tokenizer, "vocab", None)
    if isinstance(vocab, dict):
        if tid in vocab:
            return vocab[tid]
        for k, v in vocab.items():
            if v == tid:
                return k

    return f"<{tid}>"


def tokenize_and_visualize(text: str, tokenizer: MarathiBPETokenizer) -> Tuple[str, str, str]:
    """Return (visual_html, count_card_html, token_ids_table_html)."""
    if not text or not text.strip():
        placeholder = (
            "<div style='color:#9CA3AF; font-size:15px; padding:12px;'>"
            "Enter Marathi text and click Analyze.</div>"
        )
        return placeholder, "<div style='color:#9CA3AF;'>Token count will appear here</div>", placeholder

    try:
        token_ids: List[int] = tokenizer.encode(text)
    except Exception:
        token_ids = []
        for part in text.split():
            try:
                token_ids.extend(tokenizer.encode(part))
            except Exception:
                continue

    tid_to_color: Dict[int, str] = {}
    unique_tids: List[int] = []
    for tid in token_ids:
        if tid not in tid_to_color:
            tid_to_color[tid] = ACCENTS[len(unique_tids) % len(ACCENTS)]
            unique_tids.append(tid)

    vis_outer = [
        '<div style="position:relative; padding:18px; border-radius:12px; background:linear-gradient(180deg,#063b66 0%,#0a2b48 100%);'
        'color:#F8FAFC; font-family:Inter, \'Noto Sans Devanagari\', Arial, sans-serif; font-size:18px; line-height:2;">'
    ]

    pattern = getattr(tokenizer, "pattern", r"\S+")
    chunks = re.findall(pattern, text)
    token_idx = 0
    token_rows = []

    for chunk in chunks:
        if hasattr(tokenizer, "_apply_bpe"):
            try:
                chunk_tids = tokenizer._apply_bpe(chunk)
            except Exception:
                chunk_tids = tokenizer.encode(chunk) if hasattr(tokenizer, "encode") else []
        else:
            try:
                chunk_tids = tokenizer.encode(chunk)
            except Exception:
                chunk_tids = []

        for tid in chunk_tids:
            token_text = _token_text(tokenizer, tid)
            color = tid_to_color.get(tid, ACCENTS[0])
            token_rows.append((token_idx, tid, token_text, color))

            # Each chip has data attributes for JS tooltip
            vis_outer.append(
                f'<span class="token-chip" data-idx="{token_idx}" '
                f'data-tid="{tid}" data-text="{token_text}" '
                f'style="background:{color}; color:#fff; padding:8px 12px; margin:6px 6px 6px 0; '
                f'border-radius:10px; display:inline-block; font-weight:600; cursor:pointer; '
                f'box-shadow:0 4px 12px rgba(3,12,26,0.25); text-shadow:0 1px 2px rgba(0,0,0,0.25);">'
                f'{token_text}</span>'
            )
            token_idx += 1

    vis_outer.append("</div>")
    visual_html = "".join(vis_outer)

    count_html = (
        '<div style="padding:14px; border-radius:10px; background:linear-gradient(180deg,#f8fbff 0%,#eaf3ff 100%);'
        'color:#0b2540; text-align:center; font-family:Inter, Arial, sans-serif;">'
        f'<div style="font-size:28px; font-weight:700;">{len(token_ids)}</div>'
        f'<div style="color:#567096; margin-top:6px;">Total tokens • {len(unique_tids)} unique</div>'
        "</div>"
    )

    table_parts = [
        '<div style="padding:12px; border-radius:10px; background:#083E8C; color:#FFFFFF; max-height:420px; overflow:auto;">',
        '<table style="width:100%; border-collapse:collapse; font-family:Menlo, Monaco, monospace; font-size:13px;">',
        '<thead><tr style="text-align:left;"><th style="padding:8px 10px;">Idx</th>'
        '<th style="padding:8px 10px;">Token ID</th>'
        '<th style="padding:8px 10px;">Token</th>'
        '<th style="padding:8px 10px;">Color</th></tr></thead>',
        "<tbody>"
    ]

    for idx, tid, ttext, color in token_rows:
        table_parts.append(
            '<tr style="border-bottom:1px solid rgba(255,255,255,0.05);">'
            f'<td style="padding:8px 10px; color:#C9D6E6;">{idx}</td>'
            f'<td style="padding:8px 10px; font-weight:700; color:#FFFFFF;">{tid}</td>'
            f'<td style="padding:8px 10px; color:#FFFFFF;">{ttext!r}</td>'
            f'<td style="padding:8px 10px;"><span style="display:inline-block; background:{color}; '
            f'padding:6px 14px; border-radius:8px; box-shadow:0 6px 14px rgba(3,12,26,0.28);"></span></td>'
            "</tr>"
        )

    table_parts.extend(["</tbody></table></div>"])
    token_ids_html = "".join(table_parts)

    return visual_html, count_html, token_ids_html


def create_app(tokenizer: MarathiBPETokenizer) -> gr.Blocks:
    """Build Gradio Blocks UI with refined styling, hover animation, and tooltips."""
    css = """
    /* Force light theme override on Hugging Face Spaces */
    html, body, .gradio-container {
        background: #F8FBFF !important;
        color: #0B2540 !important;
    }

    /* Optional: reset the dark HuggingFace container styles */
    body, .main, .app, #root {
        background: #F8FBFF !important;
    }

    :root{
      --panel-bg:#0b2540;
      --tile-azure:#083E8C;
      --muted-text:#9CA3AF;
      --header-grey:#374151;
    }

    body { background: linear-gradient(180deg,#061328 0%, #071627 100%); font-family:Inter, "Noto Sans Devanagari", Arial, sans-serif; }

    #header { margin-bottom:14px; }
    .app-title { color: var(--header-grey); font-weight:700; font-size:20px; margin:0; }
    .app-sub { color: var(--muted-text); margin:4px 0 0 0; }

    /* ✅ Token chip hover + tooltip */
    .token-chip { 
      position: relative;
      z-index: 1;
      transition: all 0.25s ease-out;
      cursor: pointer;
    }
    .token-chip:hover { 
      transform: translateY(-8px);
      z-index: 100;
      box-shadow: 0 24px 48px rgba(3,12,26,0.45) !important;
    }

    .tooltip {
      position: fixed;
      background: rgba(0,0,0,0.8);
      color: #fff;
      padding: 8px 12px;
      border-radius: 8px;
      font-size: 13px;
      font-family: Menlo, monospace;
      pointer-events: none;
      opacity: 0;
      transition: opacity 0.15s ease;
      z-index: 9999;
      max-width: 260px;
      white-space: pre-wrap;
    }

    .gr-examples, .gr-examples td, .gr-examples th { background: transparent !important; color: #E6EEF7 !important; }
    .gradio-tooltip { color:#081026 !important; background:#F3F7FB !important; }
    .gr-row { gap:18px; }
    .muted { color: var(--muted-text); font-size:13px; }
    """

    js = """
    <script>
    document.addEventListener("mouseover", function(e) {
        const tooltip = document.getElementById("token-tooltip");
        const chip = e.target.closest(".token-chip");
        if (!tooltip || !chip) return;
        const idx = chip.dataset.idx;
        const tid = chip.dataset.tid;
        const text = chip.dataset.text;
        tooltip.innerHTML = `<b>Token #${idx}</b><br>ID: ${tid}<br>Text: ${text}`;
        tooltip.style.opacity = 1;
    });

    document.addEventListener("mousemove", function(e) {
        const tooltip = document.getElementById("token-tooltip");
        if (!tooltip || tooltip.style.opacity === "0") return;
        tooltip.style.left = e.pageX + 12 + "px";
        tooltip.style.top = e.pageY + 12 + "px";
    });

    document.addEventListener("mouseout", function(e) {
        const chip = e.target.closest(".token-chip");
        const tooltip = document.getElementById("token-tooltip");
        if (tooltip && chip) {
            tooltip.style.opacity = 0;
        }
    });
    </script>
    """

    with gr.Blocks(css=css) as demo:
        gr.HTML('<div id="token-tooltip" class="tooltip"></div>')  # Global tooltip container
        gr.HTML(js)  # Inject JS handlers

        with gr.Row(elem_id="header"):
            with gr.Column(scale=1):
                gr.Markdown(
                    "<div><h1 class='app-title'>Marathi BPE Tokenizer</h1>"
                    "<div class='app-sub'>Enterprise token inspection & visualization</div></div>"
                )

        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="Input Text",
                    placeholder="नमस्ते, मी एक मराठी टोकनायझर आहे",
                    lines=6
                )
                analyze_btn = gr.Button("Analyze", variant="primary")
                gr.Markdown("<div class='muted' style='margin-top:8px;'>Sample inputs</div>")
                gr.Examples(
                    examples=[
                        ["नमस्ते, मी एक मराठी टोकनायझर आहे."],
                        ["क्रिकेट - लहान मुले बागेत क्रिकेट खेळत आहेत."],
                        ["गाडी हळूहू चालवा किंवा आपल्याला अपघात होऊ शकतो."],
                        ["सचिन तेंडुलकर हा आमचा अव्वल क्रिकेटपटू आहे."],
                    ],
                    inputs=[input_text],
                )

            with gr.Column(scale=1):
                visual_out = gr.HTML("<div class='muted'>Token visualization will appear here</div>")
                count_out = gr.HTML("<div class='muted'>Token count will appear here</div>")
                table_out = gr.HTML("<div class='muted'>Token details will appear here</div>")

        def _process(text: str):
            return tokenize_and_visualize(text or "", tokenizer)

        analyze_btn.click(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out])
        input_text.submit(fn=_process, inputs=[input_text], outputs=[visual_out, count_out, table_out])

    return demo


def main():
    tokenizer = MarathiBPETokenizer()
    try:
        tokenizer.load_vocab("model/vocab.json")
        print("✓ Loaded vocabulary successfully")
    except FileNotFoundError:
        print("ERROR: Vocabulary file not found at 'model/vocab.json'")
        print("Run: python train.py to train and save the tokenizer.")
        return

    demo = create_app(tokenizer)
    demo.launch()


if __name__ == "__main__":
    main()