# Hugging Face Space: "The Tokenizer Playground" — Kannada BPE tokenizer demo.
| import sys | |
| from pathlib import Path | |
| import html | |
| import gradio as gr | |
| from transformers import PreTrainedTokenizerFast | |
def load_tokenizer():
    """Load the Kannada BPE tokenizer from disk.

    Returns:
        (tokenizer, error): a ``PreTrainedTokenizerFast`` and ``None`` on
        success, or ``None`` plus a human-readable message when the
        tokenizer file has not been built yet.
    """
    path = Path("kn_bpe_8000.json")
    if not path.exists():
        return None, f"Tokenizer not found at {path}. Run build_kn_bpe.py first."
    special_tokens = {
        "unk_token": "[UNK]",
        "pad_token": "[PAD]",
        "bos_token": "[BOS]",
        "eos_token": "[EOS]",
        "sep_token": "[SEP]",
        "mask_token": "[MASK]",
    }
    return PreTrainedTokenizerFast(tokenizer_file=str(path), **special_tokens), None
# Load once at import time; `load_err` is a non-empty message when the
# tokenizer file is missing, and every handler checks it before tokenizing.
tokenizer, load_err = load_tokenizer()
| def _colored_tokens_html(tokens): | |
| if not tokens: | |
| return "" | |
| palette = [ | |
| "#e57373", "#64b5f6", "#81c784", "#ffd54f", "#ba68c8", | |
| "#4db6ac", "#ff8a65", "#9575cd", "#4fc3f7", "#aed581", | |
| ] | |
| spans = [] | |
| for i, t in enumerate(tokens): | |
| color = palette[i % len(palette)] | |
| safe = html.escape(t) | |
| spans.append( | |
| f'<span style="background:{color};padding:2px 4px;border-radius:4px;margin-right:2px;display:inline-block">{safe}</span>' | |
| ) | |
| return "".join(spans) | |
def tokenize_single(text: str):
    """Tokenize one string and return display-ready pieces.

    Returns:
        A 6-tuple ``(tokens, input_ids, decoded, stats, colored_html, error)``.
        ``error`` is non-empty only when the tokenizer failed to load.

    Fix: the ``load_err`` branch previously returned six plain strings, while
    the other branches return lists and a stats dict — downstream consumers
    expecting those types would break. The error branch now returns empty
    values of the same types as the success path.
    """
    empty_stats = {"chars": 0, "tokens": 0, "chars_per_token": 0.0}
    if load_err:
        return [], [], "", empty_stats, "", load_err
    if not text:
        return [], [], "", empty_stats, "", ""
    enc = tokenizer(text)
    tokens = enc.tokens()
    ids = enc["input_ids"]
    decoded = tokenizer.decode(ids, skip_special_tokens=True)
    chars = len(text)
    tok_count = len(ids)
    # max(..., 1) guards against division by zero for token-less inputs.
    cpt = round(chars / max(tok_count, 1), 3)
    stats = {"chars": chars, "tokens": tok_count, "chars_per_token": cpt}
    return tokens, ids, decoded, stats, _colored_tokens_html(tokens), ""
def tokenize_batch(multiline_text: str):
    """Tokenize each non-blank input line as a batch; return a compact preview.

    Returns ``(preview_dict, "")`` on success or ``("", error)`` when the
    tokenizer failed to load.
    """
    if load_err:
        return "", load_err
    lines = [line for line in multiline_text.splitlines() if line.strip()]
    if not lines:
        return {"input_ids": [], "attention_mask": []}, ""
    batch = tokenizer(lines, padding=True, truncation=True, max_length=256)
    ids = batch["input_ids"]
    mask = batch["attention_mask"]
    # Only surface the first example, truncated to 32 positions,
    # to keep the UI payload small.
    preview = {
        "num_examples": len(lines),
        "seq_len": len(ids[0]) if ids else 0,
        "sample_input_ids": ids[0][:32] if ids else [],
        "sample_attention_mask": mask[0][:32] if mask else [],
    }
    return preview, ""
| def _colored_ids_html(ids): | |
| if not ids: | |
| return "" | |
| palette = [ | |
| "#e57373", "#64b5f6", "#81c784", "#ffd54f", "#ba68c8", | |
| "#4db6ac", "#ff8a65", "#9575cd", "#4fc3f7", "#aed581", | |
| ] | |
| spans = [] | |
| for i, idv in enumerate(ids): | |
| color = palette[i % len(palette)] | |
| safe = html.escape(str(idv)) | |
| spans.append( | |
| f'<span style="background:{color};padding:2px 6px;border-radius:4px;margin-right:2px;display:inline-block">{safe}</span>' | |
| ) | |
| return "".join(spans) | |
def playground_render(text: str, view: str):
    """Compute the stats and visualization for the playground UI.

    Returns ``(token_count, char_count, chars_per_token, body_html)`` — the
    first three as strings ready for the stat cards.
    """
    if load_err:
        return "0", "0", "0.0", "<em>Tokenizer not loaded</em>"
    text = text or ""
    encoding = tokenizer(text)
    tokens = encoding.tokens()
    ids = encoding["input_ids"]
    n_chars = len(text)
    n_tokens = len(ids)
    # max(..., 1) avoids dividing by zero on empty input.
    ratio = round(n_chars / max(n_tokens, 1), 3)
    if view == "Token IDs":
        body = _colored_ids_html(ids)
    elif view == "Text":
        body = _colored_tokens_html(tokens)
    else:
        # "Hide" (or any unknown view) renders nothing.
        body = ""
    return str(n_tokens), str(n_chars), str(ratio), body
# --- Gradio UI, built at import time -------------------------------------
with gr.Blocks(title="The Tokenizer Playground") as app:
    gr.Markdown("# The Tokenizer Playground")
    gr.Markdown("Experiment with different tokenizers (running locally in your browser).")
    # Surface a tokenizer-load failure prominently instead of failing later.
    if load_err:
        gr.Markdown(f"**Error:** {load_err}")
    inp = gr.Textbox(lines=10, placeholder="Enter Kannada text here…", show_label=False)
    # Stats row: three HTML "cards" (tokens / characters / compression),
    # all re-rendered together by _update.
    with gr.Row():
        tokens_count = gr.HTML("<div style='text-align:center'><div>TOKENS</div><div style='font-size:36px;font-weight:700'>0</div></div>")
        chars_count = gr.HTML("<div style='text-align:center'><div>CHARACTERS</div><div style='font-size:36px;font-weight:700'>0</div></div>")
        ratio_count = gr.HTML("<div style='text-align:center'><div>COMPRESSION</div><div style='font-size:36px;font-weight:700'>0.0</div></div>")
    view = gr.Radio(["Text", "Token IDs", "Hide"], value="Text", label=None)
    viz = gr.HTML("")
    def _update(text, mode):
        # Recompute the stat cards and the token visualization for the
        # current input text and view mode.
        tks, chs, ratio, body = playground_render(text, mode)
        tokens_html = f"<div style='text-align:center'><div>TOKENS</div><div style='font-size:36px;font-weight:700'>{tks}</div></div>"
        chars_html = f"<div style='text-align:center'><div>CHARACTERS</div><div style='font-size:36px;font-weight:700'>{chs}</div></div>"
        ratio_html = f"<div style='text-align:center'><div>COMPRESSION</div><div style='font-size:36px;font-weight:700'>{ratio}</div></div>"
        return tokens_html, chars_html, ratio_html, body
    # Re-render on both text edits and view-mode toggles.
    inp.change(_update, inputs=[inp, view], outputs=[tokens_count, chars_count, ratio_count, viz])
    view.change(_update, inputs=[inp, view], outputs=[tokens_count, chars_count, ratio_count, viz])
    # Example Kannada sentences; clicking one fills the input and view mode.
    examples = [
        ["೧೯೫೦ರಲ್ಲಿ ಸ್ವಾಮಿ ಭಾರತಕ್ಕೆ ಹಿಂದಿರುಗಿದರು.", "Text"],
        ["‘ಸ್ವಾಮಿಯಾನ’ ಪುಸ್ತಕದಲ್ಲಿ ಹಲವಾರು ಕ್ಷೇತ್ರದ ಗಣ್ಯರು ಸ್ವಾಮಿಯವರ ಬಗ್ಗೆ ಬರೆದ ಲೇಖನಗಳಿವೆ.", "Text"],
        ["ವೇದ, ವೇದಾಂತ, ಮೀಮಾಂಸೆ, ಶಾಸ್ತ್ರ, ಆಗಮಶಾಸ್ತ್ರ, ಜ್ಯೋತಿಷ್ಯಶಾಸ್ತ್ರ, ಶಿಲ್ಪಶಾಸ್ತ್ರ, ಸಂಗೀತ ಶಾಸ್ತ್ರ, ಹಾಗೂ ಆಯುರ್ವೇದ ಶಾಸ್ತ್ರ ಮುಂತಾದ ಶಾಸ್ತ್ರಗಳನ್ನು ಆಳವಾಗಿ ಅಭ್ಯಸಿಸಿ ಅವುಗಳಲ್ಲಿ ಮೇರು-ಪಾಂಡಿತ್ಯವನ್ನು ಸಂಪಾದಿಸಿದ್ದರು.", "Text"],
    ]
    gr.Examples(
        examples=examples,
        inputs=[inp, view],
        outputs=[tokens_count, chars_count, ratio_count, viz],
        fn=_update,
        # Examples are cheap to compute; no need to cache them at build time.
        cache_examples=False,
    )
if __name__ == "__main__":
    try:
        # Bind on all interfaces for container / HF Space deployments.
        app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
    except Exception as exc:
        # Deliberate best-effort fallback (e.g. port 7860 already in use, or
        # an older Gradio without `ssr_mode`) — but report why the primary
        # launch failed instead of swallowing the error silently.
        print(f"Primary launch failed ({exc!r}); retrying with defaults.", file=sys.stderr)
        app.launch()