# NOTE(review): the lines below are Hugging Face Spaces file-viewer artifacts
# (page chrome, commit hashes, line-number gutter) captured by the scrape —
# not part of the source. Commented out so the file parses as Python.
# Spaces: / Sleeping / Sleeping / File size: 6,504 Bytes
# 338aa93 61346c0 338aa93 193251c 338aa93
import sys
from pathlib import Path
import html
import gradio as gr
from transformers import PreTrainedTokenizerFast
def load_tokenizer():
    """Load the Kannada BPE tokenizer from disk.

    Returns:
        A ``(tokenizer, error)`` pair. On success ``error`` is ``None``;
        when the tokenizer file is missing, ``tokenizer`` is ``None`` and
        ``error`` carries a human-readable message.
    """
    path = Path("kn_bpe_8000.json")
    if not path.exists():
        return None, f"Tokenizer not found at {path}. Run build_kn_bpe.py first."
    # Special tokens must match those used when the tokenizer was trained.
    special = {
        "unk_token": "[UNK]",
        "pad_token": "[PAD]",
        "bos_token": "[BOS]",
        "eos_token": "[EOS]",
        "sep_token": "[SEP]",
        "mask_token": "[MASK]",
    }
    return PreTrainedTokenizerFast(tokenizer_file=str(path), **special), None
tokenizer, load_err = load_tokenizer()
def _colored_tokens_html(tokens):
if not tokens:
return ""
palette = [
"#e57373", "#64b5f6", "#81c784", "#ffd54f", "#ba68c8",
"#4db6ac", "#ff8a65", "#9575cd", "#4fc3f7", "#aed581",
]
spans = []
for i, t in enumerate(tokens):
color = palette[i % len(palette)]
safe = html.escape(t)
spans.append(
f'<span style="background:{color};padding:2px 4px;border-radius:4px;margin-right:2px;display:inline-block">{safe}</span>'
)
return "".join(spans)
def tokenize_single(text: str):
    """Tokenize one string with the module-level tokenizer.

    Returns a 6-tuple: (tokens, input_ids, round-trip decoded text,
    stats dict, colored-token HTML, error message). The error message
    is "" on success.
    """
    # Bug fix: the original error path returned six empty strings, while the
    # success/empty paths return (list, list, str, dict, str, str) — output
    # slots bound per-position would receive the wrong type on failure.
    empty_stats = {"chars": 0, "tokens": 0, "chars_per_token": 0.0}
    if load_err:
        return [], [], "", empty_stats, "", load_err
    if not text:
        return [], [], "", empty_stats, "", ""
    enc = tokenizer(text)
    tokens = enc.tokens()
    ids = enc["input_ids"]
    decoded = tokenizer.decode(ids, skip_special_tokens=True)
    chars = len(text)
    tok_count = len(ids)
    # max(..., 1) guards against division by zero when no tokens are produced.
    cpt = round(chars / max(tok_count, 1), 3)
    stats = {"chars": chars, "tokens": tok_count, "chars_per_token": cpt}
    colored_html = _colored_tokens_html(tokens)
    return tokens, ids, decoded, stats, colored_html, ""
def tokenize_batch(multiline_text: str):
    """Tokenize each non-blank line of *multiline_text* as one padded batch.

    Returns (preview, error_message): a compact JSON-compatible preview of
    the batch and "" on success.
    """
    # Bug fix: the original error path returned a bare "" for the preview
    # slot, while every other path returns a dict; return a dict here too so
    # the output type is consistent.
    if load_err:
        return {}, load_err
    lines = [ln for ln in multiline_text.splitlines() if ln.strip()]
    if not lines:
        return {"input_ids": [], "attention_mask": []}, ""
    batch = tokenizer(lines, padding=True, truncation=True, max_length=256)
    # Return a compact preview rather than the full (potentially large) batch.
    preview = {
        "num_examples": len(lines),
        "seq_len": len(batch["input_ids"][0]) if batch["input_ids"] else 0,
        "sample_input_ids": batch["input_ids"][0][:32] if batch["input_ids"] else [],
        "sample_attention_mask": batch["attention_mask"][0][:32] if batch["attention_mask"] else [],
    }
    return preview, ""
def _colored_ids_html(ids):
if not ids:
return ""
palette = [
"#e57373", "#64b5f6", "#81c784", "#ffd54f", "#ba68c8",
"#4db6ac", "#ff8a65", "#9575cd", "#4fc3f7", "#aed581",
]
spans = []
for i, idv in enumerate(ids):
color = palette[i % len(palette)]
safe = html.escape(str(idv))
spans.append(
f'<span style="background:{color};padding:2px 6px;border-radius:4px;margin-right:2px;display:inline-block">{safe}</span>'
)
return "".join(spans)
def playground_render(text: str, view: str):
    """Compute the playground stats and token visualization for *text*.

    *view* selects the visualization: "Text" shows colored token strings,
    "Token IDs" shows colored ids, anything else hides it.

    Returns (token_count, char_count, chars_per_token, body_html) — all
    strings ready for the HTML widgets.
    """
    if load_err:
        return "0", "0", "0.0", "<em>Tokenizer not loaded</em>"
    text = text or ""
    encoding = tokenizer(text)
    ids = encoding["input_ids"]
    n_chars = len(text)
    n_tokens = len(ids)
    # max(..., 1) guards against division by zero on empty input.
    compression = round(n_chars / max(n_tokens, 1), 3)
    if view == "Text":
        body = _colored_tokens_html(encoding.tokens())
    elif view == "Token IDs":
        body = _colored_ids_html(ids)
    else:  # "Hide" (or anything unrecognized)
        body = ""
    return str(n_tokens), str(n_chars), str(compression), body
# --- Gradio UI -------------------------------------------------------------
# Layout is defined by statement order inside the Blocks context, so the
# components below appear on the page in creation order.
with gr.Blocks(title="The Tokenizer Playground") as app:
    gr.Markdown("# The Tokenizer Playground")
    gr.Markdown("Experiment with different tokenizers (running locally in your browser).")
    if load_err:
        # Surface the tokenizer-load failure directly on the page.
        gr.Markdown(f"**Error:** {load_err}")
    inp = gr.Textbox(lines=10, placeholder="Enter Kannada text here…", show_label=False)
    # Stats row: three big counters (tokens / characters / chars-per-token).
    with gr.Row():
        tokens_count = gr.HTML("<div style='text-align:center'><div>TOKENS</div><div style='font-size:36px;font-weight:700'>0</div></div>")
        chars_count = gr.HTML("<div style='text-align:center'><div>CHARACTERS</div><div style='font-size:36px;font-weight:700'>0</div></div>")
        ratio_count = gr.HTML("<div style='text-align:center'><div>COMPRESSION</div><div style='font-size:36px;font-weight:700'>0.0</div></div>")
    view = gr.Radio(["Text", "Token IDs", "Hide"], value="Text", label=None)
    viz = gr.HTML("")
    def _update(text, mode):
        """Re-render the three counters and the visualization panel.

        Wraps playground_render's plain strings in the same HTML shells the
        counters were initialized with above.
        """
        tks, chs, ratio, body = playground_render(text, mode)
        tokens_html = f"<div style='text-align:center'><div>TOKENS</div><div style='font-size:36px;font-weight:700'>{tks}</div></div>"
        chars_html = f"<div style='text-align:center'><div>CHARACTERS</div><div style='font-size:36px;font-weight:700'>{chs}</div></div>"
        ratio_html = f"<div style='text-align:center'><div>COMPRESSION</div><div style='font-size:36px;font-weight:700'>{ratio}</div></div>"
        return tokens_html, chars_html, ratio_html, body
    # Re-render on every input keystroke and on view-mode change.
    inp.change(_update, inputs=[inp, view], outputs=[tokens_count, chars_count, ratio_count, viz])
    view.change(_update, inputs=[inp, view], outputs=[tokens_count, chars_count, ratio_count, viz])
    # Examples (Kannada sentences, each paired with the "Text" view mode)
    examples = [
        ["೧೯೫೦ರಲ್ಲಿ ಸ್ವಾಮಿ ಭಾರತಕ್ಕೆ ಹಿಂದಿರುಗಿದರು.", "Text"],
        ["‘ಸ್ವಾಮಿಯಾನ’ ಪುಸ್ತಕದಲ್ಲಿ ಹಲವಾರು ಕ್ಷೇತ್ರದ ಗಣ್ಯರು ಸ್ವಾಮಿಯವರ ಬಗ್ಗೆ ಬರೆದ ಲೇಖನಗಳಿವೆ.", "Text"],
        ["ವೇದ, ವೇದಾಂತ, ಮೀಮಾಂಸೆ, ಶಾಸ್ತ್ರ, ಆಗಮಶಾಸ್ತ್ರ, ಜ್ಯೋತಿಷ್ಯಶಾಸ್ತ್ರ, ಶಿಲ್ಪಶಾಸ್ತ್ರ, ಸಂಗೀತ ಶಾಸ್ತ್ರ, ಹಾಗೂ ಆಯುರ್ವೇದ ಶಾಸ್ತ್ರ ಮುಂತಾದ ಶಾಸ್ತ್ರಗಳನ್ನು ಆಳವಾಗಿ ಅಭ್ಯಸಿಸಿ ಅವುಗಳಲ್ಲಿ ಮೇರು-ಪಾಂಡಿತ್ಯವನ್ನು ಸಂಪಾದಿಸಿದ್ದರು.", "Text"],
    ]
    gr.Examples(
        examples=examples,
        inputs=[inp, view],
        outputs=[tokens_count, chars_count, ratio_count, viz],
        fn=_update,
        cache_examples=False,
    )
if __name__ == "__main__":
    try:
        # Bind on all interfaces at the conventional Spaces port.
        # NOTE(review): the broad except presumably guards against an
        # unsupported kwarg (e.g. ssr_mode on older Gradio) or a busy
        # port — confirm which failure this was added for.
        app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
    except Exception:
        # Fallback: default launch
        app.launch()