| """ |
| Tokenizer Inspector module. |
| |
| Provides tokenization utilities and a Gradio UI tab for inspecting how |
| different tokenizers handle input text. |
| """ |
|
|
| import html |
| import gradio as gr |
| from langdetect import detect, LangDetectException |
|
|
# Cached reference to transformers.AutoTokenizer; stays None until first use.
_AutoTokenizer = None


def _get_auto_tokenizer():
    """Return the ``transformers.AutoTokenizer`` class, importing it on demand.

    The import runs only on the first call; the class is then stored in the
    module-level ``_AutoTokenizer`` and returned directly afterwards.
    """
    global _AutoTokenizer
    if _AutoTokenizer is not None:
        return _AutoTokenizer

    from transformers import AutoTokenizer as hf_auto_tokenizer

    _AutoTokenizer = hf_auto_tokenizer
    return _AutoTokenizer
|
|
|
|
class _LazyAutoTokenizer:
    """Stand-in for ``AutoTokenizer`` that resolves the real class lazily.

    Any attribute not found on the proxy itself is looked up on the object
    returned by ``_get_auto_tokenizer()``, so ``transformers`` is imported
    only when an attribute is first requested.
    """

    def __getattr__(self, name):
        real_cls = _get_auto_tokenizer()
        return getattr(real_cls, name)


# Module-level proxy so callers can write ``AutoTokenizer.<attr>``.
AutoTokenizer = _LazyAutoTokenizer()
|
|
| from openrouter import call_openrouter |
|
|
| |
| |
| |
|
|
# Registry of selectable tokenizers: display name -> loader spec.
# A "tiktoken:<encoding>" value names a tiktoken encoding; any other value is
# handed to AutoTokenizer.from_pretrained() by get_tokenizer().
# NOTE(review): "gemma-2" and "command-r" point at repos of other model
# families (microsoft/phi-2, bigscience/bloom-560m) — confirm this is intended.
SUPPORTED_TOKENIZERS: dict[str, str] = {
    **{encoding: f"tiktoken:{encoding}" for encoding in ("o200k_base", "cl100k_base")},
    "llama-3": "NousResearch/Meta-Llama-3-8B",
    "mistral": "mistralai/Mistral-7B-v0.1",
    "qwen-2.5": "Qwen/Qwen2.5-7B",
    "gemma-2": "microsoft/phi-2",
    "command-r": "bigscience/bloom-560m",
    "gpt2": "gpt2",
}
|
|
|
|
class TiktokenAdapter:
    """Wrap a tiktoken encoding behind the HuggingFace-style tokenizer calls.

    Exposes ``encode``, ``decode`` and ``convert_ids_to_tokens`` so that
    tokenize_text(), fragmentation_ratio(), flag_oov_words() and
    render_tokens_html() can treat tiktoken-based and HF-based tokenizers
    the same way.
    """

    def __init__(self, encoding_name: str):
        """Load the tiktoken encoding called *encoding_name*."""
        # Imported here so tiktoken is only required once an adapter is built.
        import tiktoken

        self._enc = tiktoken.get_encoding(encoding_name)
        self._encoding_name = encoding_name

    def encode(self, text: str, add_special_tokens: bool = True) -> list[int]:
        """Return the token ids for *text*.

        ``add_special_tokens`` is accepted for interface compatibility only;
        it does not affect the result.
        """
        # tiktoken's default (disallowed_special="all") raises ValueError when
        # the text merely contains a special-token literal such as
        # "<|endoftext|>". An inspector must accept arbitrary user text, so
        # such literals are encoded as ordinary text instead.
        return self._enc.encode(text, disallowed_special=())

    def decode(self, token_ids: list[int], **_hf_kwargs) -> str:
        """Return the text for *token_ids*.

        HuggingFace-only keyword arguments (``skip_special_tokens``,
        ``clean_up_tokenization_spaces``, ...) are accepted and ignored so
        callers written against the HF interface do not hit a TypeError.
        """
        return self._enc.decode(token_ids)

    def convert_ids_to_tokens(self, token_ids: list[int]) -> list[str]:
        """Return one decoded string per id, each id decoded on its own."""
        return [self._enc.decode([tid]) for tid in token_ids]

    def __repr__(self) -> str:
        return f"TiktokenAdapter({self._encoding_name!r})"
|
|
|
|
| |
# Loaded tokenizers keyed by registry name; filled lazily by get_tokenizer().
_tokenizer_cache: dict[str, object] = {}


def get_tokenizer(name: str):
    """Look up, load if necessary, and return the tokenizer registered as *name*.

    Args:
        name: Key in SUPPORTED_TOKENIZERS (e.g. 'o200k_base', 'llama-3').

    Returns:
        The cached tokenizer object: a TiktokenAdapter for "tiktoken:" specs,
        otherwise whatever AutoTokenizer.from_pretrained() returned.

    Raises:
        ValueError: If name is not in SUPPORTED_TOKENIZERS.
        RuntimeError: If AutoTokenizer.from_pretrained() fails for the spec.
    """
    if name not in SUPPORTED_TOKENIZERS:
        raise ValueError(f"unknown tokenizer: '{name}'. Choose from {list(SUPPORTED_TOKENIZERS)}")

    # Fast path: already loaded on an earlier call.
    if name in _tokenizer_cache:
        return _tokenizer_cache[name]

    tiktoken_prefix = "tiktoken:"
    repo_id = SUPPORTED_TOKENIZERS[name]
    if repo_id.startswith(tiktoken_prefix):
        loaded = TiktokenAdapter(repo_id[len(tiktoken_prefix):])
    else:
        try:
            loaded = AutoTokenizer.from_pretrained(repo_id)
        except Exception as exc:
            # Re-raise with an actionable hint while keeping the cause chained.
            raise RuntimeError(
                f"Failed to load tokenizer '{name}' from '{repo_id}'. "
                f"Check your network connection or set TRANSFORMERS_OFFLINE=1 "
                f"if you have a local cache. Original error: {exc}"
            ) from exc

    _tokenizer_cache[name] = loaded
    return loaded
|
|
|
|
| |
| |
| |
|
|
|
|
def tokenize_text(text: str, tokenizer) -> list[dict]:
    """Encode *text* and pair every token string with its id.

    Args:
        text: Input string to tokenize.
        tokenizer: Object providing ``encode(text)`` and
            ``convert_ids_to_tokens(ids)``.

    Returns:
        One dict per token, in order, with keys 'token' (str) and 'id' (int).
    """
    ids = tokenizer.encode(text)
    pieces = tokenizer.convert_ids_to_tokens(ids)

    records: list[dict] = []
    for piece, token_id in zip(pieces, ids):
        records.append({"token": str(piece), "id": int(token_id)})
    return records
|
|
|
|
def fragmentation_ratio(text: str, tokenizer) -> dict[str, float]:
    """Measure how many tokens the tokenizer spends per whitespace-separated word.

    Args:
        text: Input string.
        tokenizer: Object providing ``encode(text)``.

    Returns:
        Dict with:
            - 'ratio': tokens divided by words as a float (0.0 when the text
              contains no words)
            - 'token_count': total number of ids returned by ``encode``
    """
    n_tokens = len(tokenizer.encode(text))
    n_words = len(text.split())

    if n_words > 0:
        tokens_per_word = n_tokens / n_words
    else:
        # No words to divide by (empty or whitespace-only text).
        tokens_per_word = 0.0

    return {"ratio": float(tokens_per_word), "token_count": n_tokens}
|
|
|
|
def flag_oov_words(text: str, tokenizer, threshold: int = 3) -> set[str]:
    """Collect the words that the tokenizer breaks into many pieces.

    Each whitespace-separated word is encoded on its own (without special
    tokens); a word counts as out-of-vocabulary (OOV) when that yields at
    least *threshold* ids.

    Args:
        text: Input string.
        tokenizer: Object providing ``encode(word, add_special_tokens=False)``.
        threshold: Minimum token count (inclusive) to flag a word. Default 3.

    Returns:
        Set of words that meet or exceed the threshold.
    """
    return {
        word
        for word in text.split()
        if len(tokenizer.encode(word, add_special_tokens=False)) >= threshold
    }
|
|
|
|
| |
| |
| |
|
|
|
|
def detect_language(text: str) -> str:
    """Guess the language of *text* via ``langdetect.detect``.

    Args:
        text: Input string.

    Returns:
        The code string returned by ``langdetect.detect`` (e.g. 'en', 'fr',
        'de'), or 'en' when detection raises LangDetectException.
    """
    try:
        code = detect(text)
    except LangDetectException:
        # Detection failed; fall back to English.
        code = "en"
    return code
|
|
|
|
def efficiency_score(input_tokens: int, english_tokens: int) -> float:
    """Return english_tokens / input_tokens as a tokenization efficiency score.

    A score above 1.0 means the source text needed fewer tokens than its
    English translation for this tokenizer; below 1.0 means it needed more.

    Args:
        input_tokens: Token count for the original (possibly non-English) text.
        english_tokens: Token count for the English translation.

    Returns:
        Float ratio, or 1.0 when either count is 0.
    """
    # Neutral score when there is nothing to compare (also avoids dividing by 0).
    if 0 in (english_tokens, input_tokens):
        return 1.0

    numerator = float(english_tokens)
    denominator = float(input_tokens)
    return numerator / denominator
|
|
|
|
| |
| |
| |
|
|
|
|
def relative_tokenization_cost(source_tokens: int, english_tokens: int) -> float:
    """Return the Relative Tokenization Cost (RTC): source_tokens / english_tokens.

    Above 1.0 the source text pays a "token tax" compared with English;
    below 1.0 it is more compact than English.

    Args:
        source_tokens: Token count for the (possibly non-English) text.
        english_tokens: Token count for the English equivalent.

    Returns:
        Float ratio, or 1.0 when english_tokens is 0 (zero guard).
    """
    if english_tokens != 0:
        return float(source_tokens) / float(english_tokens)
    return 1.0
|
|
|
|
def byte_premium(text: str, english_text: str) -> float:
    """Compare the UTF-8 size of *text* with that of *english_text*.

    Above 1.0 the source text occupies more bytes than the English text.

    Args:
        text: Source text.
        english_text: English equivalent text.

    Returns:
        Source bytes divided by English bytes as a float, or 1.0 when
        english_text encodes to zero bytes (zero guard).
    """
    source_len, english_len = (len(s.encode("utf-8")) for s in (text, english_text))
    if not english_len:
        return 1.0
    return float(source_len) / float(english_len)
|
|
|
|
def context_window_usage(token_count: int, window_size: int = 128_000) -> float:
    """Return the share of a context window taken up by *token_count* tokens.

    Args:
        token_count: Number of tokens.
        window_size: Total context window size. Default 128k.

    Returns:
        token_count / window_size as a float (may exceed 1.0), or 1.0 when
        window_size is 0.
    """
    return 1.0 if window_size == 0 else float(token_count) / float(window_size)
|
|
|
|
def quality_risk_level(rtc: float) -> str:
    """Classify a Relative Tokenization Cost into a quality risk band.

    Bands (upper bounds are exclusive):
        - low      (< 1.5)
        - moderate (1.5 up to 2.5)
        - high     (2.5 up to 4.0)
        - severe   (>= 4.0)

    Args:
        rtc: Relative Tokenization Cost value.

    Returns:
        One of "low", "moderate", "high", "severe".
    """
    # Ordered by ascending exclusive upper bound; the first match wins.
    bands = ((1.5, "low"), (2.5, "moderate"), (4.0, "high"))
    for upper_bound, label in bands:
        if rtc < upper_bound:
            return label
    return "severe"
|
|
|
|
def translate_to_english(text: str, api_key: str) -> str:
    """Ask an OpenRouter-hosted model for an English translation of *text*.

    Args:
        text: Source text to translate.
        api_key: OpenRouter API key.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    model = "meta-llama/llama-3.1-8b-instruct"
    prompt = (
        f"Translate the following text to English. "
        f"Return only the translation, no explanations.\n\nText: {text}"
    )
    reply = call_openrouter(api_key, model, prompt)
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"]
|
|
|
|
| |
| |
| |
|
|
# Alternating backgrounds for non-flagged tokens (chosen by token index parity).
_NORMAL_BG_COLOURS = ("#e8f4f8", "#d4ecd4")


def _decode_ids(tokenizer, ids: list[int], hide_special_tokens: bool) -> str:
    """Decode *ids*, tolerating tokenizers whose ``decode`` takes only the ids."""
    try:
        return tokenizer.decode(
            ids,
            skip_special_tokens=hide_special_tokens,
            clean_up_tokenization_spaces=False,
        )
    except TypeError:
        # Minimal adapters expose decode(ids) without the HuggingFace keyword
        # arguments; retry in that form instead of treating it as a failure.
        return tokenizer.decode(ids)


def _chunks_via_convert(tokens: list[dict], token_ids: list[int], tokenizer, hidden_ids: set):
    """Per-token text from growing ``convert_tokens_to_string`` prefixes.

    Returns None when the tokenizer's method raises or returns a non-string,
    so the caller can fall back to another strategy.
    """
    chunks: list[str] = []
    visible: list[str] = []
    prev_clean = ""
    for entry, token_id in zip(tokens, token_ids):
        if token_id in hidden_ids:
            chunks.append("")
            continue
        visible.append(str(entry["token"]))
        try:
            decoded = tokenizer.convert_tokens_to_string(visible)
        except Exception:
            return None
        if not isinstance(decoded, str):
            return None
        # U+FFFD marks a not-yet-complete multi-byte character; compare
        # without it so the character is attributed to the completing token.
        curr_clean = decoded.replace("\ufffd", "")
        chunks.append(curr_clean[len(prev_clean):] if curr_clean.startswith(prev_clean) else "")
        prev_clean = curr_clean
    return chunks


def _chunks_via_byte_decoder(tokens: list[dict], token_ids: list[int], byte_decoder: dict, hidden_ids: set) -> list[str]:
    """Per-token text rebuilt by mapping token characters back to raw bytes."""
    chunks: list[str] = []
    buffer = bytearray()
    prev_decoded = ""
    for entry, token_id in zip(tokens, token_ids):
        if token_id in hidden_ids:
            chunks.append("")
            continue
        for ch in str(entry["token"]):
            if ch in byte_decoder:
                buffer.append(int(byte_decoder[ch]))
            else:
                buffer.extend(ch.encode("utf-8", errors="ignore"))
        # errors="ignore" drops a trailing incomplete sequence until the
        # token that completes it has been appended to the buffer.
        curr_decoded = bytes(buffer).decode("utf-8", errors="ignore")
        chunks.append(curr_decoded[len(prev_decoded):] if curr_decoded.startswith(prev_decoded) else "")
        prev_decoded = curr_decoded
    return chunks


def _chunks_via_decode(token_ids: list[int], tokenizer, hidden_ids: set, hide_special_tokens: bool) -> list[str]:
    """Per-token text from decoding growing id prefixes with ``decode``."""
    chunks: list[str] = []
    prev_clean = ""
    for idx, token_id in enumerate(token_ids):
        if token_id in hidden_ids:
            chunks.append("")
            continue
        try:
            decoded = _decode_ids(tokenizer, token_ids[: idx + 1], hide_special_tokens)
        except Exception:
            # Best effort: an undecodable prefix contributes no visible text.
            chunks.append("")
            continue
        # Strip U+FFFD before comparing prefixes; otherwise a character whose
        # bytes span several tokens never matches the previous prefix and is
        # lost from the output.
        curr_clean = decoded.replace("\ufffd", "")
        if curr_clean.startswith(prev_clean):
            chunk = curr_clean[len(prev_clean):]
        else:
            # Prefix texts diverged; fall back to decoding the token alone.
            try:
                chunk = _decode_ids(tokenizer, [token_id], hide_special_tokens).replace("\ufffd", "")
            except Exception:
                chunk = ""
        chunks.append(chunk)
        prev_clean = curr_clean
    return chunks


def render_tokens_html(
    tokens: list[dict],
    oov_words: set[str],
    tokenizer=None,
    decoded_view: bool = False,
    hide_special_tokens: bool = True,
) -> str:
    """Render a list of tokens as coloured HTML spans.

    Normal tokens alternate between two background colours by index. A token
    whose raw text is itself a member of *oov_words* gets #ffcccc instead.
    All token text is HTML-escaped.

    With ``decoded_view=True`` and a tokenizer, each span shows the readable
    text the token contributes. That text is obtained, in order of
    preference, from ``convert_tokens_to_string``, from a non-empty
    ``byte_decoder`` dict, or from ``decode`` on growing id prefixes.
    Tokens whose id is in ``tokenizer.all_special_ids``, and tokens that
    contribute no text, are omitted while ``hide_special_tokens`` is True.

    Args:
        tokens: Dicts with keys 'token' and 'id', as built by tokenize_text().
        oov_words: Raw token strings to highlight.
        tokenizer: Tokenizer used for the readable view; None shows raw tokens.
        decoded_view: Show decoded text instead of raw token strings.
        hide_special_tokens: Hide special tokens in the readable view.

    Returns:
        The concatenated ``<span>`` elements ("" for an empty token list).
    """
    use_decoded = decoded_view and tokenizer is not None

    if use_decoded:
        token_ids = [int(entry["id"]) for entry in tokens]
        special_ids = set(getattr(tokenizer, "all_special_ids", None) or ())
        hidden_ids = special_ids if hide_special_tokens else set()

        display_chunks = None
        if hasattr(tokenizer, "convert_tokens_to_string"):
            display_chunks = _chunks_via_convert(tokens, token_ids, tokenizer, hidden_ids)
        if display_chunks is None:
            byte_decoder = getattr(tokenizer, "byte_decoder", None)
            if isinstance(byte_decoder, dict) and byte_decoder:
                display_chunks = _chunks_via_byte_decoder(tokens, token_ids, byte_decoder, hidden_ids)
            else:
                display_chunks = _chunks_via_decode(token_ids, tokenizer, hidden_ids, hide_special_tokens)
    else:
        display_chunks = [str(entry["token"]) for entry in tokens]

    parts: list[str] = []
    for i, entry in enumerate(tokens):
        raw_token = str(entry["token"])
        token_id = int(entry["id"])
        shown_text = display_chunks[i]

        # In the readable view, hidden specials and tokens that add no text
        # (e.g. the first part of a multi-token character) produce no span.
        if use_decoded and hide_special_tokens and not shown_text:
            continue

        bg = "#ffcccc" if raw_token in oov_words else _NORMAL_BG_COLOURS[i % 2]
        title = f"id:{token_id}" if decoded_view else f"id:{token_id} | raw:{html.escape(raw_token)}"
        parts.append(
            f'<span style="background:{bg};padding:2px 4px;border-radius:3px;'
            f'margin:1px;display:inline-block;white-space:pre;color:#000;" title="{title}">'
            f"{html.escape(shown_text)}</span>"
        )

    return "".join(parts)
|
|
|
|
| |
| |
| |
|
|
|
|
def _handle_single(
    model_name: str,
    text: str,
    threshold: int,
    decoded_view: bool,
    english_text: str = "",
):
    """Handler logic for the Single tab — extracted for testability.

    Tokenizes *text* with the selected tokenizer and builds the token
    visualisation plus a Markdown statistics block, one statistic per line.

    Args:
        model_name: Key in SUPPORTED_TOKENIZERS.
        text: Text to inspect.
        threshold: OOV threshold passed to flag_oov_words() (coerced to int).
        decoded_view: Whether to render decoded text instead of raw tokens.
        english_text: Optional English equivalent used for the RTC comparison
            when the detected language is not 'en'.

    Returns:
        Tuple ``(token_html, stats_markdown)``. On any exception the tuple is
        ``("", "Error: <message>")`` so the UI shows the problem instead of
        failing.
    """
    # Markdown only breaks a line on two trailing spaces + newline; a single
    # space would merge all statistics into one paragraph. Built explicitly so
    # whitespace-trimming tools cannot collapse it.
    hard_break = " " * 2 + "\n"
    try:
        tok = get_tokenizer(model_name)
        tokens = tokenize_text(text, tok)
        token_count = len(tokens)
        oov = flag_oov_words(text, tok, threshold=int(threshold))
        token_html = render_tokens_html(
            tokens, oov, tokenizer=tok,
            decoded_view=decoded_view, hide_special_tokens=True,
        )
        frag = fragmentation_ratio(text, tok)
        lang = detect_language(text)
        ctx_usage = context_window_usage(token_count, 128_000)

        lines = [
            f"**Tokens:** {frag['token_count']}",
            f"**Fragmentation ratio:** {frag['ratio']:.2f}",
            f"**OOV words:** {len(oov)}",
            f"**Detected language:** {lang}",
            f"**Context usage (128k):** {ctx_usage:.4%}",
        ]

        reference = english_text.strip() if english_text else ""
        if lang == "en":
            # English input is its own baseline.
            lines.append("**RTC vs English:** 1.0x")
            lines.append("**Quality risk:** low")
        elif reference:
            eng_tokens = tokenize_text(reference, tok)
            rtc = relative_tokenization_cost(token_count, len(eng_tokens))
            risk = quality_risk_level(rtc)
            lines.append(f"**RTC vs English:** {rtc:.2f}x")
            lines.append(f"**Quality risk:** {risk}")
        else:
            lines.append("**RTC:** *(provide English equivalent for comparison)*")

        return token_html, hard_break.join(lines)
    except Exception as exc:
        # UI boundary: report the failure in the stats pane.
        return "", f"Error: {exc}"
|
|
|
|
def _handle_compare(
    text: str,
    name_a: str,
    name_b: str,
    decoded_view: bool,
    english_text: str = "",
):
    """Handler logic for the Compare tab — extracted for testability.

    Tokenizes *text* with two tokenizers and builds both visualisations plus
    a Markdown summary, one statistic per line.

    Args:
        text: Text to inspect.
        name_a: Key in SUPPORTED_TOKENIZERS for the first tokenizer.
        name_b: Key in SUPPORTED_TOKENIZERS for the second tokenizer.
        decoded_view: Whether to render decoded text instead of raw tokens.
        english_text: Optional English equivalent; when non-blank, an RTC
            line per tokenizer is added, plus a verdict if the values differ.

    Returns:
        Tuple ``(html_a, html_b, summary_markdown)``. On any exception the
        tuple is ``("", "", "Error: <message>")``.
    """
    # Markdown only breaks a line on two trailing spaces + newline; a single
    # space would merge all statistics into one paragraph. Built explicitly so
    # whitespace-trimming tools cannot collapse it.
    hard_break = " " * 2 + "\n"
    try:
        tok_a = get_tokenizer(name_a)
        tok_b = get_tokenizer(name_b)
        tokens_a = tokenize_text(text, tok_a)
        tokens_b = tokenize_text(text, tok_b)
        count_a = len(tokens_a)
        count_b = len(tokens_b)
        html_a = render_tokens_html(
            tokens_a, set(), tokenizer=tok_a,
            decoded_view=decoded_view, hide_special_tokens=True,
        )
        html_b = render_tokens_html(
            tokens_b, set(), tokenizer=tok_b,
            decoded_view=decoded_view, hide_special_tokens=True,
        )
        frag_a = fragmentation_ratio(text, tok_a)
        frag_b = fragmentation_ratio(text, tok_b)

        lines = [
            f"**{name_a}:** {frag_a['token_count']} tokens (ratio {frag_a['ratio']:.2f})",
            f"**{name_b}:** {frag_b['token_count']} tokens (ratio {frag_b['ratio']:.2f})",
        ]

        reference = english_text.strip() if english_text else ""
        if reference:
            eng_tokens_a = tokenize_text(reference, tok_a)
            eng_tokens_b = tokenize_text(reference, tok_b)
            rtc_a = relative_tokenization_cost(count_a, len(eng_tokens_a))
            rtc_b = relative_tokenization_cost(count_b, len(eng_tokens_b))
            lines.append(f"**{name_a} RTC:** {rtc_a:.2f}x")
            lines.append(f"**{name_b} RTC:** {rtc_b:.2f}x")
            if rtc_a != rtc_b:
                # Lower RTC means fewer tokens relative to English.
                better = name_a if rtc_a < rtc_b else name_b
                lines.append(f"*{better} is more efficient for this language.*")

        return html_a, html_b, hard_break.join(lines)
    except Exception as exc:
        # UI boundary: report the failure in the summary pane.
        return "", "", f"Error: {exc}"
|
|
|
|
def build_tokenizer_ui() -> gr.Blocks:
    """Construct and return the Tokenizer Inspector Gradio Blocks UI.

    Components are created in display order, so the order of the statements
    below is part of the layout.

    Returns:
        gr.Blocks instance with two inner tabs:
            - Single: inspect tokenization of one model (wired to
              _handle_single).
            - Compare: side-by-side comparison of two models (wired to
              _handle_compare).
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation; which components sit inside each gr.Row() below is a best
    # guess — confirm against the intended layout.
    tokenizer_names = list(SUPPORTED_TOKENIZERS.keys())

    with gr.Blocks(title="Tokenizer Inspector") as demo:
        gr.Markdown("## Tokenizer Inspector\nExplore how different tokenizers split text.")

        with gr.Tabs():
            # --- Single tab: one tokenizer, token view plus statistics ---
            with gr.TabItem("Single"):
                with gr.Row():
                    single_model = gr.Dropdown(
                        choices=tokenizer_names,
                        value=tokenizer_names[0],
                        label="Tokenizer",
                    )
                    single_text = gr.Textbox(
                        label="Input Text",
                        placeholder="Type text to tokenize...",
                        lines=3,
                    )
                oov_threshold = gr.Slider(
                    minimum=1, maximum=10, value=3, step=1,
                    label="OOV threshold (tokens per word)",
                )
                single_english_text = gr.Textbox(
                    label="English Equivalent (optional)",
                    placeholder="Paste English translation for RTC comparison...",
                    lines=2,
                )
                single_decoded_view = gr.Checkbox(
                    label="Readable token view (decode tokens, hide special tokens)",
                    value=False,
                )
                single_btn = gr.Button("Tokenize", variant="primary")
                single_html = gr.HTML(label="Token Visualisation")
                single_stats = gr.Markdown(label="Statistics")

                # Input order must match the positional parameters of _handle_single.
                single_btn.click(
                    fn=_handle_single,
                    inputs=[single_model, single_text, oov_threshold, single_decoded_view, single_english_text],
                    outputs=[single_html, single_stats],
                )

            # --- Compare tab: two tokenizers on the same text ---
            with gr.TabItem("Compare"):
                compare_text = gr.Textbox(
                    label="Input Text",
                    placeholder="Type text to compare tokenizers...",
                    lines=3,
                )
                with gr.Row():
                    cmp_model_a = gr.Dropdown(
                        choices=tokenizer_names,
                        value=tokenizer_names[0],
                        label="Tokenizer A",
                    )
                    # Default B to the second registry entry so the two sides differ.
                    cmp_model_b = gr.Dropdown(
                        choices=tokenizer_names,
                        value=tokenizer_names[1] if len(tokenizer_names) > 1 else tokenizer_names[0],
                        label="Tokenizer B",
                    )
                compare_english_text = gr.Textbox(
                    label="English Equivalent (optional)",
                    placeholder="Paste English translation for RTC comparison...",
                    lines=2,
                )
                compare_btn = gr.Button("Compare", variant="primary")
                with gr.Row():
                    cmp_html_a = gr.HTML(label="Tokenizer A")
                    cmp_html_b = gr.HTML(label="Tokenizer B")
                compare_decoded_view = gr.Checkbox(
                    label="Readable token view (decode tokens, hide special tokens)",
                    value=False,
                )
                cmp_ratio_md = gr.Markdown(label="Comparison")

                # Input order must match the positional parameters of _handle_compare.
                compare_btn.click(
                    fn=_handle_compare,
                    inputs=[compare_text, cmp_model_a, cmp_model_b, compare_decoded_view, compare_english_text],
                    outputs=[cmp_html_a, cmp_html_b, cmp_ratio_md],
                )

    return demo
|
|