Spaces:

nad707
/

llm-workbench

Runtime error

File size: 23,685 Bytes

bf96836

"""
Tokenizer Inspector module.

Provides tokenization utilities and a Gradio UI tab for inspecting how
different tokenizers handle input text.
"""

import html
import gradio as gr
from langdetect import detect, LangDetectException

# Holds the transformers.AutoTokenizer class once it has been imported.
_AutoTokenizer = None


def _get_auto_tokenizer():
    """Return the ``transformers.AutoTokenizer`` class, importing it lazily.

    The import runs only on the first call; the class is then kept in the
    module-level ``_AutoTokenizer`` variable and reused by later calls.
    """
    global _AutoTokenizer
    if _AutoTokenizer is not None:
        return _AutoTokenizer
    # Deferred import: transformers is not loaded until this line executes.
    from transformers import AutoTokenizer as tokenizer_cls
    _AutoTokenizer = tokenizer_cls
    return tokenizer_cls


class _LazyAutoTokenizer:
    """Stand-in for ``AutoTokenizer`` that loads transformers on demand.

    Any attribute not found on the proxy itself is looked up on the class
    returned by ``_get_auto_tokenizer()``.
    """

    def __getattr__(self, name):
        target = _get_auto_tokenizer()
        return getattr(target, name)


AutoTokenizer = _LazyAutoTokenizer()

from openrouter import call_openrouter

# ---------------------------------------------------------------------------
# Tokenizer registry
# ---------------------------------------------------------------------------

# Maps the name offered in the UI dropdowns to its tokenizer source: either a
# "tiktoken:<encoding>" string (loaded through TiktokenAdapter) or an
# identifier passed to AutoTokenizer.from_pretrained().
SUPPORTED_TOKENIZERS: dict[str, str] = {
    "o200k_base": "tiktoken:o200k_base",
    "cl100k_base": "tiktoken:cl100k_base",
    "llama-3": "NousResearch/Meta-Llama-3-8B",
    "mistral": "mistralai/Mistral-7B-v0.1",
    "qwen-2.5": "Qwen/Qwen2.5-7B",
    # NOTE(review): the next two labels do not match the repositories they
    # load ("gemma-2" -> microsoft/phi-2, "command-r" -> bigscience/bloom-560m).
    # Presumably intentional stand-ins; confirm, because results shown under
    # these names come from the listed repositories' tokenizers.
    "gemma-2": "microsoft/phi-2",
    "command-r": "bigscience/bloom-560m",
    "gpt2": "gpt2",
}


class TiktokenAdapter:
    """Wraps a tiktoken encoding to match the HuggingFace tokenizer interface.

    This allows tokenize_text(), fragmentation_ratio(), render_tokens_html()
    and other functions to work identically with tiktoken-based and HF-based
    tokenizers. Only the methods those functions call are provided.
    """

    def __init__(self, encoding_name: str):
        """Load the tiktoken encoding called *encoding_name*.

        tiktoken is imported here rather than at module import time.
        """
        import tiktoken
        self._enc = tiktoken.get_encoding(encoding_name)
        self._encoding_name = encoding_name

    def encode(self, text: str, add_special_tokens: bool = True) -> list[int]:
        """Encode *text* into token ids.

        Args:
            text: Input string.
            add_special_tokens: Accepted for HuggingFace call compatibility
                and ignored.

        Returns:
            List of token ids.
        """
        # disallowed_special=() makes special-token strings that appear in
        # user text (e.g. "<|endoftext|>") encode as ordinary text instead of
        # raising ValueError, which is tiktoken's default behaviour.
        return self._enc.encode(text, disallowed_special=())

    def decode(self, token_ids: list[int], **kwargs) -> str:
        """Decode *token_ids* back into text.

        Args:
            token_ids: Token ids to decode.
            **kwargs: HuggingFace-style options such as skip_special_tokens
                or clean_up_tokenization_spaces. They are accepted so that
                callers written against the HF interface do not fail with
                TypeError, and are otherwise ignored.

        Returns:
            The decoded string.
        """
        return self._enc.decode(token_ids)

    def convert_ids_to_tokens(self, token_ids: list[int]) -> list[str]:
        """Return one string per id, each produced by decoding that id alone."""
        return [self._enc.decode([tid]) for tid in token_ids]

    def __repr__(self) -> str:
        return f"TiktokenAdapter({self._encoding_name!r})"


# Loaded tokenizers keyed by registry name, filled in by get_tokenizer().
_tokenizer_cache: dict[str, object] = {}


def get_tokenizer(name: str):
    """Look up a tokenizer by registry name, loading it on first request.

    Loaded tokenizers are stored in the module-level cache, so later calls
    with the same name return the same object.

    Args:
        name: Key in SUPPORTED_TOKENIZERS (e.g. 'o200k_base', 'llama-3').

    Returns:
        A TiktokenAdapter for "tiktoken:"-prefixed registry entries, otherwise
        whatever AutoTokenizer.from_pretrained() returns for the entry.

    Raises:
        ValueError: If name is not in SUPPORTED_TOKENIZERS.
        RuntimeError: If AutoTokenizer.from_pretrained() fails for the entry.
    """
    if name not in SUPPORTED_TOKENIZERS:
        raise ValueError(f"unknown tokenizer: '{name}'. Choose from {list(SUPPORTED_TOKENIZERS)}")

    # Fast path: already loaded.
    try:
        return _tokenizer_cache[name]
    except KeyError:
        pass

    source = SUPPORTED_TOKENIZERS[name]
    tiktoken_prefix = "tiktoken:"
    if source.startswith(tiktoken_prefix):
        loaded = TiktokenAdapter(source[len(tiktoken_prefix):])
    else:
        try:
            loaded = AutoTokenizer.from_pretrained(source)
        except Exception as exc:
            # Re-raise with a hint; the original error stays chained.
            raise RuntimeError(
                f"Failed to load tokenizer '{name}' from '{source}'. "
                f"Check your network connection or set TRANSFORMERS_OFFLINE=1 "
                f"if you have a local cache. Original error: {exc}"
            ) from exc

    _tokenizer_cache[name] = loaded
    return loaded


# ---------------------------------------------------------------------------
# Core tokenization helpers
# ---------------------------------------------------------------------------


def tokenize_text(text: str, tokenizer) -> list[dict]:
    """Encode *text* and pair every token string with its id.

    Args:
        text:      Input string to tokenize.
        tokenizer: Object providing encode() and convert_ids_to_tokens().

    Returns:
        One dict per token, in order, with keys 'token' (str) and 'id' (int).
    """
    ids = tokenizer.encode(text)
    pairs = zip(tokenizer.convert_ids_to_tokens(ids), ids)
    return [{"token": str(piece), "id": int(token_id)} for piece, token_id in pairs]


def fragmentation_ratio(text: str, tokenizer) -> dict[str, float]:
    """Measure how many tokens the tokenizer produces per whitespace-separated word.

    Args:
        text:      Input string.
        tokenizer: Object providing encode().

    Returns:
        Dict with:
          - 'ratio': float tokens-per-word (0.0 when text contains no words)
          - 'token_count': int number of ids returned by tokenizer.encode(text)
    """
    n_tokens = len(tokenizer.encode(text))
    n_words = len(text.split())
    if n_words > 0:
        tokens_per_word = float(n_tokens / n_words)
    else:
        # No words to divide by.
        tokens_per_word = 0.0
    return {"ratio": tokens_per_word, "token_count": n_tokens}


def flag_oov_words(text: str, tokenizer, threshold: int = 3) -> set[str]:
    """Collect the words that the tokenizer splits into at least *threshold* tokens.

    Each whitespace-separated word is encoded on its own, with
    add_special_tokens=False, and flagged as out-of-vocabulary (OOV) when the
    number of resulting ids reaches the threshold.

    Args:
        text:      Input string.
        tokenizer: Object providing encode(word, add_special_tokens=False).
        threshold: Minimum token count (inclusive) to flag a word. Default 3.

    Returns:
        Set of words that meet or exceed the threshold.
    """
    return {
        word
        for word in text.split()
        if len(tokenizer.encode(word, add_special_tokens=False)) >= threshold
    }


# ---------------------------------------------------------------------------
# Language helpers
# ---------------------------------------------------------------------------


def detect_language(text: str) -> str:
    """Guess the language of *text* with langdetect.

    Args:
        text: Input string.

    Returns:
        The language code string returned by langdetect.detect (e.g. 'en',
        'fr', 'de'), or 'en' when detection raises LangDetectException.
    """
    try:
        language = detect(text)
    except LangDetectException:
        # Detection failed; default to English.
        language = "en"
    return language


def efficiency_score(input_tokens: int, english_tokens: int) -> float:
    """Return english_tokens / input_tokens as a tokenization efficiency score.

    A score above 1.0 means the original text needed fewer tokens than its
    English translation for this tokenizer; below 1.0 means it needed more.

    Args:
        input_tokens:   Token count for the original (possibly non-English) text.
        english_tokens: Token count for the English translation.

    Returns:
        Float ratio, or 1.0 when either count is 0.
    """
    either_is_zero = input_tokens == 0 or english_tokens == 0
    return 1.0 if either_is_zero else float(english_tokens) / float(input_tokens)


# ---------------------------------------------------------------------------
# Token tax metrics (GH-3)
# ---------------------------------------------------------------------------


def relative_tokenization_cost(source_tokens: int, english_tokens: int) -> float:
    """Return the Relative Tokenization Cost: source_tokens / english_tokens.

    A result above 1.0 means the source text pays a "token tax" compared with
    English; below 1.0 means the source text is more compact than English.

    Args:
        source_tokens:  Token count for the (possibly non-English) text.
        english_tokens: Token count for the English equivalent.

    Returns:
        Float ratio, or 1.0 when english_tokens is 0 (division guard).
    """
    if english_tokens != 0:
        return float(source_tokens) / float(english_tokens)
    return 1.0


def byte_premium(text: str, english_text: str) -> float:
    """Compare the UTF-8 size of *text* with that of *english_text*.

    A result above 1.0 means the source text occupies more bytes than the
    English text.

    Args:
        text:         Source text.
        english_text: English equivalent text.

    Returns:
        Source byte length divided by English byte length, or 1.0 when
        english_text encodes to zero bytes (division guard).
    """
    source_size, english_size = (
        len(value.encode("utf-8")) for value in (text, english_text)
    )
    return float(source_size) / float(english_size) if english_size else 1.0


def context_window_usage(token_count: int, window_size: int = 128_000) -> float:
    """Return token_count as a fraction of window_size.

    Args:
        token_count: Number of tokens.
        window_size: Total context window size. Default 128k.

    Returns:
        token_count / window_size as a float (may exceed 1.0), or 1.0 when
        window_size is 0.
    """
    return 1.0 if window_size == 0 else float(token_count) / float(window_size)


def quality_risk_level(rtc: float) -> str:
    """Classify a Relative Tokenization Cost into a quality risk band.

    Bands (upper bounds are exclusive):
    - low: rtc < 1.5
    - moderate: 1.5 <= rtc < 2.5
    - high: 2.5 <= rtc < 4.0
    - severe: everything else (rtc >= 4.0)

    Args:
        rtc: Relative Tokenization Cost value.

    Returns:
        One of "low", "moderate", "high", "severe".
    """
    bands = ((1.5, "low"), (2.5, "moderate"), (4.0, "high"))
    for upper_bound, label in bands:
        if rtc < upper_bound:
            return label
    return "severe"


def translate_to_english(text: str, api_key: str) -> str:
    """Ask an OpenRouter-hosted model for an English translation of *text*.

    Sends a translation prompt through call_openrouter() using the model id
    "meta-llama/llama-3.1-8b-instruct".

    Args:
        text:    Source text to translate.
        api_key: OpenRouter API key.

    Returns:
        The "content" of the first choice's message in the response.
    """
    instruction = (
        "Translate the following text to English. "
        "Return only the translation, no explanations."
    )
    prompt = f"{instruction}\n\nText: {text}"
    response = call_openrouter(api_key, "meta-llama/llama-3.1-8b-instruct", prompt)
    first_choice = response["choices"][0]
    return first_choice["message"]["content"]


# ---------------------------------------------------------------------------
# HTML rendering
# ---------------------------------------------------------------------------

# Background colours applied alternately (by token index) to tokens that are
# not highlighted.
_NORMAL_BG_COLOURS = ("#e8f4f8", "#d4ecd4")


def render_tokens_html(
    tokens: list[dict],
    oov_words: set[str],
    tokenizer=None,
    decoded_view: bool = False,
    hide_special_tokens: bool = True,
) -> str:
    """Render a list of tokens as coloured HTML spans.

    Each rendered token becomes one inline-block <span> whose ``title``
    attribute carries the token id (plus the raw token string when
    decoded_view is False). Backgrounds alternate between the two
    _NORMAL_BG_COLOURS by token index; a token whose raw string is a member
    of oov_words gets #ffcccc instead. The displayed text and the raw token
    in the title are HTML-escaped.

    When decoded_view=True and a tokenizer is supplied, each token's display
    text is the piece of decoded text it contributes, obtained by the first
    strategy that applies:
      1. tokenizer.convert_tokens_to_string() on the growing token list,
      2. byte accumulation through tokenizer.byte_decoder,
      3. cumulative tokenizer.decode() with a prefix diff.
    In that mode, tokens whose display text is empty (which includes hidden
    special tokens) are omitted when hide_special_tokens is True.

    Args:
        tokens: Dicts with keys 'token' and 'id', as produced by
            tokenize_text().
        oov_words: Strings to highlight; compared against each raw token.
        tokenizer: Optional tokenizer used for the decoded view and for
            looking up all_special_ids.
        decoded_view: Show decoded text instead of raw token strings.
        hide_special_tokens: In decoded view, hide ids listed in the
            tokenizer's all_special_ids.

    Returns:
        The concatenated <span> markup ("" when nothing is rendered).

    NOTE(review): highlighting tests ``raw_token in oov_words``, while
    _handle_single passes whole whitespace-separated words from
    flag_oov_words(). A word that was split into several tokens will not
    equal any single token, so it will not be highlighted; confirm whether
    that is the intended behaviour.
    """
    special_ids = set(getattr(tokenizer, "all_special_ids", [])) if tokenizer else set()
    parts: list[str] = []
    # One display string per entry in `tokens`, index-aligned.
    display_chunks: list[str] = []
    token_ids = [int(entry["id"]) for entry in tokens]
    byte_decoder = getattr(tokenizer, "byte_decoder", None) if tokenizer else None
    has_convert_tokens_to_string = bool(tokenizer and hasattr(tokenizer, "convert_tokens_to_string"))

    if decoded_view and tokenizer is not None:
        used_convert_path = False
        # Prefer tokenizer-native token-string reconstruction when available.
        # This handles byte-level tokenizers more reliably than per-token decode.
        if has_convert_tokens_to_string:
            # Results are staged in tmp_chunks and only adopted if every
            # conversion succeeds, so a mid-way failure leaves display_chunks
            # empty for the fallback strategies below.
            tmp_chunks: list[str] = []
            visible_tokens: list[str] = []
            prev_decoded = ""
            convert_path_ok = True
            for i, token_id in enumerate(token_ids):
                if hide_special_tokens and token_id in special_ids:
                    # Empty placeholder keeps the list index-aligned.
                    tmp_chunks.append("")
                    continue
                visible_tokens.append(str(tokens[i]["token"]))
                try:
                    curr_decoded = tokenizer.convert_tokens_to_string(visible_tokens)
                except Exception:
                    convert_path_ok = False
                    break
                if not isinstance(curr_decoded, str):
                    convert_path_ok = False
                    break
                # Compare with replacement characters removed; the chunk is
                # whatever this token appended to the previous decoded text.
                safe_prev = prev_decoded.replace("\ufffd", "")
                safe_curr = curr_decoded.replace("\ufffd", "")
                chunk = safe_curr[len(safe_prev):] if safe_curr.startswith(safe_prev) else ""
                tmp_chunks.append(chunk)
                prev_decoded = curr_decoded
            if convert_path_ok:
                display_chunks = tmp_chunks
                used_convert_path = True

        # For byte-level tokenizers (GPT-2/Llama-family), decode via raw byte
        # accumulation to avoid replacement-character noise in multibyte scripts.
        if not used_convert_path and isinstance(byte_decoder, dict) and byte_decoder:
            buffer = bytearray()
            prev_decoded = ""
            for i, token_id in enumerate(token_ids):
                if hide_special_tokens and token_id in special_ids:
                    display_chunks.append("")
                    continue

                raw_token = str(tokens[i]["token"])
                for ch in raw_token:
                    if ch in byte_decoder:
                        buffer.append(int(byte_decoder[ch]))
                    else:
                        # Character has no byte mapping: keep its UTF-8 bytes.
                        buffer.extend(ch.encode("utf-8", errors="ignore"))

                # errors="ignore" drops bytes that do not (yet) form valid
                # UTF-8, so such a token gets an empty chunk here.
                curr_decoded = bytes(buffer).decode("utf-8", errors="ignore")
                chunk = curr_decoded[len(prev_decoded):] if curr_decoded.startswith(prev_decoded) else ""
                display_chunks.append(chunk)
                prev_decoded = curr_decoded
        elif not used_convert_path:
            # Generic fallback: cumulative tokenizer decode + prefix diff.
            prev_decoded = ""
            for idx, token_id in enumerate(token_ids):
                if hide_special_tokens and token_id in special_ids:
                    display_chunks.append("")
                    continue
                try:
                    curr_decoded = tokenizer.decode(
                        token_ids[: idx + 1],
                        skip_special_tokens=hide_special_tokens,
                        clean_up_tokenization_spaces=False,
                    )
                except Exception:
                    # NOTE(review): every exception is swallowed here,
                    # including a TypeError from a tokenizer whose decode()
                    # does not accept these keyword arguments; the token then
                    # gets an empty chunk.
                    curr_decoded = prev_decoded

                if curr_decoded.startswith(prev_decoded):
                    chunk = curr_decoded[len(prev_decoded):]
                else:
                    # Conservative fallback if tokenizer decode is non-prefix-stable.
                    try:
                        chunk = tokenizer.decode(
                            [token_id],
                            skip_special_tokens=hide_special_tokens,
                            clean_up_tokenization_spaces=False,
                        )
                    except Exception:
                        chunk = ""
                # Strip replacement chars in readable mode.
                if "\ufffd" in chunk:
                    chunk = chunk.replace("\ufffd", "")
                display_chunks.append(chunk)
                prev_decoded = curr_decoded
    else:
        # Raw view (or no tokenizer): show the token strings unchanged.
        display_chunks = [str(entry["token"]) for entry in tokens]

    for i, entry in enumerate(tokens):
        raw_token = str(entry["token"])
        token_id = int(entry["id"])
        token_text_for_display = display_chunks[i]

        if decoded_view and tokenizer is not None:
            # Skip tokens with nothing to show (hidden specials and tokens
            # that produced an empty chunk above).
            if not token_text_for_display and hide_special_tokens:
                continue

        token_text = html.escape(token_text_for_display)
        if raw_token in oov_words:
            bg = "#ffcccc"
        else:
            # Uses the original index, so skipped tokens still advance the
            # colour alternation.
            bg = _NORMAL_BG_COLOURS[i % 2]
        title = f"id:{token_id}" if decoded_view else f"id:{token_id} | raw:{html.escape(raw_token)}"
        parts.append(
            f'<span style="background:{bg};padding:2px 4px;border-radius:3px;'
            f'margin:1px;display:inline-block;white-space:pre;color:#000;" title="{title}">'
            f"{token_text}</span>"
        )

    return "".join(parts)


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------


def _handle_single(
    model_name: str,
    text: str,
    threshold: int,
    decoded_view: bool,
    english_text: str = "",
):
    """Build the Single tab outputs for one tokenizer and one input text.

    Args:
        model_name:   Registry name passed to get_tokenizer().
        text:         Text to tokenize.
        threshold:    OOV threshold; converted with int() before use.
        decoded_view: Whether to render decoded token text.
        english_text: Optional English equivalent used for the RTC figures.

    Returns:
        Tuple (token_html, stats_markdown). If anything raises, the tuple is
        ("", "Error: <message>") instead.
    """
    try:
        tokenizer = get_tokenizer(model_name)
        token_entries = tokenize_text(text, tokenizer)
        n_tokens = len(token_entries)
        oov_words = flag_oov_words(text, tokenizer, threshold=int(threshold))
        rendered = render_tokens_html(
            token_entries,
            oov_words,
            tokenizer=tokenizer,
            decoded_view=decoded_view,
            hide_special_tokens=True,
        )
        frag = fragmentation_ratio(text, tokenizer)
        language = detect_language(text)
        window_share = context_window_usage(n_tokens, 128_000)

        # Markdown fragments, concatenated at the end.
        sections = [
            f"**Tokens:** {frag['token_count']}  \n",
            f"**Fragmentation ratio:** {frag['ratio']:.2f}  \n",
            f"**OOV words:** {len(oov_words)}  \n",
            f"**Detected language:** {language}  \n",
            f"**Context usage (128k):** {window_share:.4%}",
        ]

        if language == "en":
            # English input is its own baseline.
            sections.append("  \n**RTC vs English:** 1.0x  \n")
            sections.append("**Quality risk:** low")
        else:
            reference = english_text.strip() if english_text else ""
            if reference:
                reference_entries = tokenize_text(reference, tokenizer)
                rtc = relative_tokenization_cost(n_tokens, len(reference_entries))
                risk = quality_risk_level(rtc)
                sections.append(f"  \n**RTC vs English:** {rtc:.2f}x  \n")
                sections.append(f"**Quality risk:** {risk}")
            else:
                sections.append(
                    "  \n**RTC:** *(provide English equivalent for comparison)*"
                )

        return rendered, "".join(sections)
    except Exception as exc:
        # UI boundary: report the failure in the stats pane.
        return "", f"Error: {exc}"


def _handle_compare(
    text: str,
    name_a: str,
    name_b: str,
    decoded_view: bool,
    english_text: str = "",
):
    """Build the Compare tab outputs for two tokenizers on the same text.

    Args:
        text:         Text to tokenize with both tokenizers.
        name_a:       Registry name of the first tokenizer.
        name_b:       Registry name of the second tokenizer.
        decoded_view: Whether to render decoded token text.
        english_text: Optional English equivalent used for the RTC figures.

    Returns:
        Tuple (html_a, html_b, comparison_markdown). If anything raises, the
        tuple is ("", "", "Error: <message>") instead.
    """
    try:
        tok_first, tok_second = get_tokenizer(name_a), get_tokenizer(name_b)
        entries_first = tokenize_text(text, tok_first)
        entries_second = tokenize_text(text, tok_second)

        # No OOV highlighting in compare mode: an empty set is passed.
        render_options = {"decoded_view": decoded_view, "hide_special_tokens": True}
        rendered_first = render_tokens_html(
            entries_first, set(), tokenizer=tok_first, **render_options
        )
        rendered_second = render_tokens_html(
            entries_second, set(), tokenizer=tok_second, **render_options
        )

        frag_first = fragmentation_ratio(text, tok_first)
        frag_second = fragmentation_ratio(text, tok_second)
        sections = [
            f"**{name_a}:** {frag_first['token_count']} tokens "
            f"(ratio {frag_first['ratio']:.2f})  \n"
            f"**{name_b}:** {frag_second['token_count']} tokens "
            f"(ratio {frag_second['ratio']:.2f})"
        ]

        reference = english_text.strip() if english_text else ""
        if reference:
            ref_first = tokenize_text(reference, tok_first)
            ref_second = tokenize_text(reference, tok_second)
            rtc_first = relative_tokenization_cost(len(entries_first), len(ref_first))
            rtc_second = relative_tokenization_cost(len(entries_second), len(ref_second))
            sections.append(
                f"  \n**{name_a} RTC:** {rtc_first:.2f}x  \n"
                f"**{name_b} RTC:** {rtc_second:.2f}x"
            )
            if rtc_first != rtc_second:
                # Lower RTC wins; on a tie no verdict line is added.
                winner = name_a if rtc_first < rtc_second else name_b
                sections.append(
                    f"  \n*{winner} is more efficient for this language.*"
                )

        return rendered_first, rendered_second, "".join(sections)
    except Exception as exc:
        # UI boundary: report the failure in the comparison pane.
        return "", "", f"Error: {exc}"


def build_tokenizer_ui() -> gr.Blocks:
    """Construct and return the Tokenizer Inspector Gradio Blocks UI.

    The dropdown choices are the keys of SUPPORTED_TOKENIZERS. Button clicks
    are wired to _handle_single and _handle_compare; the order of each
    ``inputs`` list matches the positional parameters of its handler.

    Returns:
        gr.Blocks instance with two inner tabs:
          - Single: inspect tokenization of one model.
          - Compare: side-by-side comparison of two models.
    """
    tokenizer_names = list(SUPPORTED_TOKENIZERS.keys())

    with gr.Blocks(title="Tokenizer Inspector") as demo:
        gr.Markdown("## Tokenizer Inspector\nExplore how different tokenizers split text.")

        with gr.Tabs():
            # --- Single tab ---
            with gr.TabItem("Single"):
                with gr.Row():
                    single_model = gr.Dropdown(
                        choices=tokenizer_names,
                        value=tokenizer_names[0],
                        label="Tokenizer",
                    )
                single_text = gr.Textbox(
                    label="Input Text",
                    placeholder="Type text to tokenize...",
                    lines=3,
                )
                # Passed to _handle_single as `threshold`.
                oov_threshold = gr.Slider(
                    minimum=1, maximum=10, value=3, step=1,
                    label="OOV threshold (tokens per word)",
                )
                single_english_text = gr.Textbox(
                    label="English Equivalent (optional)",
                    placeholder="Paste English translation for RTC comparison...",
                    lines=2,
                )
                single_decoded_view = gr.Checkbox(
                    label="Readable token view (decode tokens, hide special tokens)",
                    value=False,
                )
                single_btn = gr.Button("Tokenize", variant="primary")
                single_html = gr.HTML(label="Token Visualisation")
                single_stats = gr.Markdown(label="Statistics")

                # Inputs follow _handle_single(model_name, text, threshold,
                # decoded_view, english_text).
                single_btn.click(
                    fn=_handle_single,
                    inputs=[single_model, single_text, oov_threshold, single_decoded_view, single_english_text],
                    outputs=[single_html, single_stats],
                )

            # --- Compare tab ---
            with gr.TabItem("Compare"):
                compare_text = gr.Textbox(
                    label="Input Text",
                    placeholder="Type text to compare tokenizers...",
                    lines=3,
                )
                with gr.Row():
                    cmp_model_a = gr.Dropdown(
                        choices=tokenizer_names,
                        value=tokenizer_names[0],
                        label="Tokenizer A",
                    )
                    # Defaults to the second registry entry so the two
                    # dropdowns start on different tokenizers when possible.
                    cmp_model_b = gr.Dropdown(
                        choices=tokenizer_names,
                        value=tokenizer_names[1] if len(tokenizer_names) > 1 else tokenizer_names[0],
                        label="Tokenizer B",
                    )
                compare_english_text = gr.Textbox(
                    label="English Equivalent (optional)",
                    placeholder="Paste English translation for RTC comparison...",
                    lines=2,
                )
                compare_btn = gr.Button("Compare", variant="primary")
                with gr.Row():
                    cmp_html_a = gr.HTML(label="Tokenizer A")
                    cmp_html_b = gr.HTML(label="Tokenizer B")
                # NOTE(review): unlike the Single tab, this checkbox is
                # created after the button and the output row, so it
                # presumably renders below them; confirm this is intended.
                compare_decoded_view = gr.Checkbox(
                    label="Readable token view (decode tokens, hide special tokens)",
                    value=False,
                )
                cmp_ratio_md = gr.Markdown(label="Comparison")

                # Inputs follow _handle_compare(text, name_a, name_b,
                # decoded_view, english_text).
                compare_btn.click(
                    fn=_handle_compare,
                    inputs=[compare_text, cmp_model_a, cmp_model_b, compare_decoded_view, compare_english_text],
                    outputs=[cmp_html_a, cmp_html_b, cmp_ratio_md],
                )

    return demo