Spaces:

XQ
/

Dokumentassistent

Running

File size: 3,050 Bytes

4d2a2da

"""Token counting and budget tracking for LLM prompts.

Stage 1 (current): measure-only — counts tokens at known prompt injection
points and logs them. No truncation is performed.

Token counts are estimates: tiktoken's cl100k tokenizer is used as a
provider-agnostic baseline, multiplied by a safety factor because non-OpenAI
multilingual tokenizers (Llama, Gemma, Mistral) typically produce 20-40% more
tokens than cl100k for Danish / mixed-language text.

Provider-specific tokenizers (Ollama's /api/tokenize, HuggingFace AutoTokenizer)
are intentionally not used here to keep this module dependency-free and
process-local. When real usage data exposes the gap, swap in a
provider-aware backend.
"""

from __future__ import annotations

import logging
import math

logger = logging.getLogger(__name__)

# Conservative scaling: cl100k under-counts multilingual / Danish text.
# 1.5× keeps us on the safe side for budget decisions. This is the default
# ``safety_factor`` of count_tokens() and measure().
_DEFAULT_SAFETY_FACTOR = 1.5

# Fallback when tiktoken is unavailable: ~4 characters per token is the
# common rule of thumb for English; multiplied by safety factor it's
# usable as a coarse upper bound. count_tokens() floor-divides len(text)
# by this value when no encoder is loaded.
_CHARS_PER_TOKEN_FALLBACK = 4

# Build the shared cl100k encoder once, at import time. If importing tiktoken
# or loading the encoding raises anything, _ENCODER is set to None, a warning
# is logged, and count_tokens() uses the character-based heuristic instead —
# the module itself still imports successfully.
# NOTE(review): get_encoding() presumably has to load (possibly fetch) the
# vocabulary data, which would explain catching every Exception — confirm.
try:
    import tiktoken

    _ENCODER = tiktoken.get_encoding("cl100k_base")
except Exception:  # noqa: BLE001 — any tiktoken failure → heuristic
    _ENCODER = None
    logger.warning("tiktoken unavailable; falling back to character-based token estimation")


def count_tokens(text: str, *, safety_factor: float = _DEFAULT_SAFETY_FACTOR) -> int:
    """Estimate the token count of ``text``.

    The raw count comes from the module-level cl100k encoder when it was
    loaded, otherwise from a characters-per-token heuristic. The raw count
    is then multiplied by ``safety_factor`` and rounded up, so the estimate
    never falls below the scaled value.

    Args:
        text: Text to measure. Empty / None-ish input returns 0.
        safety_factor: Multiplier applied to the raw count to compensate
            for non-OpenAI tokenizers being more aggressive on multilingual
            text. Defaults to 1.5×.

    Returns:
        Estimated token count, rounded up (ceiling) to an int.
    """
    if not text:
        return 0
    if _ENCODER is not None:
        # disallowed_special=() turns off tiktoken's special-token check, so
        # prompt text that happens to contain a special-token literal is
        # counted as ordinary text instead of raising.
        raw = len(_ENCODER.encode(text, disallowed_special=()))
    else:
        # Non-empty text always counts as at least one token.
        raw = max(1, len(text) // _CHARS_PER_TOKEN_FALLBACK)
    # Ceiling, not round-half-up: for a budget estimate a fractional token
    # must push the count up, never down. Results for the default 1.5 factor
    # are unchanged. Floating-point error in the product should at worst bump
    # the estimate by one, which is the safe direction.
    return math.ceil(raw * safety_factor)


def measure(
    prompt_name: str,
    text: str,
    *,
    enabled: bool = True,
    safety_factor: float = _DEFAULT_SAFETY_FACTOR,
) -> int:
    """Estimate the token count of a rendered prompt and log it.

    Args:
        prompt_name: Label for the prompt injection point; it is written
            into the log line so individual prompts can be grepped.
        text: The rendered prompt string to measure.
        enabled: Master switch. When False nothing is counted or logged
            and 0 is returned, so callers can pass their feature flag
            (e.g. ``TOKEN_BUDGET_ENABLED``) straight through.
        safety_factor: Forwarded unchanged to :func:`count_tokens`.

    Returns:
        The estimated token count, or 0 when ``enabled`` is False.
    """
    estimate = 0
    if enabled:
        estimate = count_tokens(text, safety_factor=safety_factor)
        # Lazy %-style arguments: formatting happens only if INFO is emitted.
        logger.info("token_budget prompt=%s tokens~=%d", prompt_name, estimate)
    return estimate