| """Token counting utilities using tiktoken.""" |
|
|
| from typing import Any |
|
|
| import tiktoken |
|
|
|
|
# Module-level cache for the encoder; remains None until _get_encoder()
# populates it on first use.
_encoder = None


def _get_encoder():
    """Return the shared ``cl100k_base`` encoder, creating it on first use.

    The encoder object is stored in the module-level ``_encoder`` variable,
    so ``tiktoken.get_encoding`` is only called while that cache is still
    ``None``.
    """
    global _encoder
    if _encoder is not None:
        return _encoder
    _encoder = tiktoken.get_encoding("cl100k_base")
    return _encoder
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Estimate the number of tokens in a text string.

    Args:
        text: The text to measure. Empty input counts as zero tokens and
            does not load the encoder.

    Returns:
        The number of tokens the module's encoder produces for ``text``.
    """
    if not text:
        return 0
    # By default tiktoken raises ValueError when the input contains
    # special-token text such as "<|endoftext|>". An estimator must accept
    # arbitrary text, so disable that check and count it as ordinary text.
    return len(_get_encoder().encode(text, disallowed_special=()))
|
|
|
|
def _estimate_block_tokens(block: Any) -> int:
    """Estimate the tokens carried by one entry of a list-style message content."""
    if not isinstance(block, dict):
        # Non-dict entries are measured by their string form.
        return estimate_tokens(str(block))
    block_type = block.get("type")
    if block_type == "text":
        # ``or ""`` keeps an explicit ``"text": None`` from reaching the encoder.
        return estimate_tokens(block.get("text") or "")
    if block_type == "tool_use":
        return estimate_tokens(str(block.get("input", {})))
    if block_type == "tool_result":
        return estimate_tokens(str(block.get("content", "")))
    # Unknown block types are measured by the string form of the whole block.
    return estimate_tokens(str(block))


def estimate_messages_tokens(messages: list[dict[str, Any]]) -> int:
    """Estimate total tokens across a list of messages.

    Every message contributes a fixed overhead of 4 tokens for role and
    message framing, plus the tokens of its ``content``:

    * ``str`` content is measured directly.
    * ``list`` content is measured block by block: ``text`` blocks by their
      ``text``, ``tool_use`` blocks by the string form of ``input``,
      ``tool_result`` blocks by the string form of ``content``, and anything
      else by the string form of the block itself.
    * ``None`` content adds no content tokens (only the overhead).
    * Any other content value is measured by its string form.

    Args:
        messages: Message dicts; only the ``"content"`` key is read. A
            missing key is treated as an empty string.

    Returns:
        The estimated token total; 0 for an empty list.
    """
    per_message_overhead = 4
    total = 0
    for msg in messages:
        # The framing overhead applies to every message, including those
        # whose content is None or of an unexpected type.
        total += per_message_overhead
        content = msg.get("content", "")
        if content is None:
            continue
        if isinstance(content, str):
            total += estimate_tokens(content)
        elif isinstance(content, list):
            total += sum(_estimate_block_tokens(block) for block in content)
        else:
            total += estimate_tokens(str(content))
    return total
|
|