"""Token counting helpers and result/error records for conversions."""

from __future__ import annotations

from dataclasses import dataclass, field

import tiktoken

# Encodings already obtained from tiktoken, keyed by encoding name, so each
# name is resolved through ``tiktoken.get_encoding`` at most once per process.
_ENCODING_CACHE: dict[str, tiktoken.Encoding] = {}


def _get_encoding(name: str = "o200k_base") -> tiktoken.Encoding:
    """Return the tiktoken encoding called *name*, reusing a cached instance.

    Args:
        name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        The encoding object stored in ``_ENCODING_CACHE`` for *name*.
    """
    encoding = _ENCODING_CACHE.get(name)
    if encoding is None:
        # First request for this name: resolve it and remember the result.
        encoding = tiktoken.get_encoding(name)
        _ENCODING_CACHE[name] = encoding
    return encoding


def count_tokens(text: str, encoding_name: str = "o200k_base") -> int:
    """Return how many tokens *text* encodes to under *encoding_name*.

    Args:
        text: The string to tokenize. Falsy input (e.g. ``""``) yields 0
            without touching the tokenizer.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        The length of the token sequence produced for *text*.
    """
    if not text:
        return 0
    encoder = _get_encoding(encoding_name)
    # An empty ``disallowed_special`` disables tiktoken's special-token check,
    # so text that looks like a special token is encoded as ordinary text.
    token_ids = encoder.encode(text, disallowed_special=())
    return len(token_ids)


@dataclass(frozen=True)
class ConversionResult:
    """Immutable record holding converted Markdown plus bookkeeping fields.

    Every field is stored exactly as supplied to the constructor; nothing in
    this class validates the values or derives one from another (for
    example, ``char_count`` is not recomputed from ``markdown``).
    """

    # NOTE(review): frozen=True together with the default eq=True makes the
    # dataclass generate ``__hash__`` over all fields, and ``metadata`` is a
    # dict, so ``hash(instance)`` raises TypeError. Confirm no caller relies
    # on hashing instances.
    source: str
    markdown: str
    char_count: int
    word_count: int
    line_count: int
    duration_ms: float
    file_size_bytes: int
    mime_type: str
    content_hash: str
    # Fresh empty dict per instance unless the caller supplies one.
    metadata: dict = field(default_factory=dict)

    @property
    def token_estimate(self) -> int:
        """Token count of ``markdown`` via ``count_tokens``, never below 1."""
        counted = count_tokens(self.markdown)
        # Empty markdown counts as 0 tokens; the reported value is floored at 1.
        return counted if counted >= 1 else 1


@dataclass(frozen=True)
class ConversionError:
    """Immutable record describing a conversion error.

    Fields are stored as supplied; no validation is performed here.
    """

    source: str
    error_type: str
    message: str
    duration_ms: float