Spaces:
Running
Running
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| import tiktoken | |
# Module-level memo of already-loaded tiktoken encodings, keyed by encoding name.
_ENCODING_CACHE: dict[str, tiktoken.Encoding] = {}


def _get_encoding(name: str = "o200k_base") -> tiktoken.Encoding:
    """Return the tiktoken encoding registered under *name*.

    The encoding is fetched with ``tiktoken.get_encoding`` the first time a
    name is requested and stored in ``_ENCODING_CACHE``; later calls with the
    same name return the stored object.

    Args:
        name: Encoding name passed to ``tiktoken.get_encoding``.

    Returns:
        The cached ``tiktoken.Encoding`` for *name*.
    """
    encoding = _ENCODING_CACHE.get(name)
    if encoding is None:
        # First request for this name: load it and remember it.
        encoding = _ENCODING_CACHE[name] = tiktoken.get_encoding(name)
    return encoding
def count_tokens(text: str, encoding_name: str = "o200k_base") -> int:
    """Count the tokens *text* encodes to under the named encoding.

    Args:
        text: Text to tokenize. Any falsy value (e.g. ``""``) counts as 0
            without touching the encoder.
        encoding_name: Encoding name handed to ``_get_encoding``.

    Returns:
        Number of token ids produced for *text*.
    """
    if text:
        encoder = _get_encoding(encoding_name)
        # disallowed_special=() turns off tiktoken's special-token check, so
        # text that looks like a special token is encoded as ordinary text
        # instead of raising.
        token_ids = encoder.encode(text, disallowed_special=())
        return len(token_ids)
    return 0
@dataclass
class ConversionResult:
    """Record describing one successful conversion to Markdown.

    The ``@dataclass`` decorator is required: the fields are declared by
    annotation only and ``metadata`` relies on ``field(default_factory=dict)``,
    which only takes effect on a dataclass. Without it no ``__init__`` is
    generated and ``metadata`` would be a single shared ``Field`` object.

    Field meanings below are inferred from their names; confirm against the
    code that builds these records.
    """

    source: str  # identifier of the converted input
    markdown: str  # converted Markdown text; read by token_estimate()
    char_count: int
    word_count: int
    line_count: int
    duration_ms: float
    file_size_bytes: int
    mime_type: str
    content_hash: str
    # Each instance gets its own dict, so mutating one result's metadata
    # never leaks into another.
    metadata: dict = field(default_factory=dict)

    # NOTE(review): kept as a plain method, as the visible source shows;
    # confirm callers invoke it as ``result.token_estimate()``.
    def token_estimate(self) -> int:
        """Return the token count of ``markdown``, never less than 1.

        Counting is delegated to ``count_tokens`` with its default encoding;
        the result is clamped so an empty document still reports 1.
        """
        return max(1, count_tokens(self.markdown))
@dataclass
class ConversionError:
    """Record describing one failed conversion.

    This is a plain data record, not an ``Exception`` subclass. The
    ``@dataclass`` decorator generates the ``__init__``/``__repr__``/``__eq__``
    that the annotation-only field declarations rely on.

    Field meanings below are inferred from their names; confirm against the
    code that builds these records.
    """

    source: str  # identifier of the input that failed
    error_type: str
    message: str
    duration_ms: float