llm-ready-data / app /models /domain.py
light-infer-chat's picture
ok
25bbb06
Raw
History Blame Contribute Delete
1.04 kB
from __future__ import annotations
from dataclasses import dataclass, field
import tiktoken
# Module-level memo of tiktoken encodings, keyed by encoding name, so each
# encoding is requested from tiktoken at most once through this module.
_ENCODING_CACHE: dict[str, tiktoken.Encoding] = {}


def _get_encoding(name: str = "o200k_base") -> tiktoken.Encoding:
    """Return the tiktoken encoding called *name*, memoized in this module.

    The first request for a given name delegates to
    ``tiktoken.get_encoding`` and stores the result in ``_ENCODING_CACHE``;
    later requests are served from that dict. Any exception raised by
    ``tiktoken.get_encoding`` propagates and nothing is cached.
    """
    cached = _ENCODING_CACHE.get(name)
    if cached is None:
        cached = tiktoken.get_encoding(name)
        _ENCODING_CACHE[name] = cached
    return cached
def count_tokens(text: str, encoding_name: str = "o200k_base") -> int:
    """Return how many tokens *text* encodes to under *encoding_name*.

    Falsy input (empty string or ``None``) short-circuits to ``0`` without
    loading any encoding. Otherwise the text is encoded with the encoding
    obtained from ``_get_encoding`` and the number of token ids is returned.
    """
    if text:
        encoder = _get_encoding(encoding_name)
        # disallowed_special=() turns off tiktoken's special-token check, so
        # text that happens to contain special-token markers is encoded as
        # ordinary text rather than raising.
        token_ids = encoder.encode(text, disallowed_special=())
        return len(token_ids)
    return 0
@dataclass(frozen=True)
class ConversionResult:
    """Immutable record describing one conversion that produced Markdown.

    All values are stored exactly as supplied by the caller; nothing is
    computed or validated here except ``token_estimate``, which is derived
    from ``markdown`` on access.

    Instances are hashable: ``metadata`` is a ``dict`` (unhashable), so it
    is excluded from the generated ``__hash__`` while still taking part in
    equality comparison. Two results that differ only in ``metadata``
    therefore compare unequal but share a hash, which is permitted by the
    hash/eq contract.
    """

    source: str
    markdown: str
    char_count: int
    word_count: int
    line_count: int
    duration_ms: float
    file_size_bytes: int
    mime_type: str
    content_hash: str
    # hash=False keeps the frozen dataclass hashable despite the dict value;
    # default_factory gives every instance its own fresh dict.
    metadata: dict = field(default_factory=dict, hash=False)

    @property
    def token_estimate(self) -> int:
        """Token count of ``markdown`` via ``count_tokens``, never below 1.

        Recomputed on every access (the class is frozen and keeps no cache).
        """
        return max(1, count_tokens(self.markdown))
@dataclass(frozen=True)
class ConversionError:
    """Immutable record describing one conversion that failed.

    Holds four caller-supplied values, stored as given: the ``source`` that
    was being processed, an ``error_type`` label, a ``message`` string and
    ``duration_ms``. Field meanings beyond their names and types are not
    established in this module.
    """

    source: str
    error_type: str
    message: str
    duration_ms: float