# CRAYON-tokenizer/src/crayon/core/__init__.py
# Phase-Technologies's picture
# Upload folder using huggingface_hub
# 708f4a3 verified
"""
Crayon Core Module.
Contains the fundamental algorithms and data structures for tokenization:
1. Tokenizer (The algorithmic driver)
2. Vocabulary (The data structure)
3. Primitives (Metadata structures)
4. Vocab Builder (Entropy-guided construction)
"""
from .tokenizer import crayon_tokenize
from .vocabulary import CrayonVocab
from .primitives import TokenMetadata
from .vocab_builder import (
EntropyVocabBuilder,
construct_optimal_vocabulary,
deterministic_sort_key,
assign_stable_ids
)
# Names exported by a star-import of this package. Every entry is bound by
# one of the relative imports at the top of this module.
__all__ = [
    # from .tokenizer
    "crayon_tokenize",
    # from .vocabulary
    "CrayonVocab",
    # from .primitives
    "TokenMetadata",
    # from .vocab_builder
    "EntropyVocabBuilder",
    "construct_optimal_vocabulary",
    "deterministic_sort_key",
    "assign_stable_ids",
]