File size: 707 Bytes
708f4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""
Crayon Core Module.

Contains the fundamental algorithms and data structures for tokenization:
1. Tokenizer (The algorithmic driver)
2. Vocabulary (The data structure)
3. Primitives (Metadata structures)
4. Vocab Builder (Entropy-guided construction)
"""

from .tokenizer import crayon_tokenize
from .vocabulary import CrayonVocab
from .primitives import TokenMetadata
from .vocab_builder import (
    EntropyVocabBuilder,
    construct_optimal_vocabulary,
    deterministic_sort_key,
    assign_stable_ids
)

# Public names exported by a star-import of this package; the entries are
# grouped by the submodule each name is imported from.
__all__ = [
    # .tokenizer
    "crayon_tokenize",
    # .vocabulary
    "CrayonVocab",
    # .primitives
    "TokenMetadata",
    # .vocab_builder
    "EntropyVocabBuilder",
    "construct_optimal_vocabulary",
    "deterministic_sort_key",
    "assign_stable_ids",
]