# Uploaded by anthonym21 with huggingface_hub (json_tokenizer/__init__.py, commit d54e053, verified).
"""
json_tokenizer — A tokenizer optimized for JSON structures.

Architecture:
- Structural tokens: single-token representations for JSON grammar ({, }, [, ], :, ,)
- Key tokens: deduplicated key vocabulary with a Key() wrapper
- Value BPE: byte-pair encoding trained on JSON string/number values
- Type tokens: explicit type markers for faithful roundtrip encoding

Delivers 5-15% fewer tokens than cl100k_base on schema-repetitive JSON
with a 90x smaller vocabulary, and lossless roundtrip fidelity.
"""
from json_tokenizer.tokenizer import JSONTokenizer
from json_tokenizer.bpe import BPETrainer
# Package version string.
__version__ = "0.2.0"

# Names that are always exported, regardless of optional dependencies.
__all__ = ["JSONTokenizer", "BPETrainer"]

# Optional export: only the import itself is guarded, so an ImportError raised
# while loading hf_compat leaves the public API at the two core names above.
try:
    from json_tokenizer.hf_compat import JSONPreTrainedTokenizer
except ImportError:
    # transformers not installed
    pass
else:
    __all__.append("JSONPreTrainedTokenizer")