"""
XERV Crayon: Production-Grade Omni-Backend Tokenizer
=====================================================
A high-performance tokenizer achieving >2M tokens/s via:
- AVX2/AVX-512 SIMD optimizations (CPU)
- NVIDIA CUDA kernels (GPU)
- AMD ROCm/HIP kernels (GPU)
- Entropy-guided vocabulary construction
- Cache-aligned Double-Array Trie data structures
Quick Start:
>>> from crayon import CrayonVocab
>>>
>>> # Auto-detect best device (GPU if available, else CPU)
>>> vocab = CrayonVocab(device="auto")
>>> vocab.load_profile("lite")
>>> tokens = vocab.tokenize("Hello, world!")
>>>
>>> # Batch processing
>>> batch_tokens = vocab.tokenize(["text 1", "text 2", "text 3"])
>>>
>>> # Decode back to text
>>> text = vocab.decode(tokens)
Device Selection:
>>> vocab = CrayonVocab(device="cpu") # Force CPU (lowest latency)
>>> vocab = CrayonVocab(device="cuda") # Force NVIDIA GPU
>>> vocab = CrayonVocab(device="rocm") # Force AMD GPU
>>> vocab = CrayonVocab(device="auto") # Auto-detect best
Profile Management:
>>> vocab.load_profile("lite") # General purpose
>>> vocab.load_profile("standard") # Larger general purpose
>>>
>>> # Context manager for temporary switch
>>> with vocab.using_profile("standard"):
...     tokens = vocab.tokenize(source_code)
Environment Variables:
CRAYON_DEVICE: Override device selection (cpu|cuda|rocm)
CRAYON_PROFILE_DIR: Custom profile search directory
"""
from __future__ import annotations
# Package release identifier; this is the value get_version() returns.
__version__ = "5.1.0"
# Attribution string; exported alongside __version__ through __all__.
__author__ = "Xerv Research Engineering Division"
# ============================================================================
# CORE IMPORTS
# ============================================================================
from .core.tokenizer import crayon_tokenize
from .core.vocabulary import (
CrayonVocab,
DeviceType,
DeviceState,
HardwareInfo,
quick_tokenize,
enable_verbose_logging,
disable_verbose_logging,
)
# ============================================================================
# OPTIONAL IMPORTS (May not be available in minimal installs)
# ============================================================================
# Every optional component follows the same contract: if its subpackage
# cannot be imported, the public name is still defined but bound to None,
# so callers should test for None before use.

# Pipeline-based tokenizer from the concurrency subpackage.
try:
    from .concurrency.pipeline import PipelineTokenizer
except ImportError:
    PipelineTokenizer = None  # type: ignore

# Zero-copy tokenizer from the memory subpackage.
try:
    from .memory.zerocopy import ZeroCopyTokenizer
except ImportError:
    ZeroCopyTokenizer = None  # type: ignore

# Vocabulary training helpers; both names come from one import, so they
# are either both available or both None.
try:
    from .training import train_vocabulary, build_default_vocabulary
except ImportError:
    train_vocabulary = None  # type: ignore
    build_default_vocabulary = None  # type: ignore
# ============================================================================
# BACKEND UTILITIES
# ============================================================================
def get_version() -> str:
    """
    Report the package release string.

    Returns:
        The module-level ``__version__`` value, unchanged.
    """
    return __version__
def check_c_extension() -> bool:
    """
    Report whether the compiled CPU backend can be used.

    Returns:
        True when ``crayon_cpu`` imports from ``.c_ext`` and exposes both
        ``tokenize`` and ``load_dat``; False when the import fails or either
        attribute is missing.
    """
    try:
        from .c_ext import crayon_cpu

        # The attribute probe stays inside the try so that an ImportError
        # raised while touching the module is treated as "not available".
        required_entry_points = ("tokenize", "load_dat")
        return all(hasattr(crayon_cpu, attr) for attr in required_entry_points)
    except ImportError:
        return False
def check_backends() -> dict:
    """
    Summarise which tokenizer backends are usable.

    Returns:
        Dictionary with the keys ``"cpu"``, ``"cuda"`` and ``"rocm"``.
        The CPU entry is the result of ``check_c_extension()``; the GPU
        entries come from the ``.c_ext`` probes, or are False when those
        probes cannot be imported.

    Example:
        >>> from crayon import check_backends
        >>> backends = check_backends()
        >>> print(backends)
        {'cpu': True, 'cuda': True, 'rocm': False}
    """
    try:
        from .c_ext import is_cuda_available, is_rocm_available

        cpu_ok = check_c_extension()
        cuda_ok = is_cuda_available()
        rocm_ok = is_rocm_available()
    except ImportError:
        # Without the GPU probes only the CPU status can be determined.
        cpu_ok, cuda_ok, rocm_ok = check_c_extension(), False, False
    return {
        "cpu": cpu_ok,
        "cuda": cuda_ok,
        "rocm": rocm_ok,
    }
def get_backend_info() -> dict:
    """
    Return the detailed backend report.

    Returns:
        Whatever ``.c_ext.get_backend_info()`` produces when it can be
        imported. Otherwise a minimal dictionary holding only the CPU
        availability flag: ``{"cpu": {"available": <bool>}}``.
    """
    try:
        # Aliased so the imported helper does not shadow this function.
        from .c_ext import get_backend_info as _native_backend_info

        return _native_backend_info()
    except ImportError:
        cpu_available = check_c_extension()
        return {"cpu": {"available": cpu_available}}
def check_resources() -> dict:
    """
    Report which optional vocabulary-building resources are available.

    Returns:
        The result of ``.resources.check_resource_availability()`` when that
        module can be imported. Otherwise a fixed fallback that marks the
        ``requests`` and Hugging Face entries as unavailable and the
        built-in entry as available.
    """
    try:
        from .resources import check_resource_availability as _probe_resources

        return _probe_resources()
    except ImportError:
        fallback_status = {
            "requests_available": False,
            "huggingface_available": False,
            "builtin_available": True
        }
        return fallback_status
# ============================================================================
# PUBLIC API
# ============================================================================
# Names exported by ``from crayon import *``, grouped by purpose.
__all__ = [
    # Package metadata
    "__version__",
    "__author__",
    "get_version",
    # Core tokenizer API
    "CrayonVocab",
    "crayon_tokenize",
    "quick_tokenize",
    "DeviceType",
    "DeviceState",
    "HardwareInfo",
    # Logging toggles
    "enable_verbose_logging",
    "disable_verbose_logging",
    # Backend and resource probes
    "check_c_extension",
    "check_backends",
    "get_backend_info",
    "check_resources",
    # Optional components (bound to None when their import fails)
    "PipelineTokenizer",
    "ZeroCopyTokenizer",
    "train_vocabulary",
    "build_default_vocabulary",
]