File size: 5,655 Bytes
708f4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
XERV Crayon: Production-Grade Omni-Backend Tokenizer
=====================================================

A high-performance tokenizer achieving >2M tokens/s via:
- AVX2/AVX-512 SIMD optimizations (CPU)
- NVIDIA CUDA kernels (GPU)
- AMD ROCm/HIP kernels (GPU)
- Entropy-guided vocabulary construction
- Cache-aligned Double-Array Trie data structures

Quick Start:
    >>> from crayon import CrayonVocab
    >>> 
    >>> # Auto-detect best device (GPU if available, else CPU)
    >>> vocab = CrayonVocab(device="auto")
    >>> vocab.load_profile("lite")
    >>> tokens = vocab.tokenize("Hello, world!")
    >>> 
    >>> # Batch processing
    >>> batch_tokens = vocab.tokenize(["text 1", "text 2", "text 3"])
    >>> 
    >>> # Decode back to text
    >>> text = vocab.decode(tokens)

Device Selection:
    >>> vocab = CrayonVocab(device="cpu")   # Force CPU (lowest latency)
    >>> vocab = CrayonVocab(device="cuda")  # Force NVIDIA GPU
    >>> vocab = CrayonVocab(device="rocm")  # Force AMD GPU
    >>> vocab = CrayonVocab(device="auto")  # Auto-detect best

Profile Management:
    >>> vocab.load_profile("lite")      # General purpose
    >>> vocab.load_profile("standard")  # Larger general purpose
    >>> 
    >>> # Context manager for temporary switch
    >>> with vocab.using_profile("standard"):
    ...     tokens = vocab.tokenize(source_code)

Environment Variables:
    CRAYON_DEVICE: Override device selection (cpu|cuda|rocm)
    CRAYON_PROFILE_DIR: Custom profile search directory
"""

from __future__ import annotations

# Package version string; this is the value returned by get_version().
__version__ = "5.1.0"
# Author/maintainer attribution exported alongside the version.
__author__ = "Xerv Research Engineering Division"

# ============================================================================
# CORE IMPORTS
# ============================================================================

from .core.tokenizer import crayon_tokenize
from .core.vocabulary import (
    CrayonVocab,
    DeviceType,
    DeviceState,
    HardwareInfo,
    quick_tokenize,
    enable_verbose_logging,
    disable_verbose_logging,
)

# ============================================================================
# OPTIONAL IMPORTS (May not be available in minimal installs)
# ============================================================================

# Each optional component is imported on a best-effort basis. When the import
# raises ImportError the public name is bound to None instead, so the name is
# always defined on the package; callers should check for None before use.

# Pipeline-based tokenizer from the concurrency subpackage.
try:
    from .concurrency.pipeline import PipelineTokenizer
except ImportError:
    PipelineTokenizer = None  # type: ignore

# Zero-copy tokenizer from the memory subpackage.
try:
    from .memory.zerocopy import ZeroCopyTokenizer
except ImportError:
    ZeroCopyTokenizer = None  # type: ignore

# Vocabulary training helpers; both names fall back together because they
# come from the same import statement.
try:
    from .training import train_vocabulary, build_default_vocabulary
except ImportError:
    train_vocabulary = None  # type: ignore
    build_default_vocabulary = None  # type: ignore


# ============================================================================
# BACKEND UTILITIES
# ============================================================================

def get_version() -> str:
    """
    Report the version of this package.

    Returns:
        The module-level ``__version__`` string.
    """
    current_version = __version__
    return current_version


def check_c_extension() -> bool:
    """
    Report whether the compiled CPU backend is usable.

    The ``crayon_cpu`` module is imported lazily from the ``c_ext``
    subpackage and probed for the entry points this check requires.

    Returns:
        True when ``crayon_cpu`` imports and exposes both ``tokenize`` and
        ``load_dat``; False when an attribute is missing or an ImportError
        is raised.
    """
    try:
        from .c_ext import crayon_cpu as cpu_module
        # all() short-circuits on the first missing attribute, in the same
        # order as the names are listed.
        return all(
            hasattr(cpu_module, entry_point)
            for entry_point in ('tokenize', 'load_dat')
        )
    except ImportError:
        return False


def check_backends() -> dict:
    """
    Summarise which tokenizer backends can be used.

    Returns:
        Dictionary with the keys ``"cpu"``, ``"cuda"`` and ``"rocm"``.
        The ``"cpu"`` entry comes from ``check_c_extension()``; the GPU
        entries come from the ``c_ext`` probes and are reported as False
        when an ImportError is raised.

    Example:
        >>> from crayon import check_backends
        >>> backends = check_backends()
        >>> print(backends)
        {'cpu': True, 'cuda': True, 'rocm': False}
    """
    try:
        from .c_ext import is_cuda_available as probe_cuda
        from .c_ext import is_rocm_available as probe_rocm
        cpu_ok = check_c_extension()
        cuda_ok = probe_cuda()
        rocm_ok = probe_rocm()
    except ImportError:
        # GPU probes are unavailable: only the CPU extension is checked.
        cpu_ok = check_c_extension()
        cuda_ok = False
        rocm_ok = False

    return {
        "cpu": cpu_ok,
        "cuda": cuda_ok,
        "rocm": rocm_ok,
    }


def get_backend_info() -> dict:
    """
    Collect detailed backend information.

    Delegates to ``get_backend_info`` from the ``c_ext`` subpackage. If an
    ImportError is raised, a minimal report containing only the CPU
    availability flag is returned instead.

    Returns:
        The dictionary produced by the ``c_ext`` helper, or
        ``{"cpu": {"available": <bool>}}`` as the fallback.
    """
    try:
        from .c_ext import get_backend_info as native_backend_info
        report = native_backend_info()
    except ImportError:
        cpu_available = check_c_extension()
        report = {"cpu": {"available": cpu_available}}
    return report


def check_resources() -> dict:
    """
    Report which optional vocabulary-building resources can be used.

    Delegates to ``check_resource_availability`` from the ``resources``
    subpackage. If an ImportError is raised, a fixed fallback is returned
    that marks only the built-in resource as available.

    Returns:
        Dictionary mapping resource flags to their availability status.
    """
    try:
        from .resources import check_resource_availability as probe_resources
        availability = probe_resources()
    except ImportError:
        availability = {
            "requests_available": False,
            "huggingface_available": False,
            "builtin_available": True
        }
    return availability


# ============================================================================
# PUBLIC API
# ============================================================================

# Names exported by ``from crayon import *``. Every entry is bound at module
# level above: the optional-module names are always defined, but are set to
# None when their import raised ImportError.
__all__ = [
    # Version
    "__version__",
    "__author__",
    "get_version",
    
    # Core
    "CrayonVocab",
    "crayon_tokenize",
    "quick_tokenize",
    "DeviceType",
    "DeviceState",
    "HardwareInfo",
    
    # Logging
    "enable_verbose_logging",
    "disable_verbose_logging",
    
    # Backend checks
    "check_c_extension",
    "check_backends",
    "get_backend_info",
    "check_resources",
    
    # Optional modules (may be None)
    "PipelineTokenizer",
    "ZeroCopyTokenizer",
    "train_vocabulary",
    "build_default_vocabulary",
]