File size: 5,655 Bytes

708f4a3

"""
XERV Crayon: Production-Grade Omni-Backend Tokenizer
=====================================================

A high-performance tokenizer achieving >2M tokens/s via:
- AVX2/AVX-512 SIMD optimizations (CPU)
- NVIDIA CUDA kernels (GPU)
- AMD ROCm/HIP kernels (GPU)
- Entropy-guided vocabulary construction
- Cache-aligned Double-Array Trie data structures

Quick Start:
    >>> from crayon import CrayonVocab
    >>> 
    >>> # Auto-detect best device (GPU if available, else CPU)
    >>> vocab = CrayonVocab(device="auto")
    >>> vocab.load_profile("lite")
    >>> tokens = vocab.tokenize("Hello, world!")
    >>> 
    >>> # Batch processing
    >>> batch_tokens = vocab.tokenize(["text 1", "text 2", "text 3"])
    >>> 
    >>> # Decode back to text
    >>> text = vocab.decode(tokens)

Device Selection:
    >>> vocab = CrayonVocab(device="cpu")   # Force CPU (lowest latency)
    >>> vocab = CrayonVocab(device="cuda")  # Force NVIDIA GPU
    >>> vocab = CrayonVocab(device="rocm")  # Force AMD GPU
    >>> vocab = CrayonVocab(device="auto")  # Auto-detect best

Profile Management:
    >>> vocab.load_profile("lite")      # General purpose
    >>> vocab.load_profile("standard")  # Larger general purpose
    >>> 
    >>> # Context manager for temporary switch
    >>> with vocab.using_profile("standard"):
    ...     tokens = vocab.tokenize(source_code)

Environment Variables:
    CRAYON_DEVICE: Override device selection (cpu|cuda|rocm)
    CRAYON_PROFILE_DIR: Custom profile search directory
"""

from __future__ import annotations

__version__ = "5.1.0"
__author__ = "Xerv Research Engineering Division"

# ============================================================================
# CORE IMPORTS
# ============================================================================

from .core.tokenizer import crayon_tokenize
from .core.vocabulary import (
    CrayonVocab,
    DeviceType,
    DeviceState,
    HardwareInfo,
    quick_tokenize,
    enable_verbose_logging,
    disable_verbose_logging,
)

# ============================================================================
# OPTIONAL IMPORTS (May not be available in minimal installs)
# ============================================================================

try:
    from .concurrency.pipeline import PipelineTokenizer
except ImportError:
    PipelineTokenizer = None  # type: ignore

try:
    from .memory.zerocopy import ZeroCopyTokenizer
except ImportError:
    ZeroCopyTokenizer = None  # type: ignore

try:
    from .training import train_vocabulary, build_default_vocabulary
except ImportError:
    train_vocabulary = None  # type: ignore
    build_default_vocabulary = None  # type: ignore


# ============================================================================
# BACKEND UTILITIES
# ============================================================================

def get_version() -> str:
    """Return the package version string."""
    return __version__


def check_c_extension() -> bool:
    """
    Check if the core C extension is available.
    
    Returns:
        True if crayon_cpu extension is loaded and functional.
    """
    try:
        from .c_ext import crayon_cpu
        return hasattr(crayon_cpu, 'tokenize') and hasattr(crayon_cpu, 'load_dat')
    except ImportError:
        return False


def check_backends() -> dict:
    """
    Check availability of all backends.
    
    Returns:
        Dictionary with status for cpu, cuda, and rocm backends.
        
    Example:
        >>> from crayon import check_backends
        >>> backends = check_backends()
        >>> print(backends)
        {'cpu': True, 'cuda': True, 'rocm': False}
    """
    try:
        from .c_ext import is_cuda_available, is_rocm_available
        return {
            "cpu": check_c_extension(),
            "cuda": is_cuda_available(),
            "rocm": is_rocm_available(),
        }
    except ImportError:
        return {
            "cpu": check_c_extension(),
            "cuda": False,
            "rocm": False,
        }


def get_backend_info() -> dict:
    """
    Get detailed information about all backends.
    
    Returns:
        Dictionary with availability, hardware info, and errors for each backend.
    """
    try:
        from .c_ext import get_backend_info as _get_backend_info
        return _get_backend_info()
    except ImportError:
        return {"cpu": {"available": check_c_extension()}}


def check_resources() -> dict:
    """
    Check availability of optional resources for vocabulary building.
    
    Returns:
        Dictionary with availability status for each resource type.
    """
    try:
        from .resources import check_resource_availability
        return check_resource_availability()
    except ImportError:
        return {
            "requests_available": False,
            "huggingface_available": False,
            "builtin_available": True
        }


# ============================================================================
# PUBLIC API
# ============================================================================

__all__ = [
    # Version
    "__version__",
    "__author__",
    "get_version",
    
    # Core
    "CrayonVocab",
    "crayon_tokenize",
    "quick_tokenize",
    "DeviceType",
    "DeviceState",
    "HardwareInfo",
    
    # Logging
    "enable_verbose_logging",
    "disable_verbose_logging",
    
    # Backend checks
    "check_c_extension",
    "check_backends",
    "get_backend_info",
    "check_resources",
    
    # Optional modules (may be None)
    "PipelineTokenizer",
    "ZeroCopyTokenizer",
    "train_vocabulary",
    "build_default_vocabulary",
]