|
|
| """ |
| CRAYON UNIVERSAL TOKENIZER DEMO |
| =============================== |
| This script demonstrates the production-ready Crayon Tokenizer API. |
| It is designed to work seamlessly across: |
| - Local Machine (Windows/Linux/Mac) |
| - Google Colab / Jupyter Notebooks |
| - CPU, NVIDIA GPU (CUDA), and AMD GPU (ROCm) |
| """ |
|
|
| import os |
| import sys |
| from pathlib import Path |
|
|
| |
| |
# Locate the repository root so the in-tree ``src`` package can be imported
# without installing it. ``__file__`` is not defined when this code is pasted
# into a notebook cell or an interactive session, so fall back to the current
# working directory there instead of failing with NameError.
try:
    REPO_ROOT = Path(__file__).resolve().parent
except NameError:
    REPO_ROOT = Path.cwd()

SRC_PATH = REPO_ROOT / "src"
if SRC_PATH.exists():
    # Put the checkout first so it takes priority over other entries on sys.path.
    sys.path.insert(0, str(SRC_PATH))
|
|
# Import the tokenizer API; abort with exit status 1 if it cannot be imported.
try:
    from crayon import CrayonVocab
    from crayon.core.vocabulary import enable_verbose_logging
except ImportError as exc:
    # Include the underlying error: either import above can fail, and the
    # cause is not always a missing checkout. Fatal errors go to stderr.
    print(
        "Error: CRAYON source not found. "
        "Make sure you are running this from the repo root.\n"
        f"       ({exc})",
        file=sys.stderr,
    )
    sys.exit(1)
|
|
def run_universal_demo():
    """Walk through the CRAYON tokenizer API and print each step to stdout.

    The demo runs four steps in order:

    1. Create a ``CrayonVocab`` with ``device="auto"`` and print the device,
       backend and hardware entries returned by ``get_info()``.
    2. Load the ``"lite"`` profile and print its path and vocabulary size.
    3. Tokenize a fixed sample sentence.
    4. Decode the tokens and compare the result with the input.

    A failure raised by the engine in any step is reported on stdout and
    ends the demo early; the closing banner is printed only when all four
    steps completed.

    Returns:
        None
    """
    banner = "=" * 60
    print(banner)
    print("CRAYON: UNIVERSAL TOKENIZATION DEMO")
    print(banner)

    # Step 1: engine construction and hardware report.
    print("\n[STEP 1] Initializing Engine (Auto-Detecting Hardware)...")
    try:
        vocab = CrayonVocab(device="auto")
        info = vocab.get_info()

        # The "hardware" entry is optional; look it up once and default
        # the individual fields.
        hardware = info.get("hardware", {})
        hw_name = hardware.get("name", "Unknown")
        hw_feat = hardware.get("features", "")
        print(f" [OK] Device: {info['device'].upper()}")
        print(f" [OK] Backend: {info['backend']}")
        print(f" [OK] Hardware: {hw_name} [{hw_feat}]")
    except Exception as e:  # demo boundary: report any failure, never crash
        print(f" [ERROR] Initialization failed: {e}")
        return

    # Step 2: vocabulary profile.
    print("\n[STEP 2] Loading 'lite' profile...")
    try:
        vocab.load_profile("lite")
        print(f" [OK] Profile Loaded: {vocab.current_profile_path}")
        print(f" [OK] Vocabulary Size: {vocab.vocab_size:,} tokens")
    except Exception as e:
        print(f" [ERROR] Load failed: {e}")
        print(" (Note: If you haven't built/downloaded the profiles, run train_code_profile.py first)")
        return

    # Step 3: tokenize a fixed sample.
    text = (
        "CRAYON is a hyper-fast tokenizer designed for modern AI. "
        "It supports AVX2 on CPUs, CUDA on NVIDIA, and ROCm on AMD."
    )

    print("\n[STEP 3] Tokenizing Text...")
    print(f" Input: \"{text[:50]}...\"")

    # Guarded like the other steps so an engine error is reported rather
    # than surfacing as a traceback.
    try:
        tokens = vocab.tokenize(text)
    except Exception as e:
        print(f" [ERROR] Tokenization failed: {e}")
        return

    print(f" Result: {tokens[:10]}... ({len(tokens)} tokens)")

    # Step 4: decode and compare with the input.
    print("\n[STEP 4] Decoding back to text...")
    try:
        decoded = vocab.decode(tokens)
        print(f" Output: \"{decoded[:50]}...\"")

        # The round trip is compared case-insensitively and with outer
        # whitespace ignored.
        if decoded.strip().lower() == text.strip().lower():
            print("\nSUCCESS: Tokenization and Decoding were perfect!")
        else:
            print("\nINFO: Tokenization complete (approximate reconstruction).")
    except Exception as e:
        print(f" [ERROR] Decode failed: {e}")
        # Do not print the completion banner after a failed step.
        return

    print("\n" + banner)
    print("DEMO COMPLETE - CRAYON IS READY FOR PRODUCTION")
    print(banner)
|
|
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_universal_demo()
|
|