# File size: 3,435 Bytes
# 708f4a3  (presumably the revision id shown by the file viewer; its line-number gutter was dropped)
"""
CRAYON UNIVERSAL TOKENIZER DEMO
===============================
This script demonstrates the production-ready Crayon Tokenizer API.
It is designed to work seamlessly across:
- Local Machine (Windows/Linux/Mac)
- Google Colab / Jupyter Notebooks
- CPU, NVIDIA GPU (CUDA), and AMD GPU (ROCm)
"""
import os
import sys
from pathlib import Path
# --- 1. Environment Setup ---
# Add 'src' to the import path so the demo runs without installing the package.
# `__file__` is not defined when this code is executed from a notebook cell
# (Colab / Jupyter), so fall back to the current working directory there.
try:
    REPO_ROOT = Path(__file__).resolve().parent
except NameError:
    REPO_ROOT = Path.cwd()
SRC_PATH = REPO_ROOT / "src"
if SRC_PATH.exists():
    sys.path.insert(0, str(SRC_PATH))

try:
    from crayon import CrayonVocab
    from crayon.core.vocabulary import enable_verbose_logging
except ImportError as exc:
    # NOTE(review): the leading glyph in this message looks like a mis-decoded
    # emoji; it is kept exactly as found -- confirm against the upstream file.
    print("โ Error: CRAYON source not found. Make sure you are running this from the repo root.")
    # Show the underlying cause as well: an ImportError can also come from a
    # dependency of the package, not only from a missing source tree.
    print(f" (import error: {exc})")
    sys.exit(1)
def run_universal_demo():
    """Run the CRAYON tokenizer demo end to end, printing progress to stdout.

    Steps, in order:
      1. Build a ``CrayonVocab`` with ``device="auto"`` and print the device,
         backend and hardware entries taken from ``get_info()``.
      2. Load the ``"lite"`` profile and print its path and vocabulary size.
      3. Tokenize a fixed sample sentence.
      4. Decode the tokens and compare the result with the input
         (case-insensitively, ignoring surrounding whitespace).

    A failure in steps 1-3 is reported and ends the demo early. A decode
    failure is reported and the closing banner is still printed.

    Returns:
        None.
    """
    # Optional: Enable verbose logging to see hardware detection in action
    # enable_verbose_logging()

    # NOTE(review): the 'โ' / '๐' prefixes in the status strings below look
    # like mis-decoded emoji. They are kept exactly as found because the
    # original glyphs cannot be determined from this file -- confirm against
    # the upstream source. Only the success/info markers in step 4 were
    # restored, from what their surviving characters indicate.
    print("=" * 60)
    print("๐ CRAYON: UNIVERSAL TOKENIZATION DEMO")
    print("=" * 60)

    # --- 2. Initialize Engine ---
    # device="auto" automatically picks CUDA > ROCm > CPU
    print("\n[STEP 1] Initializing Engine (Auto-Detecting Hardware)...")
    try:
        vocab = CrayonVocab(device="auto")
        info = vocab.get_info()
        # The "hardware" entry is optional; fall back to placeholders.
        hardware = info.get("hardware", {})
        hw_name = hardware.get("name", "Unknown")
        hw_feat = hardware.get("features", "")
        print(f" โ Device: {info['device'].upper()}")
        print(f" โ Backend: {info['backend']}")
        print(f" โ Hardware: {hw_name} [{hw_feat}]")
    except Exception as e:
        print(f" โ Initialization failed: {e}")
        return

    # --- 3. Load Profile ---
    # We load 'lite' which is bundled with the repository
    print("\n[STEP 2] Loading 'lite' profile...")
    try:
        vocab.load_profile("lite")
        print(f" โ Profile Loaded: {vocab.current_profile_path}")
        print(f" โ Vocabulary Size: {vocab.vocab_size:,} tokens")
    except Exception as e:
        print(f" โ Load failed: {e}")
        print(" (Note: If you haven't built/downloaded the profiles, run train_code_profile.py first)")
        return

    # --- 4. Performance Tokenization ---
    text = (
        "CRAYON is a hyper-fast tokenizer designed for modern AI. "
        "It supports AVX2 on CPUs, CUDA on NVIDIA, and ROCm on AMD."
    )
    print("\n[STEP 3] Tokenizing Text...")
    print(f' Input: "{text[:50]}..."')
    # Tokenize (returns a list of IDs). Guarded like the other steps so a
    # backend error is reported instead of ending the demo with a traceback.
    try:
        tokens = vocab.tokenize(text)
    except Exception as e:
        print(f" โ Tokenization failed: {e}")
        return
    print(f" Result: {tokens[:10]}... ({len(tokens)} tokens)")

    # --- 5. Reconstruction (Decoding) ---
    print("\n[STEP 4] Decoding back to text...")
    try:
        decoded = vocab.decode(tokens)
        print(f' Output: "{decoded[:50]}..."')
        # Verify the round trip. The comparison ignores case and surrounding
        # whitespace, so it tolerates that much normalisation by the tokenizer.
        if decoded.strip().lower() == text.strip().lower():
            # This literal was split across two lines in the file as found
            # (a syntax error); it is rejoined here.
            print("\n✅ SUCCESS: Tokenization and Decoding were perfect!")
        else:
            print("\nℹ️ INFO: Tokenization complete (approximate reconstruction).")
    except Exception as e:
        print(f" โ Decode failed: {e}")

    print("\n" + "=" * 60)
    print("DEMO COMPLETE - CRAYON IS READY FOR PRODUCTION")
    print("=" * 60)
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, not on import.
    run_universal_demo()
|