# CRAYON-tokenizer / Crayon_Colab_Notebook.py
# Phase-Technologies's picture
# Upload folder using huggingface_hub
# 708f4a3 verified
"""
XERV CRAYON V5.1.0 - Production Omni-Backend Tokenizer
=======================================================
Copy this ENTIRE script into a Google Colab cell and run it.
IMPORTANT: Enable GPU runtime first:
Runtime -> Change runtime type -> GPU (T4/V100/A100)
WHAT'S NEW in v4.3.0:
- Fixed ROCm/HIP compilation: Now properly uses hipcc instead of g++
- Full support for AMD GPUs (MI250/MI300, Radeon RX 7000+)
- Production-grade error handling across all backends
- Python 3.10-3.13 fully supported
"""
import glob
import importlib
import os
import shutil
import subprocess
import sys
import time
print("=" * 70)
print("XERV CRAYON V4.3.0 INSTALLATION AND BENCHMARKS")
print("=" * 70)

# 1. Environment Check
# Diagnostic step: report the PyTorch/CUDA runtime and which GPU compilers
# are reachable on PATH before anything is built.
print("[1/7] Checking environment...")
try:
    import torch
except ImportError:
    print(" PyTorch not found (will be installed)")
else:
    print(f" PyTorch: {torch.__version__}")
    if torch.cuda.is_available():
        print(f" CUDA: {torch.version.cuda} ({torch.cuda.get_device_name(0)})")
        print(" * Smart Build: Will compile ONLY for this GPU architecture")
    else:
        print(" CUDA: Not available (CPU only)")

# Check for NVCC (NVIDIA) or hipcc (AMD).
# shutil.which does the PATH lookup in-process, so this works even on
# systems that do not ship a `which` executable.
for compiler_label, compiler_name in (("NVCC", "nvcc"), ("HIPCC (ROCm)", "hipcc")):
    compiler_path = shutil.which(compiler_name)
    if compiler_path:
        print(f" {compiler_label}: {compiler_path}")
    else:
        print(f" {compiler_label}: Not found")
# 2. Build Dependencies
# Install the build tooling with the pip that belongs to the running
# interpreter; check_call raises if pip exits with a non-zero status.
print("\n[2/7] Installing build dependencies...")
build_requirements = ["ninja", "packaging", "wheel", "setuptools>=68.0"]
pip_install_quiet = [sys.executable, "-m", "pip", "install", "-q"]
subprocess.check_call(pip_install_quiet + build_requirements)
print(" Done (ninja, packaging, wheel)")
# 3. Clean Old State
# Best-effort cleanup: remove earlier installs and stale build artefacts.
# Failures here are deliberately ignored.
print("\n[3/7] Cleaning previous installations...")

# Uninstall through the running interpreter's pip (the same one the install
# steps use) instead of whatever `pip` happens to be first on PATH.
# stderr is discarded so "not installed" warnings stay out of the log.
subprocess.run(
    [sys.executable, "-m", "pip", "uninstall", "-y", "xerv-crayon", "crayon"],
    stderr=subprocess.DEVNULL,
    check=False,
)

# Remove earlier clones and local build output without going through a shell.
for stale_pattern in ("/tmp/crayon*", "build", "dist", "src/*.egg-info"):
    for stale_path in glob.glob(stale_pattern):
        if os.path.isdir(stale_path) and not os.path.islink(stale_path):
            shutil.rmtree(stale_path, ignore_errors=True)
        else:
            try:
                os.remove(stale_path)
            except OSError:
                # Cleanup is optional; a leftover file must not abort setup.
                pass
# 4. Clone Source
# Shallow-clone the repository into a fresh, timestamped directory under /tmp.
print("\n[4/7] Cloning source code...")
timestamp = int(time.time())
clone_dir = f"/tmp/crayon_{timestamp}"
# An argv list avoids shell parsing of the command line.
cmd = [
    "git", "clone", "--depth", "1",
    "https://github.com/Electroiscoding/CRAYON.git",
    clone_dir,
]
try:
    clone_rc = subprocess.run(cmd, check=False).returncode
except OSError:
    # git itself could not be started (e.g. not installed).
    clone_rc = 1
if clone_rc != 0:
    print(" FATAL: Git clone failed!")
    sys.exit(1)

# Verify source: show the first line mentioning __version__ in the package.
init_path = os.path.join(clone_dir, "src", "crayon", "__init__.py")
version_line = None
try:
    with open(init_path, encoding="utf-8", errors="replace") as init_file:
        version_line = next(
            (ln.strip() for ln in init_file if "__version__" in ln), None
        )
except OSError:
    # Leave version_line as None; the failure is reported just below.
    pass
if version_line is None:
    # Informational only: the build step decides whether the tree is usable.
    print(f" WARNING: no __version__ found in {init_path}")
else:
    print(f" {version_line}")
# 5. Build & Install (Streaming Output)
# Run `pip install` on the cloned tree and echo its log line by line so a
# long compile shows progress; abort the script if pip fails.
print("\n[5/7] Compiling and Installing (Streaming Logs)...")
print("-" * 70)
build_env = os.environ.copy()
build_env["MAX_JOBS"] = "1"  # Force serial build to prevent OOM
# Only supply the conventional toolkit path as a fallback: keep a CUDA_HOME
# the caller already exported, and do not point the build at a directory
# that does not exist (e.g. on CPU-only or ROCm hosts).
if "CUDA_HOME" not in build_env and os.path.isdir("/usr/local/cuda"):
    build_env["CUDA_HOME"] = "/usr/local/cuda"
# ROCm is auto-detected via /opt/rocm

# Stream output line-by-line
cmd = [sys.executable, "-m", "pip", "install", "-v", "--no-build-isolation", clone_dir]
# The context manager closes the pipe and waits for pip to exit, so
# returncode is always populated once the block is left.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # interleave errors with normal output
    env=build_env,
    text=True,
    bufsize=1,  # line-buffered
) as process:
    # Iterating the pipe yields lines until pip closes its end.
    for line in process.stdout:
        print(line.rstrip())
rc = process.returncode
print("-" * 70)
if rc != 0:
    print("\n" + "!" * 70)
    print("FATAL ERROR: Installation failed!")
    print(f"Exit Code: {rc}")
    print("!" * 70)
    sys.exit(1)
# 6. Verification
# Import the freshly installed package in this same interpreter and report
# its version and available backends; exit if the import fails.
print("\n[6/7] Verifying installation...")
# Reset module cache so a previously imported copy is not reused.
for key in [name for name in sys.modules if "crayon" in name]:
    del sys.modules[key]
# The package was installed after this interpreter started; clear the import
# finders' caches so the new files are noticed.
importlib.invalidate_caches()
try:
    import crayon
    print(f" Success! Installed version: {crayon.get_version()}")
    backends = crayon.check_backends()
    print(f" Backends: {backends}")
except ImportError as e:
    print(f" FATAL: Could not import crayon: {e}")
    sys.exit(1)
# 7. Benchmarks
# Build a tokenizer with automatic device selection, load the 'lite'
# profile, and report which device/backend was picked.
section_rule = "=" * 70
print(f"\n{section_rule}")
print("BENCHMARKS & TESTING")
print(section_rule)

# Imported here because the package only exists after the install step.
from crayon import CrayonVocab

vocab = CrayonVocab(device="auto")
vocab.load_profile("lite")
print(f"\nActive Device: {vocab.device.upper()}")
info = vocab.get_info()
print(f"Backend: {info['backend']}")

# Point out any GPU backend that is reported available while we run on CPU.
if vocab.device == "cpu":
    for backend_key, backend_label in (("cuda", "CUDA"), ("rocm", "ROCm")):
        if backends.get(backend_key):
            print(
                f"NOTE: Running on CPU but {backend_label} is available. "
                f"Use device='{backend_key}' to force."
            )
# Throughput test
# Tokenize the same sentence repeated `bs` times and report documents/sec
# and tokens/sec for each batch size.
text = "The quick brown fox jumps over the lazy dog."
batch_sizes = [1000, 10000, 50000]
print("\nBatch Throughput:")
for bs in batch_sizes:
    batch = [text] * bs
    # Warmup: a small untimed call before the measured run.
    vocab.tokenize(batch[:10])
    # perf_counter is the monotonic, high-resolution clock meant for
    # measuring intervals; time.time() can jump and may be coarse.
    start = time.perf_counter()
    res = vocab.tokenize(batch)
    dur = time.perf_counter() - start
    toks = sum(len(x) for x in res)
    # Guard against a zero-length interval instead of dividing by zero.
    docs_per_sec = bs / dur if dur > 0 else float("inf")
    toks_per_sec = toks / dur if dur > 0 else float("inf")
    print(f" {bs:>8,} docs: {docs_per_sec:>12,.0f} docs/sec | {toks_per_sec:>14,.0f} tokens/sec")
# Closing banner followed by a short usage reminder.
closing_rule = "=" * 70
print(f"\n{closing_rule}")
print("INSTALLATION COMPLETE!")
print(closing_rule)

quick_start_help = """
Quick Start:
from crayon import CrayonVocab
vocab = CrayonVocab(device='auto')
vocab.load_profile('lite')
tokens = vocab.tokenize("Hello, world!")
print(tokens)
Available Profiles: 'lite', 'code', 'science', 'multilingual', 'arts_commerce'
Available Devices: 'auto', 'cpu', 'cuda', 'rocm'
"""
print(quick_start_help)