# CRAYON-tokenizer / test_readme_examples.py
# Uploaded by Phase-Technologies using huggingface_hub (commit 708f4a3, verified).
"""
Test all code examples from README.md to ensure they work correctly.
"""
import sys
import os
# Add paths: make the in-tree package importable when the script is run from
# the repository root -- the compiled extension's build directory first, then
# "src", which therefore ends up at the very front of sys.path.
import sysconfig

# setuptools' default platform build directory is named
# "lib.<platform>-<cache_tag>" (e.g. "lib.win-amd64-cpython-313"), with a
# "-pydebug" suffix on debug interpreters. Derive it from the running
# interpreter instead of hard-coding a single Windows / CPython 3.13 tag.
_build_tag = f"lib.{sysconfig.get_platform()}-{sys.implementation.cache_tag}"
if hasattr(sys, "gettotalrefcount"):
    _build_tag += "-pydebug"
sys.path.insert(0, os.path.join(os.getcwd(), "build", _build_tag))
sys.path.insert(0, os.path.join(os.getcwd(), "src"))
# Opening banner: rule, title, rule, then one blank line -- emitted with a
# single print call, one item per output line.
print("=" * 70, "TESTING README CODE EXAMPLES", "=" * 70, "", sep="\n")
# Test 1: Quick Start Example
# Loads the "lite" profile, tokenizes a short code snippet and, when the
# vocabulary object exposes decode(), decodes the tokens again. Any exception
# is reported as a failure together with its traceback.
print("[TEST 1] Quick Start - Load Profile and Tokenize")
print("-" * 70)
try:
    from crayon.core.vocabulary import CrayonVocab

    vocab = CrayonVocab(device="auto")
    vocab.load_profile("lite")

    # Tokenize specialized syntax
    code_snippet = "fn main() { println!(\"Hello, World!\"); }"
    tokens = vocab.tokenize(code_snippet)

    # Probe for decode() explicitly instead of catching AttributeError around
    # the call: an AttributeError raised *inside* a working decode() is a real
    # bug and must reach the failure handler below, not be reported as
    # "not implemented yet".
    decode = getattr(vocab, "decode", None)
    if decode is None:
        print("⚠ WARNING: vocab.decode() not implemented yet")
        print(f"✓ Tokenize works: {tokens}")
        print("✓ TEST PARTIALLY PASSED")
    else:
        decoded = decode(tokens)
        print(f"✓ Tokenize: {code_snippet}")
        print(f"✓ Tokens: {tokens}")
        print(f"✓ Decoded: {decoded}")
        print("✓ TEST PASSED")
except Exception as e:
    # Top-level boundary of this test: report and carry on with the next one.
    print(f"✗ TEST FAILED: {e}")
    import traceback
    traceback.print_exc()
print()
# Test 2: Load different profiles
# Builds a fresh CrayonVocab for each profile name and reports, per profile,
# whether load_profile() succeeded.
print("[TEST 2] Load Different Profiles")
print("-" * 70)
for profile_name in ["lite", "standard"]:
    try:
        # Imported here rather than relying on the import inside Test 1's
        # try block: if that import failed, this loop would otherwise report
        # a misleading NameError for every profile. Repeated imports are
        # served from sys.modules, so doing this per iteration is cheap.
        from crayon.core.vocabulary import CrayonVocab

        vocab = CrayonVocab(device="auto")
        vocab.load_profile(profile_name)
        print(f"✓ Loaded '{profile_name}' profile")
    except Exception as e:
        # Keep going so the remaining profiles are still exercised.
        print(f"✗ Failed to load '{profile_name}': {e}")
print()
# Test 3: DAT Builder Example
# Builds a DAT from a four-word vocabulary with DATBuilder, saves it to a
# temporary file and reports builder.size. Any exception is reported as a
# failure together with its traceback.
print("[TEST 3] Compile Vocabulary to DAT Format")
print("-" * 70)
try:
    import json  # not used below; kept so the module namespace is unchanged
    import tempfile
    from crayon.c_ext.dat_builder import DATBuilder

    # Use a small test vocab
    test_vocab = ["hello", "world", "test", "python"]

    # Compile to DAT
    builder = DATBuilder()
    builder.build(test_vocab)

    # Save into a private temporary directory: the name cannot collide with
    # another run, and the file is removed even if save() or a print raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dat_path = os.path.join(tmp_dir, "test_readme.dat")
        builder.save(dat_path)
        print(f"✓ Built DAT with {builder.size} nodes")
        print(f"✓ Saved to {dat_path}")
    print("✓ TEST PASSED")
except Exception as e:
    # Top-level boundary of this test: report and carry on with the next one.
    print(f"✗ TEST FAILED: {e}")
    import traceback
    traceback.print_exc()
print()
# Test 4: Direct C++ Engine Access
# Builds a small DAT, memory-maps it read-only, hands the mapping to
# crayon_fast.load_dat() and tokenizes a sentence with crayon_fast.tokenize().
# Any exception is reported as a failure together with its traceback.
print("[TEST 4] Direct C++ Engine Access")
print("-" * 70)
try:
    import mmap
    import tempfile
    from crayon.c_ext import crayon_fast
    from crayon.c_ext.dat_builder import DATBuilder

    # Build a small DAT
    test_vocab = ["the", "quick", "brown", "fox"]
    builder = DATBuilder()
    builder.build(test_vocab)

    # Unique temp file instead of a fixed name in the shared temp directory.
    # The descriptor is closed right away so builder.save() can reopen the
    # path itself.
    fd, dat_path = tempfile.mkstemp(prefix="test_engine_", suffix=".dat")
    os.close(fd)
    try:
        builder.save(dat_path)

        # Zero-copy load via mmap
        with open(dat_path, "rb") as f:
            mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            size = crayon_fast.load_dat(mm)

        # Ultra-fast tokenization
        tokens = crayon_fast.tokenize("the quick brown fox")
        print(f"✓ Loaded DAT: {size} nodes")
        print(f"✓ Tokenized: {tokens}")
        print("✓ TEST PASSED")
    finally:
        # Best-effort cleanup that also runs when a step above failed.
        # The mapping is deliberately left open: the engine may still be
        # reading from it after load_dat() -- TODO confirm. Windows refuses to
        # delete a file that is still memory-mapped, so a failed removal is
        # reported as a warning instead of turning a working engine into a
        # failed test.
        try:
            os.unlink(dat_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_err:
            print(f"⚠ WARNING: could not remove {dat_path}: {cleanup_err}")
except Exception as e:
    # Top-level boundary of this test: report the failure.
    print(f"✗ TEST FAILED: {e}")
    import traceback
    traceback.print_exc()
print()
# Closing banner: rule, completion message, rule -- joined into one write.
print("\n".join(["=" * 70, "README CODE TESTS COMPLETE", "=" * 70]))