Phase-Technologies's picture
Upload folder using huggingface_hub
708f4a3 verified
"""
XERV Crayon CLI - Command Line Interface
=========================================
Provides command-line tools for benchmarking and vocabulary management.
"""
import sys
import time
import argparse
def run_benchmark():
"""Run a quick benchmark of the Crayon tokenizer."""
parser = argparse.ArgumentParser(
prog='crayon-benchmark',
description='XERV Crayon Tokenizer Benchmark Tool'
)
parser.add_argument(
'--profile', '-p',
default='lite',
choices=['lite', 'standard'],
help='Vocabulary profile to use (default: lite)'
)
parser.add_argument(
'--iterations', '-n',
type=int,
default=10,
help='Number of benchmark iterations (default: 10)'
)
parser.add_argument(
'--text', '-t',
default=None,
help='Custom text to tokenize (default: built-in test text)'
)
args = parser.parse_args()
print("=" * 60)
print("XERV CRAYON TOKENIZER BENCHMARK")
print("=" * 60)
try:
from crayon import CrayonVocab
except ImportError as e:
print(f"[ERROR] Failed to import crayon: {e}")
print("Make sure xerv-crayon is properly installed.")
sys.exit(1)
# Load vocabulary
print(f"\n[INFO] Loading profile: {args.profile}")
start = time.perf_counter()
try:
vocab = CrayonVocab.load_profile(args.profile)
except Exception as e:
print(f"[ERROR] Failed to load profile: {e}")
sys.exit(1)
load_time = (time.perf_counter() - start) * 1000
if vocab.fast_mode:
print(f"[OK] Loaded with AVX2 engine ({load_time:.2f}ms)")
else:
print(f"[WARN] Loaded in fallback mode ({load_time:.2f}ms)")
# Prepare test text
if args.text:
test_text = args.text
else:
test_text = """
def matrix_multiply(A, B):
# Standard O(n^3) matrix multiplication
result = [[0 for _ in range(len(B[0]))] for _ in range(len(A))]
for i in range(len(A)):
for j in range(len(B[0])):
for k in range(len(B)):
result[i][j] += A[i][k] * B[k][j]
return result
The quick brown fox jumps over the lazy dog.
Machine learning models require efficient tokenization for optimal performance.
""" * 100 # Repeat for meaningful benchmark
text_size = len(test_text.encode('utf-8'))
print(f"\n[INFO] Test text size: {text_size:,} bytes ({text_size/1024:.1f} KB)")
print(f"[INFO] Iterations: {args.iterations}")
# Warmup
print("\n[INFO] Warming up...")
for _ in range(2):
_ = vocab.tokenize(test_text)
# Benchmark
print("[INFO] Running benchmark...")
times = []
token_counts = []
for i in range(args.iterations):
start = time.perf_counter()
tokens = vocab.tokenize(test_text)
elapsed = time.perf_counter() - start
times.append(elapsed)
token_counts.append(len(tokens))
# Calculate metrics
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)
avg_tokens = sum(token_counts) / len(token_counts)
tokens_per_sec = avg_tokens / avg_time
mb_per_sec = (text_size / 1024 / 1024) / avg_time
# Print results
print("\n" + "=" * 60)
print("RESULTS")
print("=" * 60)
print(f" Profile: {args.profile}")
print(f" Token Count: {int(avg_tokens):,}")
print(f" Tokens/sec: {tokens_per_sec:,.0f}")
print(f" MB/sec: {mb_per_sec:.2f}")
print(f" Avg Time: {avg_time*1000:.2f}ms")
print(f" Min Time: {min_time*1000:.2f}ms")
print(f" Max Time: {max_time*1000:.2f}ms")
print("=" * 60)
return 0
def main():
"""Main entry point."""
return run_benchmark()
if __name__ == '__main__':
sys.exit(main())