| """ |
| XERV Crayon CLI - Command Line Interface |
| ========================================= |
| Provides command-line tools for benchmarking and vocabulary management. |
| """ |
| import sys |
| import time |
| import argparse |
|
|
|
|
def _positive_int(value):
    """argparse ``type=`` callable that accepts only integers >= 1.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is < 1.
            argparse turns this into a usage error (exit status 2).
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {number}"
        )
    return number


def run_benchmark(argv=None):
    """Run a quick benchmark of the Crayon tokenizer and print a report.

    Parses the command line, loads the requested vocabulary profile,
    tokenizes the test text a number of times and prints timing and
    throughput figures to stdout.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None`` (the default) argparse reads ``sys.argv[1:]``,
            which is the original behaviour.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: with status 2 on invalid arguments (raised by argparse),
            or status 1 when ``crayon`` cannot be imported or the profile
            fails to load.
    """
    parser = argparse.ArgumentParser(
        prog='crayon-benchmark',
        description='XERV Crayon Tokenizer Benchmark Tool'
    )
    parser.add_argument(
        '--profile', '-p',
        default='lite',
        choices=['lite', 'standard'],
        help='Vocabulary profile to use (default: lite)'
    )
    parser.add_argument(
        '--iterations', '-n',
        # Reject 0 and negative counts up front: with no timed runs the
        # averages below would divide by zero.
        type=_positive_int,
        default=10,
        help='Number of benchmark iterations (default: 10)'
    )
    parser.add_argument(
        '--text', '-t',
        default=None,
        help='Custom text to tokenize (default: built-in test text)'
    )

    args = parser.parse_args(argv)

    print("=" * 60)
    print("XERV CRAYON TOKENIZER BENCHMARK")
    print("=" * 60)

    # Imported lazily so that --help and argument errors work even when the
    # package is not importable.
    try:
        from crayon import CrayonVocab
    except ImportError as e:
        print(f"[ERROR] Failed to import crayon: {e}")
        print("Make sure xerv-crayon is properly installed.")
        sys.exit(1)

    print(f"\n[INFO] Loading profile: {args.profile}")
    start = time.perf_counter()

    # Top-level CLI boundary: any load failure is reported and turned into
    # a non-zero exit status rather than a traceback.
    try:
        vocab = CrayonVocab.load_profile(args.profile)
    except Exception as e:
        print(f"[ERROR] Failed to load profile: {e}")
        sys.exit(1)

    load_time = (time.perf_counter() - start) * 1000

    if vocab.fast_mode:
        print(f"[OK] Loaded with AVX2 engine ({load_time:.2f}ms)")
    else:
        print(f"[WARN] Loaded in fallback mode ({load_time:.2f}ms)")

    # NOTE: an empty --text value is falsy and therefore falls back to the
    # built-in sample, same as not passing --text at all.
    if args.text:
        test_text = args.text
    else:
        test_text = """
        def matrix_multiply(A, B):
            # Standard O(n^3) matrix multiplication
            result = [[0 for _ in range(len(B[0]))] for _ in range(len(A))]
            for i in range(len(A)):
                for j in range(len(B[0])):
                    for k in range(len(B)):
                        result[i][j] += A[i][k] * B[k][j]
            return result

        The quick brown fox jumps over the lazy dog.
        Machine learning models require efficient tokenization for optimal performance.
        """ * 100

    text_size = len(test_text.encode('utf-8'))
    print(f"\n[INFO] Test text size: {text_size:,} bytes ({text_size/1024:.1f} KB)")
    print(f"[INFO] Iterations: {args.iterations}")

    # Two untimed passes before measuring.
    print("\n[INFO] Warming up...")
    for _ in range(2):
        vocab.tokenize(test_text)

    print("[INFO] Running benchmark...")
    times = []
    token_counts = []

    for _ in range(args.iterations):
        start = time.perf_counter()
        tokens = vocab.tokenize(test_text)
        elapsed = time.perf_counter() - start
        times.append(elapsed)
        token_counts.append(len(tokens))

    # args.iterations >= 1 is guaranteed by _positive_int, so both lists
    # are non-empty here.
    avg_time = sum(times) / len(times)
    min_time = min(times)
    max_time = max(times)
    avg_tokens = sum(token_counts) / len(token_counts)

    # A tiny input can finish below the timer resolution; report infinite
    # throughput instead of crashing with ZeroDivisionError.
    if avg_time > 0:
        tokens_per_sec = avg_tokens / avg_time
        mb_per_sec = (text_size / 1024 / 1024) / avg_time
    else:
        tokens_per_sec = float('inf')
        mb_per_sec = float('inf')

    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f" Profile: {args.profile}")
    print(f" Token Count: {int(avg_tokens):,}")
    print(f" Tokens/sec: {tokens_per_sec:,.0f}")
    print(f" MB/sec: {mb_per_sec:.2f}")
    print(f" Avg Time: {avg_time*1000:.2f}ms")
    print(f" Min Time: {min_time*1000:.2f}ms")
    print(f" Max Time: {max_time*1000:.2f}ms")
    print("=" * 60)

    return 0
|
|
|
|
def main():
    """Console entry point: run the benchmark and hand back its exit status."""
    exit_status = run_benchmark()
    return exit_status
|
|
|
|
# Script entry: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
|