File size: 3,877 Bytes
708f4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
XERV Crayon CLI - Command Line Interface
=========================================
Provides command-line tools for benchmarking and vocabulary management.
"""
import sys
import time
import argparse


# Built-in sample (source code plus prose) used when --text is not supplied.
# It is repeated so that a single tokenize call runs long enough to time.
_DEFAULT_BENCHMARK_TEXT = """
def matrix_multiply(A, B):
    # Standard O(n^3) matrix multiplication
    result = [[0 for _ in range(len(B[0]))] for _ in range(len(A))]
    for i in range(len(A)):
        for j in range(len(B[0])):
            for k in range(len(B)):
                result[i][j] += A[i][k] * B[k][j]
    return result

The quick brown fox jumps over the lazy dog. 
Machine learning models require efficient tokenization for optimal performance.
""" * 100  # Repeat for meaningful benchmark


def _positive_int(value):
    """argparse ``type=`` callable: parse *value* as an integer >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid int value: {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {number}"
        )
    return number


def run_benchmark(argv=None):
    """Run a quick benchmark of the Crayon tokenizer.

    Parses command-line options, loads the requested vocabulary profile,
    tokenizes the test text a fixed number of times and prints timing and
    throughput figures to stdout.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            behaviour of reading the process command line.

    Returns:
        0 when the benchmark completes.

    Raises:
        SystemExit: with status 1 when ``crayon`` cannot be imported or the
            profile fails to load; with status 2 (raised by argparse) when
            the arguments are invalid, e.g. ``--iterations`` below 1.
    """
    parser = argparse.ArgumentParser(
        prog='crayon-benchmark',
        description='XERV Crayon Tokenizer Benchmark Tool'
    )
    parser.add_argument(
        '--profile', '-p',
        default='lite',
        choices=['lite', 'standard'],
        help='Vocabulary profile to use (default: lite)'
    )
    parser.add_argument(
        '--iterations', '-n',
        # Reject 0 / negative counts up front: with no timed runs the
        # averages below would divide by zero.
        type=_positive_int,
        default=10,
        help='Number of benchmark iterations (default: 10)'
    )
    parser.add_argument(
        '--text', '-t',
        default=None,
        help='Custom text to tokenize (default: built-in test text)'
    )

    args = parser.parse_args(argv)

    print("=" * 60)
    print("XERV CRAYON TOKENIZER BENCHMARK")
    print("=" * 60)

    # Imported lazily so that a missing/broken install produces a readable
    # message instead of a traceback at module import time.
    try:
        from crayon import CrayonVocab
    except ImportError as e:
        print(f"[ERROR] Failed to import crayon: {e}")
        print("Make sure xerv-crayon is properly installed.")
        sys.exit(1)

    # Load vocabulary
    print(f"\n[INFO] Loading profile: {args.profile}")
    start = time.perf_counter()

    # Broad catch is deliberate: this is the CLI boundary and any load
    # failure should be reported and turned into a non-zero exit status.
    try:
        vocab = CrayonVocab.load_profile(args.profile)
    except Exception as e:
        print(f"[ERROR] Failed to load profile: {e}")
        sys.exit(1)

    load_time = (time.perf_counter() - start) * 1000

    if vocab.fast_mode:
        print(f"[OK] Loaded with AVX2 engine ({load_time:.2f}ms)")
    else:
        print(f"[WARN] Loaded in fallback mode ({load_time:.2f}ms)")

    # Prepare test text (an empty --text falls back to the built-in sample)
    test_text = args.text if args.text else _DEFAULT_BENCHMARK_TEXT

    text_size = len(test_text.encode('utf-8'))
    print(f"\n[INFO] Test text size: {text_size:,} bytes ({text_size/1024:.1f} KB)")
    print(f"[INFO] Iterations: {args.iterations}")

    # Warmup: two untimed passes before measuring
    print("\n[INFO] Warming up...")
    for _ in range(2):
        vocab.tokenize(test_text)

    # Benchmark
    print("[INFO] Running benchmark...")
    times = []
    token_counts = []

    for _ in range(args.iterations):
        start = time.perf_counter()
        tokens = vocab.tokenize(test_text)
        elapsed = time.perf_counter() - start
        times.append(elapsed)
        token_counts.append(len(tokens))

    # Calculate metrics
    avg_time = sum(times) / len(times)
    min_time = min(times)
    max_time = max(times)
    avg_tokens = sum(token_counts) / len(token_counts)
    if avg_time > 0:
        tokens_per_sec = avg_tokens / avg_time
        mb_per_sec = (text_size / 1024 / 1024) / avg_time
    else:
        # Elapsed time below timer resolution: report unbounded throughput
        # rather than crashing with ZeroDivisionError.
        tokens_per_sec = float('inf')
        mb_per_sec = float('inf')

    # Print results
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"  Profile:        {args.profile}")
    print(f"  Token Count:    {int(avg_tokens):,}")
    print(f"  Tokens/sec:     {tokens_per_sec:,.0f}")
    print(f"  MB/sec:         {mb_per_sec:.2f}")
    print(f"  Avg Time:       {avg_time*1000:.2f}ms")
    print(f"  Min Time:       {min_time*1000:.2f}ms")
    print(f"  Max Time:       {max_time*1000:.2f}ms")
    print("=" * 60)

    return 0


def main():
    """Console entry point: run the benchmark and return its exit status."""
    exit_status = run_benchmark()
    return exit_status


# Script entry: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())