File size: 5,599 Bytes
22ae78a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env python3
"""
Simple LLM Benchmark
"""

import time
import json
from datetime import datetime

def benchmark_enhanced_tokenizer():
    """Benchmark the project's enhanced tokenizer on a few sample texts.

    Returns:
        dict: summary with name, token/time totals, throughput, feature
        flags, and per-text results. On any failure (e.g. the tokenizer
        module is unavailable) returns ``{"name": ..., "error": ...}``
        instead of raising.
    """
    print("Testing Enhanced Tokenizer...")

    try:
        from enhanced_tokenizer_minimal import MinimalEnhancedTokenizer
        import asyncio

        tokenizer = MinimalEnhancedTokenizer()

        test_texts = [
            "Hello world! This is a test.",
            "The equation x^2 + y^2 = z^2 is fundamental.",
            "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
        ]

        results = []
        total_time = 0.0
        total_tokens = 0

        for text in test_texts:
            # perf_counter is monotonic and high-resolution; time.time()
            # can report 0 elapsed for sub-millisecond work, which made
            # tokens_per_second collapse to 0.
            start_time = time.perf_counter()
            # asyncio.run creates and tears down a fresh event loop safely
            # even if tokenize() raises; the previous manual
            # new_event_loop/close pair leaked the loop on error.
            result = asyncio.run(tokenizer.tokenize(text))
            processing_time = time.perf_counter() - start_time

            results.append({
                "text": text[:50] + "...",
                "tokens": result.token_count,
                "time": processing_time,
                "type": result.semantic_features.get("content_type", "unknown"),
                "entities": len(result.entities),
                "math": len(result.math_expressions)
            })

            total_time += processing_time
            total_tokens += result.token_count

        return {
            "name": "Enhanced Advanced Tokenizer",
            "total_tokens": total_tokens,
            "total_time": total_time,
            # Guard against a (now unlikely) zero elapsed time.
            "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
            "features": {
                "semantic_embeddings": True,
                "mathematical_processing": True,
                "named_entity_recognition": True,
                "fractal_analysis": True,
                "content_type_detection": True
            },
            "tests": results
        }

    except Exception as e:
        # Best-effort benchmark: report the failure rather than crash main().
        return {"name": "Enhanced Tokenizer", "error": str(e)}

def benchmark_basic_tokenizer():
    """Benchmark naive whitespace tokenization (str.split) as a baseline.

    Returns:
        dict: summary with name, token/time totals, throughput, feature
        flags (all False for the baseline), and per-text results.
    """
    print("Testing Basic Tokenizer...")

    test_texts = [
        "Hello world! This is a test.",
        "The equation x^2 + y^2 = z^2 is fundamental.",
        "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
    ]

    results = []
    total_time = 0.0
    total_tokens = 0

    for text in test_texts:
        # perf_counter is monotonic and high-resolution; time.time() often
        # measured 0 elapsed for a bare str.split(), which forced
        # tokens_per_second to 0 and made the comparison meaningless.
        start_time = time.perf_counter()

        tokens = text.split()
        token_count = len(tokens)

        processing_time = time.perf_counter() - start_time

        results.append({
            "text": text[:50] + "...",
            "tokens": token_count,
            "time": processing_time,
            "type": "basic",
            "entities": 0,
            "math": 0
        })

        total_time += processing_time
        total_tokens += token_count

    return {
        "name": "Basic Python Tokenizer",
        "total_tokens": total_tokens,
        "total_time": total_time,
        # Guard against a (now unlikely) zero elapsed time.
        "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
        "features": {
            "semantic_embeddings": False,
            "mathematical_processing": False,
            "named_entity_recognition": False,
            "fractal_analysis": False,
            "content_type_detection": False
        },
        "tests": results
    }

def main():
    """Run both tokenizer benchmarks, print a comparison, and save it as JSON."""
    print("🚀 LLM Benchmark Comparison")
    print("=" * 40)

    # Collect both benchmark reports.
    enhanced_results = benchmark_enhanced_tokenizer()
    basic_results = benchmark_basic_tokenizer()

    # Count enabled feature flags once; reused in the summary and prints.
    enhanced_feature_count = sum(enhanced_results.get("features", {}).values())
    basic_feature_count = sum(basic_results.get("features", {}).values())
    enhanced_speed = enhanced_results.get("tokens_per_second", 0)
    basic_speed = basic_results.get("tokens_per_second", 0)

    comparison = {
        "timestamp": datetime.now().isoformat(),
        "benchmarks": [enhanced_results, basic_results],
        "summary": {
            "enhanced_speed": enhanced_speed,
            "basic_speed": basic_speed,
            "enhanced_features": enhanced_feature_count,
            "basic_features": basic_feature_count
        }
    }

    # Report throughput and feature coverage.
    print("\n📊 Results:")
    print(f"Enhanced Tokenizer: {enhanced_speed:.1f} tokens/sec")
    print(f"Basic Tokenizer: {basic_speed:.1f} tokens/sec")
    print(f"Enhanced Features: {enhanced_feature_count}/5")
    print(f"Basic Features: {basic_feature_count}/5")

    # Persist the full comparison for later inspection.
    with open("benchmark_results.json", "w") as f:
        json.dump(comparison, f, indent=2)

    print("\n✅ Benchmark complete! Results saved to benchmark_results.json")

    print("\n💡 Recommendations:")
    if enhanced_speed > basic_speed:
        print("• Enhanced tokenizer is faster than basic")
    else:
        print("• Basic tokenizer is faster (expected due to complexity)")

    if enhanced_feature_count > basic_feature_count:
        print("• Enhanced tokenizer offers significantly more features")

    print("• Enhanced tokenizer is best for production AI applications")
    print("• Use enhanced tokenizer for multi-modal content processing")

# Script entry point: run the benchmark comparison when executed directly.
if __name__ == "__main__":
    main()