#!/usr/bin/env python3
"""
Simple LLM Benchmark
"""
import asyncio
import json
import time
from datetime import datetime
def benchmark_enhanced_tokenizer():
    """Test our enhanced tokenizer."""
    print("Testing Enhanced Tokenizer...")
    try:
        from enhanced_tokenizer_minimal import MinimalEnhancedTokenizer

        tokenizer = MinimalEnhancedTokenizer()
        test_texts = [
            "Hello world! This is a test.",
            "The equation x^2 + y^2 = z^2 is fundamental.",
            "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
        ]
        results = []
        total_time = 0
        total_tokens = 0
        for text in test_texts:
            # perf_counter() has far finer resolution than time.time(), which
            # matters when a single tokenize() call takes microseconds.
            start_time = time.perf_counter()
            # tokenize() is a coroutine; asyncio.run() creates and tears down
            # an event loop per call, replacing the manual new_event_loop() /
            # set_event_loop() / close() sequence.
            result = asyncio.run(tokenizer.tokenize(text))
            processing_time = time.perf_counter() - start_time
            results.append({
                "text": text[:50] + ("..." if len(text) > 50 else ""),
                "tokens": result.token_count,
                "time": processing_time,
                "type": result.semantic_features.get("content_type", "unknown"),
                "entities": len(result.entities),
                "math": len(result.math_expressions),
            })
            total_time += processing_time
            total_tokens += result.token_count
        return {
            "name": "Enhanced Advanced Tokenizer",
            "total_tokens": total_tokens,
            "total_time": total_time,
            "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
            "features": {
                "semantic_embeddings": True,
                "mathematical_processing": True,
                "named_entity_recognition": True,
                "fractal_analysis": True,
                "content_type_detection": True,
            },
            "tests": results,
        }
    except Exception as e:
        # If the enhanced tokenizer is unavailable, report the error; main()
        # falls back to .get() defaults for the missing keys.
        return {"name": "Enhanced Tokenizer", "error": str(e)}
def benchmark_basic_tokenizer():
    """Test basic whitespace tokenization as a baseline."""
    print("Testing Basic Tokenizer...")
    test_texts = [
        "Hello world! This is a test.",
        "The equation x^2 + y^2 = z^2 is fundamental.",
        "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
    ]
    results = []
    total_time = 0
    total_tokens = 0
    for text in test_texts:
        # str.split() finishes in microseconds, so the high-resolution
        # perf_counter() is needed to avoid recording 0.0 elapsed time.
        start_time = time.perf_counter()
        tokens = text.split()
        token_count = len(tokens)
        processing_time = time.perf_counter() - start_time
        results.append({
            "text": text[:50] + ("..." if len(text) > 50 else ""),
            "tokens": token_count,
            "time": processing_time,
            "type": "basic",
            "entities": 0,
            "math": 0,
        })
        total_time += processing_time
        total_tokens += token_count
    return {
        "name": "Basic Python Tokenizer",
        "total_tokens": total_tokens,
        "total_time": total_time,
        "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
        "features": {
            "semantic_embeddings": False,
            "mathematical_processing": False,
            "named_entity_recognition": False,
            "fractal_analysis": False,
            "content_type_detection": False,
        },
        "tests": results,
    }
def main():
    print("🚀 LLM Benchmark Comparison")
    print("=" * 40)

    # Run benchmarks
    enhanced_results = benchmark_enhanced_tokenizer()
    basic_results = benchmark_basic_tokenizer()

    # Create comparison
    comparison = {
        "timestamp": datetime.now().isoformat(),
        "benchmarks": [enhanced_results, basic_results],
        "summary": {
            "enhanced_speed": enhanced_results.get("tokens_per_second", 0),
            "basic_speed": basic_results.get("tokens_per_second", 0),
            "enhanced_features": sum(enhanced_results.get("features", {}).values()),
            "basic_features": sum(basic_results.get("features", {}).values()),
        },
    }

    # Print results
    print("\n📊 Results:")
    print(f"Enhanced Tokenizer: {enhanced_results.get('tokens_per_second', 0):.1f} tokens/sec")
    print(f"Basic Tokenizer: {basic_results.get('tokens_per_second', 0):.1f} tokens/sec")
    print(f"Enhanced Features: {comparison['summary']['enhanced_features']}/5")
    print(f"Basic Features: {comparison['summary']['basic_features']}/5")

    # Save results
    with open("benchmark_results.json", "w") as f:
        json.dump(comparison, f, indent=2)
    print("\n✅ Benchmark complete! Results saved to benchmark_results.json")

    # Recommendations
    print("\n💡 Recommendations:")
    if enhanced_results.get("tokens_per_second", 0) > basic_results.get("tokens_per_second", 0):
        print("• Enhanced tokenizer is faster than basic")
    else:
        print("• Basic tokenizer is faster (expected due to complexity)")
    if comparison["summary"]["enhanced_features"] > comparison["summary"]["basic_features"]:
        print("• Enhanced tokenizer offers significantly more features")
        print("• Enhanced tokenizer is best for production AI applications")
        print("• Use enhanced tokenizer for multi-modal content processing")


if __name__ == "__main__":
    main()
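
# Example shape of benchmark_results.json (placeholders, not measured results):
#
# {
#   "timestamp": "<ISO-8601 timestamp>",
#   "benchmarks": [<enhanced result dict>, <basic result dict>],
#   "summary": {
#     "enhanced_speed": <tokens/sec>,
#     "basic_speed": <tokens/sec>,
#     "enhanced_features": 5,   # 0 if the enhanced tokenizer failed to import
#     "basic_features": 0
#   }
# }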