# 9x25dillon's picture
# Initial upload of LiMp Pipeline Integration System
# 22ae78a verified
#!/usr/bin/env python3
"""
Simple LLM Benchmark
"""
import time
import json
from datetime import datetime
def benchmark_enhanced_tokenizer():
    """Benchmark the project's enhanced tokenizer on a few sample texts.

    Returns:
        dict: On success, a summary containing the benchmark name, total
        token count, total wall time, throughput (tokens/sec), a feature
        matrix, and per-text results. If the tokenizer cannot be imported
        or any step fails, a dict with only "name" and "error" keys.
    """
    print("Testing Enhanced Tokenizer...")
    try:
        from enhanced_tokenizer_minimal import MinimalEnhancedTokenizer
        import asyncio

        tokenizer = MinimalEnhancedTokenizer()
        test_texts = [
            "Hello world! This is a test.",
            "The equation x^2 + y^2 = z^2 is fundamental.",
            "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
        ]
        results = []
        total_time = 0.0
        total_tokens = 0
        for text in test_texts:
            # perf_counter has far finer resolution than time.time(),
            # which matters when timing sub-millisecond operations.
            start_time = time.perf_counter()
            # asyncio.run creates, runs, and tears down the event loop
            # safely, replacing the manual
            # new_event_loop/set_event_loop/close sequence.
            result = asyncio.run(tokenizer.tokenize(text))
            processing_time = time.perf_counter() - start_time
            results.append({
                "text": text[:50] + "...",
                "tokens": result.token_count,
                "time": processing_time,
                "type": result.semantic_features.get("content_type", "unknown"),
                "entities": len(result.entities),
                "math": len(result.math_expressions),
            })
            total_time += processing_time
            total_tokens += result.token_count
        return {
            "name": "Enhanced Advanced Tokenizer",
            "total_tokens": total_tokens,
            "total_time": total_time,
            "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
            "features": {
                "semantic_embeddings": True,
                "mathematical_processing": True,
                "named_entity_recognition": True,
                "fractal_analysis": True,
                "content_type_detection": True,
            },
            "tests": results,
        }
    except Exception as e:
        # Broad catch is deliberate: the benchmark should degrade to an
        # error record instead of aborting the whole comparison run.
        return {"name": "Enhanced Tokenizer", "error": str(e)}
def benchmark_basic_tokenizer():
    """Benchmark plain whitespace tokenization (str.split) as a baseline.

    Returns:
        dict: A summary with the benchmark name, total token count, total
        wall time, throughput (tokens/sec), an all-False feature matrix,
        and per-text results, mirroring the enhanced benchmark's schema.
    """
    print("Testing Basic Tokenizer...")
    test_texts = [
        "Hello world! This is a test.",
        "The equation x^2 + y^2 = z^2 is fundamental.",
        "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
    ]
    results = []
    total_time = 0.0
    total_tokens = 0
    for text in test_texts:
        # time.time() granularity can exceed the cost of split() on a
        # short string, producing zero timings; perf_counter gives the
        # highest-resolution clock available.
        start_time = time.perf_counter()
        tokens = text.split()
        token_count = len(tokens)
        processing_time = time.perf_counter() - start_time
        results.append({
            "text": text[:50] + "...",
            "tokens": token_count,
            "time": processing_time,
            "type": "basic",
            "entities": 0,
            "math": 0,
        })
        total_time += processing_time
        total_tokens += token_count
    return {
        "name": "Basic Python Tokenizer",
        "total_tokens": total_tokens,
        "total_time": total_time,
        "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
        "features": {
            "semantic_embeddings": False,
            "mathematical_processing": False,
            "named_entity_recognition": False,
            "fractal_analysis": False,
            "content_type_detection": False,
        },
        "tests": results,
    }
def main():
    """Run both tokenizer benchmarks, report, persist, and advise."""
    print("🚀 LLM Benchmark Comparison")
    print("=" * 40)

    # Collect both benchmark summaries up front.
    enhanced = benchmark_enhanced_tokenizer()
    basic = benchmark_basic_tokenizer()

    # Hoist the summary figures once; .get() tolerates the error-dict
    # shape the enhanced benchmark may return.
    enhanced_speed = enhanced.get("tokens_per_second", 0)
    basic_speed = basic.get("tokens_per_second", 0)
    enhanced_features = sum(enhanced.get("features", {}).values())
    basic_features = sum(basic.get("features", {}).values())

    comparison = {
        "timestamp": datetime.now().isoformat(),
        "benchmarks": [enhanced, basic],
        "summary": {
            "enhanced_speed": enhanced_speed,
            "basic_speed": basic_speed,
            "enhanced_features": enhanced_features,
            "basic_features": basic_features,
        },
    }

    # Throughput and feature-coverage report.
    print("\n📊 Results:")
    print(f"Enhanced Tokenizer: {enhanced_speed:.1f} tokens/sec")
    print(f"Basic Tokenizer: {basic_speed:.1f} tokens/sec")
    print(f"Enhanced Features: {enhanced_features}/5")
    print(f"Basic Features: {basic_features}/5")

    # Persist the full comparison for later inspection.
    with open("benchmark_results.json", "w") as f:
        json.dump(comparison, f, indent=2)
    print("\n✅ Benchmark complete! Results saved to benchmark_results.json")

    # Advisory output derived from the measurements above.
    print("\n💡 Recommendations:")
    if enhanced_speed > basic_speed:
        print("• Enhanced tokenizer is faster than basic")
    else:
        print("• Basic tokenizer is faster (expected due to complexity)")
    if enhanced_features > basic_features:
        print("• Enhanced tokenizer offers significantly more features")
        print("• Enhanced tokenizer is best for production AI applications")
        print("• Use enhanced tokenizer for multi-modal content processing")


if __name__ == "__main__":
    main()