#!/usr/bin/env python3
"""Simple LLM Benchmark.

Compares the project's enhanced tokenizer against a plain whitespace
(str.split) baseline on a shared set of test texts, prints a summary,
and saves the full comparison to ``benchmark_results.json``.
"""
import asyncio
import json
import time
from datetime import datetime

# Shared inputs so both benchmarks measure exactly the same texts.
TEST_TEXTS = [
    "Hello world! This is a test.",
    "The equation x^2 + y^2 = z^2 is fundamental.",
    "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
]


def benchmark_enhanced_tokenizer():
    """Benchmark the enhanced tokenizer.

    Returns a result dict with per-text stats, totals, throughput and a
    feature matrix. If the tokenizer (or any dependency) is unavailable,
    returns ``{"name": ..., "error": ...}`` instead of raising, so the
    comparison in ``main`` can still proceed.
    """
    print("Testing Enhanced Tokenizer...")
    try:
        from enhanced_tokenizer_minimal import MinimalEnhancedTokenizer

        tokenizer = MinimalEnhancedTokenizer()

        results = []
        total_time = 0.0
        total_tokens = 0

        for text in TEST_TEXTS:
            # perf_counter is monotonic and higher-resolution than time.time(),
            # which matters for sub-millisecond tokenization timings.
            start_time = time.perf_counter()
            # asyncio.run creates a fresh event loop per call and guarantees
            # it is closed even on error — replaces the manual
            # new_event_loop / set_event_loop / close bookkeeping.
            result = asyncio.run(tokenizer.tokenize(text))
            processing_time = time.perf_counter() - start_time

            results.append({
                "text": text[:50] + "...",
                "tokens": result.token_count,
                "time": processing_time,
                "type": result.semantic_features.get("content_type", "unknown"),
                "entities": len(result.entities),
                "math": len(result.math_expressions),
            })
            total_time += processing_time
            total_tokens += result.token_count

        return {
            "name": "Enhanced Advanced Tokenizer",
            "total_tokens": total_tokens,
            "total_time": total_time,
            # Guard against division by zero when timings round to 0.
            "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
            "features": {
                "semantic_embeddings": True,
                "mathematical_processing": True,
                "named_entity_recognition": True,
                "fractal_analysis": True,
                "content_type_detection": True,
            },
            "tests": results,
        }
    except Exception as e:
        # The enhanced tokenizer is optional; report the failure as data
        # rather than crashing the whole benchmark.
        return {"name": "Enhanced Tokenizer", "error": str(e)}


def benchmark_basic_tokenizer():
    """Benchmark plain whitespace tokenization (str.split) as a baseline.

    Returns the same result-dict shape as :func:`benchmark_enhanced_tokenizer`
    so the two can be compared field-for-field in ``main``.
    """
    print("Testing Basic Tokenizer...")

    results = []
    total_time = 0.0
    total_tokens = 0

    for text in TEST_TEXTS:
        start_time = time.perf_counter()
        token_count = len(text.split())
        processing_time = time.perf_counter() - start_time

        results.append({
            "text": text[:50] + "...",
            "tokens": token_count,
            "time": processing_time,
            "type": "basic",
            "entities": 0,  # str.split has no NER
            "math": 0,      # ...and no math-expression detection
        })
        total_time += processing_time
        total_tokens += token_count

    return {
        "name": "Basic Python Tokenizer",
        "total_tokens": total_tokens,
        "total_time": total_time,
        "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
        "features": {
            "semantic_embeddings": False,
            "mathematical_processing": False,
            "named_entity_recognition": False,
            "fractal_analysis": False,
            "content_type_detection": False,
        },
        "tests": results,
    }


def main():
    """Run both benchmarks, print a comparison, and save it to JSON."""
    # NOTE(review): the decorative prefixes below (e.g. "šŸš€") look like
    # UTF-8 emoji mis-decoded as cp1252; preserved verbatim — confirm
    # intended glyphs against the original source encoding.
    print("šŸš€ LLM Benchmark Comparison")
    print("=" * 40)

    # Run benchmarks
    enhanced_results = benchmark_enhanced_tokenizer()
    basic_results = benchmark_basic_tokenizer()

    # Create comparison. .get(...) defaults keep this robust when the
    # enhanced benchmark returned its error-dict shape.
    comparison = {
        "timestamp": datetime.now().isoformat(),
        "benchmarks": [enhanced_results, basic_results],
        "summary": {
            "enhanced_speed": enhanced_results.get("tokens_per_second", 0),
            "basic_speed": basic_results.get("tokens_per_second", 0),
            "enhanced_features": sum(enhanced_results.get("features", {}).values()),
            "basic_features": sum(basic_results.get("features", {}).values()),
        },
    }

    # Print results
    print("\nšŸ“Š Results:")
    print(f"Enhanced Tokenizer: {enhanced_results.get('tokens_per_second', 0):.1f} tokens/sec")
    print(f"Basic Tokenizer: {basic_results.get('tokens_per_second', 0):.1f} tokens/sec")
    print(f"Enhanced Features: {comparison['summary']['enhanced_features']}/5")
    print(f"Basic Features: {comparison['summary']['basic_features']}/5")

    # Save results; explicit encoding avoids platform-default surprises.
    with open("benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(comparison, f, indent=2)

    print("\nāœ… Benchmark complete! Results saved to benchmark_results.json")

    # Recommendations
    print("\nšŸ’” Recommendations:")
    if enhanced_results.get("tokens_per_second", 0) > basic_results.get("tokens_per_second", 0):
        print("• Enhanced tokenizer is faster than basic")
    else:
        print("• Basic tokenizer is faster (expected due to complexity)")

    if comparison['summary']['enhanced_features'] > comparison['summary']['basic_features']:
        print("• Enhanced tokenizer offers significantly more features")
        print("• Enhanced tokenizer is best for production AI applications")
        print("• Use enhanced tokenizer for multi-modal content processing")


if __name__ == "__main__":
    main()