#!/usr/bin/env python3
"""
CPU-optimized benchmark script for Ursa Minor Smashed model
"""

import argparse
import time

import psutil
import tiktoken
import torch

from inference_cpu import generate_direct, load_model_direct

# Shared GPT-2 tokenizer for approximate token counting; built once instead of
# inside every benchmark loop.
ENC = tiktoken.get_encoding("gpt2")


def count_generated_tokens(result, prompt):
    """Approximate the number of newly generated tokens.

    Assumes generate_direct() returns the prompt followed by the completion,
    so subtracting the prompt's token count isolates the generated portion.
    """
    return len(ENC.encode(result)) - len(ENC.encode(prompt))


def benchmark_generation(model, num_runs=3, prompt="The quick brown fox", max_tokens=80):
    """Benchmark text generation performance on CPU."""
    print(f"šŸš€ Running {num_runs} generation benchmarks on CPU...")
    print(f"šŸ“ Prompt: '{prompt}'")
    print(f"šŸŽÆ Max tokens: {max_tokens}")
    print("-" * 50)

    times = []
    token_counts = []

    for i in range(num_runs):
        print(f"Run {i+1}/{num_runs}...", end=" ")

        start_time = time.time()
        result = generate_direct(
            model,
            prompt,
            max_new_tokens=max_tokens,
            temperature=0.8,
            top_k=30,  # Lower for CPU efficiency
            top_p=0.9,
        )
        end_time = time.time()

        generation_time = end_time - start_time
        generated_tokens = count_generated_tokens(result, prompt)

        times.append(generation_time)
        token_counts.append(generated_tokens)

        tokens_per_second = generated_tokens / generation_time
        print(f"⚔ {tokens_per_second:.1f} tokens/sec "
              f"({generation_time:.2f}s, {generated_tokens} tokens)")

    # Aggregate statistics across runs
    avg_time = sum(times) / len(times)
    avg_tokens = sum(token_counts) / len(token_counts)
    avg_tokens_per_sec = avg_tokens / avg_time

    print("\nšŸ“Š CPU Benchmark Results:")
    print("-" * 30)
    print(f"Average generation time: {avg_time:.2f} seconds")
    print(f"Average tokens generated: {avg_tokens:.1f}")
    print(f"Average tokens/second: {avg_tokens_per_sec:.1f}")
    print(f"Best tokens/second: "
          f"{max(token_counts[i] / times[i] for i in range(len(times))):.1f}")

    # CPU info
    process = psutil.Process()
    memory_info = process.memory_info()
    print(f"CPU Memory Usage: {memory_info.rss / 1024**3:.2f} GB")
    print(f"CPU Threads Used: {torch.get_num_threads()}")


def benchmark_memory_usage(model):
    """Benchmark memory usage on CPU."""
    print("\n🧠 CPU Memory Usage Analysis:")
    print("-" * 30)

    process = psutil.Process()
    baseline_memory = process.memory_info().rss
    print(f"Baseline memory: {baseline_memory / 1024**3:.3f} GB")

    # Test different sequence lengths (kept small for CPU)
    test_lengths = [25, 50, 100, 150]

    for length in test_lengths:
        prompt = "Test prompt for memory benchmark " * 3

        start_memory = process.memory_info().rss
        # Output text is discarded; only the memory delta is measured.
        generate_direct(model, prompt, max_new_tokens=length, temperature=0.8)
        peak_memory = process.memory_info().rss

        memory_increase = peak_memory - start_memory
        print(f"Tokens {length:3d}: +{memory_increase / 1024**2:.1f} MB "
              f"(Peak: {peak_memory / 1024**3:.3f} GB)")


def benchmark_different_parameters(model):
    """Benchmark different generation parameters on CPU."""
    print("\nāš™ļø CPU Parameter Performance Comparison:")
    print("-" * 40)

    prompt = "Artificial intelligence is revolutionizing"
    base_params = {"max_new_tokens": 80}  # Lower for CPU

    test_configs = [
        {"name": "Conservative", "temperature": 0.3, "top_k": 15, "top_p": 0.8},
        {"name": "Balanced", "temperature": 0.7, "top_k": 30, "top_p": 0.9},
        {"name": "Creative", "temperature": 1.0, "top_k": 50, "top_p": 0.95},
        {"name": "High Top-K", "temperature": 0.8, "top_k": 80, "top_p": 0.9},
    ]

    for config in test_configs:
        # Merge the base parameters with this config, dropping the "name" key.
        params = {**base_params, **{k: v for k, v in config.items() if k != "name"}}

        print(f"\n{config['name']} settings:", end=" ")

        start_time = time.time()
        result = generate_direct(model, prompt, **params)
        end_time = time.time()

        generated_tokens = count_generated_tokens(result, prompt)
        tokens_per_sec = generated_tokens / (end_time - start_time)
        print(f"⚔ {tokens_per_sec:.1f} tokens/sec")


def benchmark_cpu_optimization(model):
    """Test different CPU optimization settings."""
    print("\nšŸ”§ CPU Optimization Tests:")
    print("-" * 30)

    prompt = "The future of computing involves"
    test_params = {"max_new_tokens": 50, "temperature": 0.8}

    # Test different intra-op thread counts
    original_threads = torch.get_num_threads()
    thread_counts = [1, 2, 4, original_threads]

    for threads in thread_counts:
        torch.set_num_threads(threads)
        print(f"\nTesting with {threads} threads:", end=" ")

        start_time = time.time()
        result = generate_direct(model, prompt, **test_params)
        end_time = time.time()

        generated_tokens = count_generated_tokens(result, prompt)
        tokens_per_sec = generated_tokens / (end_time - start_time)
        print(f"⚔ {tokens_per_sec:.1f} tokens/sec")

    # Restore original thread count
    torch.set_num_threads(original_threads)


def main():
    parser = argparse.ArgumentParser(description="Benchmark Ursa Minor Smashed model on CPU")
    parser.add_argument("--model", type=str, default="model_optimized.pt",
                        help="Path to model checkpoint")
    parser.add_argument("--runs", type=int, default=3,
                        help="Number of benchmark runs (lower for CPU)")
    parser.add_argument("--max-tokens", type=int, default=80,
                        help="Maximum tokens to generate (optimized for CPU)")
    parser.add_argument("--prompt", type=str, default="The future of artificial intelligence",
                        help="Prompt for benchmarking")
    parser.add_argument("--memory-test", action="store_true",
                        help="Run memory usage tests")
    parser.add_argument("--param-test", action="store_true",
                        help="Test different parameters")
    parser.add_argument("--cpu-optimization", action="store_true",
                        help="Test CPU optimization settings")

    args = parser.parse_args()

    print("šŸ’» CPU Benchmark for Ursa Minor Smashed")
    print("=" * 50)
    print(f"CPU: {psutil.cpu_count()} cores")
    print(f"CPU Threads: {torch.get_num_threads()}")
    print(f"PyTorch Version: {torch.__version__}")
    print(f"Available Memory: {psutil.virtual_memory().total / 1024**3:.1f} GB")
    print()

    # Load model
    print("Loading model on CPU...")
    model = load_model_direct(args.model)
    print("āœ… Model loaded!")

    # Run basic benchmark
    benchmark_generation(model, args.runs, args.prompt, args.max_tokens)

    # Optional test suites
    if args.memory_test:
        benchmark_memory_usage(model)
    if args.param_test:
        benchmark_different_parameters(model)
    if args.cpu_optimization:
        benchmark_cpu_optimization(model)

    print("\nšŸŽ‰ CPU Benchmarking complete!")


if __name__ == "__main__":
    main()
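
# Example invocations, as a sketch only. The filename benchmark_cpu.py is an
# assumption (not confirmed by the repo); model_optimized.pt is the script's
# own default checkpoint path.
#
#   python benchmark_cpu.py
#   python benchmark_cpu.py --runs 5 --max-tokens 120 --prompt "Once upon a time"
#   python benchmark_cpu.py --model model_optimized.pt --memory-test --param-test
#   python benchmark_cpu.py --cpu-optimization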