#!/usr/bin/env python3
"""

CPU-optimized benchmark script for Ursa Minor Smashed model

"""

import torch
import time
import argparse
import psutil
import tiktoken

from inference_cpu import generate_direct, load_model_direct

# Shared GPT-2 BPE tokenizer used for (approximate) token counting in all benchmarks.
enc = tiktoken.get_encoding("gpt2")
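
# Assumed API of the companion inference_cpu module (not shown here):
# load_model_direct(path) returns a ready-to-run CPU model, and
# generate_direct(model, prompt, max_new_tokens=..., temperature=..., top_k=..., top_p=...)
# returns the decoded text including the prompt, which is why the token counts
# below subtract the prompt tokens back out.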

def benchmark_generation(model, num_runs=3, prompt="The quick brown fox", max_tokens=80):
    """Benchmark text generation performance on CPU"""
    
    print(f"🚀 Running {num_runs} generation benchmarks on CPU...")
    print(f"📝 Prompt: '{prompt}'")
    print(f"🎯 Max tokens: {max_tokens}")
    print("-" * 50)
    
    times = []
    token_counts = []
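
    # Warm-up pass (not timed): the first generation often pays one-off costs
    # such as lazy initialization, so a short throwaway call is made first.
    generate_direct(model, prompt, max_new_tokens=8, temperature=0.8)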
    
    for i in range(num_runs):
        print(f"Run {i+1}/{num_runs}...", end=" ")
        
        start_time = time.time()
        
        result = generate_direct(
            model,
            prompt,
            max_new_tokens=max_tokens,
            temperature=0.8,
            top_k=30,  # Lower for CPU efficiency
            top_p=0.9
        )
        
        end_time = time.time()
        
        generation_time = end_time - start_time
        
        # Count generated tokens with the shared tokenizer (approximate:
        # total tokens in the output minus tokens in the prompt)
        total_tokens = len(enc.encode(result))
        prompt_tokens = len(enc.encode(prompt))
        generated_tokens = total_tokens - prompt_tokens
        
        times.append(generation_time)
        token_counts.append(generated_tokens)
        
        tokens_per_second = generated_tokens / generation_time
        print(f"⚡ {tokens_per_second:.1f} tokens/sec ({generation_time:.2f}s, {generated_tokens} tokens)")
    
    # Calculate statistics
    avg_time = sum(times) / len(times)
    avg_tokens = sum(token_counts) / len(token_counts)
    avg_tokens_per_sec = avg_tokens / avg_time
    
    print("\n📊 CPU Benchmark Results:")
    print("-" * 30)
    print(f"Average generation time: {avg_time:.2f} seconds")
    print(f"Average tokens generated: {avg_tokens:.1f}")
    print(f"Average tokens/second: {avg_tokens_per_sec:.1f}")
    print(f"Best tokens/second: {max(token_counts[i]/times[i] for i in range(len(times))):.1f}")
    
    # CPU info
    process = psutil.Process()
    memory_info = process.memory_info()
    print(f"CPU Memory Usage: {memory_info.rss / 1024**3:.2f} GB")
    print(f"CPU Threads Used: {torch.get_num_threads()}")

def benchmark_memory_usage(model):
    """Benchmark memory usage on CPU"""
    
    print("\n🧠 CPU Memory Usage Analysis:")
    print("-" * 30)
    
    process = psutil.Process()
    baseline_memory = process.memory_info().rss
    
    print(f"Baseline memory: {baseline_memory / 1024**3:.3f} GB")
    
    # Test different sequence lengths (smaller for CPU)
    test_lengths = [25, 50, 100, 150]
    
    # Fixed prompt so only the generation length varies between tests
    prompt = "Test prompt for memory benchmark " * 3

    for length in test_lengths:
        start_memory = process.memory_info().rss

        # Output is discarded; only the memory footprint matters here
        generate_direct(
            model,
            prompt,
            max_new_tokens=length,
            temperature=0.8
        )
        
        # RSS sampled after generation; transient peaks during the run may be higher
        end_memory = process.memory_info().rss
        memory_increase = end_memory - start_memory

        print(f"Tokens {length:3d}: +{memory_increase / 1024**2:.1f} MB (RSS: {end_memory / 1024**3:.3f} GB)")

def benchmark_different_parameters(model):
    """Benchmark different generation parameters on CPU"""
    
    print("\n⚙️ CPU Parameter Performance Comparison:")
    print("-" * 40)
    
    prompt = "Artificial intelligence is revolutionizing"
    base_params = {"max_new_tokens": 80}  # Lower for CPU
    
    test_configs = [
        {"name": "Conservative", "temperature": 0.3, "top_k": 15, "top_p": 0.8},
        {"name": "Balanced", "temperature": 0.7, "top_k": 30, "top_p": 0.9},
        {"name": "Creative", "temperature": 1.0, "top_k": 50, "top_p": 0.95},
        {"name": "High Top-K", "temperature": 0.8, "top_k": 80, "top_p": 0.9},
    ]
    
    for config in test_configs:
        params = {**base_params, **{k: v for k, v in config.items() if k != "name"}}
        
        print(f"\n{config['name']} settings:", end=" ")
        
        start_time = time.time()
        result = generate_direct(model, prompt, **params)
        end_time = time.time()
        
        # Count generated tokens with the shared tokenizer
        generated_tokens = len(enc.encode(result)) - len(enc.encode(prompt))
        
        tokens_per_sec = generated_tokens / (end_time - start_time)
        print(f"⚡ {tokens_per_sec:.1f} tokens/sec")

def benchmark_cpu_optimization(model):
    """Test different CPU optimization settings"""
    
    print("\n🔧 CPU Optimization Tests:")
    print("-" * 30)
    
    prompt = "The future of computing involves"
    test_params = {"max_new_tokens": 50, "temperature": 0.8}
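
    # Note: torch.set_num_threads controls intra-op parallelism only; inter-op
    # parallelism (torch.set_num_interop_threads) and environment variables such
    # as OMP_NUM_THREADS also influence CPU throughput but are left at their
    # defaults here.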
    
    # Test different thread counts (deduplicated in case the default is 1, 2, or 4)
    original_threads = torch.get_num_threads()
    thread_counts = sorted(set([1, 2, 4, original_threads]))

    try:
        for threads in thread_counts:
            torch.set_num_threads(threads)

            print(f"\nTesting with {threads} threads:", end=" ")

            start_time = time.time()
            result = generate_direct(model, prompt, **test_params)
            end_time = time.time()

            # Count generated tokens with the shared tokenizer
            generated_tokens = len(enc.encode(result)) - len(enc.encode(prompt))

            tokens_per_sec = generated_tokens / (end_time - start_time)
            print(f"⚡ {tokens_per_sec:.1f} tokens/sec")
    finally:
        # Restore the original thread count even if a run fails
        torch.set_num_threads(original_threads)

def main():
    parser = argparse.ArgumentParser(description="Benchmark Ursa Minor Smashed model on CPU")
    parser.add_argument("--model", type=str, default="model_optimized.pt",
                        help="Path to model checkpoint")
    parser.add_argument("--runs", type=int, default=3,
                        help="Number of benchmark runs (lower for CPU)")
    parser.add_argument("--max-tokens", type=int, default=80,
                        help="Maximum tokens to generate (optimized for CPU)")
    parser.add_argument("--prompt", type=str, default="The future of artificial intelligence",
                        help="Prompt for benchmarking")
    parser.add_argument("--memory-test", action="store_true",
                        help="Run memory usage tests")
    parser.add_argument("--param-test", action="store_true",
                        help="Test different parameters")
    parser.add_argument("--cpu-optimization", action="store_true",
                        help="Test CPU optimization settings")
    
    args = parser.parse_args()
    
    print("💻 CPU Benchmark for Ursa Minor Smashed")
    print("=" * 50)
    print(f"CPU: {psutil.cpu_count()} cores")
    print(f"CPU Threads: {torch.get_num_threads()}")
    print(f"PyTorch Version: {torch.__version__}")
    print(f"Available Memory: {psutil.virtual_memory().total / 1024**3:.1f} GB")
    print()
    
    # Load model
    print("Loading model on CPU...")
    model = load_model_direct(args.model)
    print("✅ Model loaded!")
    
    # Run basic benchmark
    benchmark_generation(model, args.runs, args.prompt, args.max_tokens)
    
    # Run memory test if requested
    if args.memory_test:
        benchmark_memory_usage(model)
    
    # Run parameter test if requested
    if args.param_test:
        benchmark_different_parameters(model)
    
    # Run CPU optimization test if requested
    if args.cpu_optimization:
        benchmark_cpu_optimization(model)
    
    print("\n🎉 CPU Benchmarking complete!")

if __name__ == "__main__":
    main()