#!/usr/bin/env python3
"""

CUDA-optimized benchmark script for Ursa Minor Smashed model

"""

import argparse
import time

import tiktoken
import torch

from inference_cuda import generate_direct, load_model_direct
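
# Shared GPT-2 tokenizer plus a small helper for the approximate token counting that the
# benchmarks below repeat. Assumption: the model uses the GPT-2 vocabulary, which is what
# the original per-call tiktoken.get_encoding("gpt2") lookups implied.
ENC = tiktoken.get_encoding("gpt2")

def count_generated_tokens(prompt: str, result: str) -> int:
    """Approximate the number of newly generated tokens by re-encoding the text."""
    return len(ENC.encode(result)) - len(ENC.encode(prompt))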

def benchmark_generation(model, num_runs=5, prompt="The quick brown fox", max_tokens=100):
    """Benchmark text generation performance on CUDA."""
    
    print(f"🚀 Running {num_runs} generation benchmarks on CUDA...")
    print(f"📝 Prompt: '{prompt}'")
    print(f"🎯 Max tokens: {max_tokens}")
    print("-" * 50)
    
    times = []
    token_counts = []
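
    # Warm-up pass (an added assumption, not in the original script): the first CUDA
    # generation often pays one-time costs such as kernel selection and allocator
    # growth, so run one short untimed generation before the measured loop.
    generate_direct(model, prompt, max_new_tokens=8, temperature=0.8)
    torch.cuda.synchronize()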
    
    for i in range(num_runs):
        print(f"Run {i+1}/{num_runs}...", end=" ")
        
        # Synchronize so the timer brackets completed GPU work, not queued kernels
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        
        result = generate_direct(
            model,
            prompt,
            max_new_tokens=max_tokens,
            temperature=0.8,
            top_k=50,  # Wider top-k than a CPU run would favor; cheap on CUDA
            top_p=0.9
        )
        
        torch.cuda.synchronize()
        generation_time = time.perf_counter() - start_time
        
        # Count tokens in the generated text (approximate, via re-encoding)
        generated_tokens = count_generated_tokens(prompt, result)
        
        times.append(generation_time)
        token_counts.append(generated_tokens)
        
        tokens_per_second = generated_tokens / generation_time
        print(f"⚡ {tokens_per_second:.1f} tokens/sec ({generation_time:.2f}s, {generated_tokens} tokens)")
    
    # Calculate statistics
    avg_time = sum(times) / len(times)
    avg_tokens = sum(token_counts) / len(token_counts)
    avg_tokens_per_sec = avg_tokens / avg_time
    
    print("\n📊 CUDA Benchmark Results:")
    print("-" * 30)
    print(f"Average generation time: {avg_time:.2f} seconds")
    print(f"Average tokens generated: {avg_tokens:.1f}")
    print(f"Average tokens/second: {avg_tokens_per_sec:.1f}")
    print(f"Best tokens/second: {max(n / t for n, t in zip(token_counts, times)):.1f}")
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

def benchmark_memory_usage(model):
    """Benchmark memory usage on CUDA."""
    
    print("\n🧠 CUDA Memory Usage Analysis:")
    print("-" * 30)
    
    # Clear cache
    torch.cuda.empty_cache()
    baseline_memory = torch.cuda.memory_allocated()
    
    print(f"Baseline GPU memory: {baseline_memory / 1024**3:.3f} GB")
    
    # Test different sequence lengths
    test_lengths = [50, 100, 200, 500]
    
    for length in test_lengths:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        
        # Generate with a fixed prompt at the target length
        prompt = "Test prompt for memory benchmark " * 5
        
        start_memory = torch.cuda.memory_allocated()
        
        generate_direct(
            model,
            prompt,
            max_new_tokens=length,
            temperature=0.8
        )
        
        # Read the high-water mark since the reset; memory_allocated() after the call
        # would miss transient peaks freed during generation.
        peak_memory = torch.cuda.max_memory_allocated()
        memory_increase = peak_memory - start_memory
        
        print(f"Tokens {length:3d}: +{memory_increase / 1024**2:.1f} MB (Peak: {peak_memory / 1024**3:.3f} GB)")

def benchmark_different_parameters(model):
    """Benchmark different generation parameter presets on CUDA."""
    
    print("\n⚙️ CUDA Parameter Performance Comparison:")
    print("-" * 40)
    
    prompt = "Artificial intelligence is revolutionizing"
    base_params = {"max_new_tokens": 100}
    
    test_configs = [
        {"name": "Conservative", "temperature": 0.3, "top_k": 20, "top_p": 0.8},
        {"name": "Balanced", "temperature": 0.7, "top_k": 50, "top_p": 0.9},
        {"name": "Creative", "temperature": 1.0, "top_k": 100, "top_p": 0.95},
        {"name": "High Top-K", "temperature": 0.8, "top_k": 200, "top_p": 0.9},
    ]
    
    for config in test_configs:
        params = {**base_params, **{k: v for k, v in config.items() if k != "name"}}
        
        print(f"\n{config['name']} settings:", end=" ")
        
        # Same synchronized timing as the main benchmark
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        result = generate_direct(model, prompt, **params)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        
        generated_tokens = count_generated_tokens(prompt, result)
        tokens_per_sec = generated_tokens / elapsed
        print(f"⚡ {tokens_per_sec:.1f} tokens/sec")

def main():
    parser = argparse.ArgumentParser(description="Benchmark Ursa Minor Smashed model on CUDA")
    parser.add_argument("--model", type=str, default="model_optimized.pt",
                        help="Path to model checkpoint")
    parser.add_argument("--runs", type=int, default=5,
                        help="Number of benchmark runs")
    parser.add_argument("--max-tokens", type=int, default=100,
                        help="Maximum tokens to generate")
    parser.add_argument("--prompt", type=str, default="The future of artificial intelligence",
                        help="Prompt for benchmarking")
    parser.add_argument("--memory-test", action="store_true",
                        help="Run memory usage tests")
    parser.add_argument("--param-test", action="store_true",
                        help="Test different parameters")
    
    args = parser.parse_args()
    
    if not torch.cuda.is_available():
        print("❌ ERROR: CUDA is not available. Use benchmark_cpu.py for CPU benchmarking.")
        return
    
    print("🔥 CUDA Benchmark for Ursa Minor Smashed")
    print("=" * 50)
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")
    print()
    
    # Load model
    print("Loading model on CUDA...")
    model = load_model_direct(args.model)
    print("✅ Model loaded!")
    
    # Run basic benchmark
    benchmark_generation(model, num_runs=args.runs, prompt=args.prompt, max_tokens=args.max_tokens)
    
    # Run memory test if requested
    if args.memory_test:
        benchmark_memory_usage(model)
    
    # Run parameter test if requested
    if args.param_test:
        benchmark_different_parameters(model)
    
    print("\n🎉 CUDA benchmarking complete!")

if __name__ == "__main__":
    main()