#!/usr/bin/env python3
"""
CUDA-optimized benchmark script for Ursa Minor Smashed model
"""
import argparse
import time

import tiktoken  # GPT-2 tokenizer, used for approximate token counts
import torch

from inference_cuda import generate_direct, load_model_direct


def benchmark_generation(model, num_runs=5, prompt="The quick brown fox", max_tokens=100):
    """Benchmark text generation performance on CUDA."""
    print(f"🚀 Running {num_runs} generation benchmarks on CUDA...")
    print(f"📝 Prompt: '{prompt}'")
    print(f"🎯 Max tokens: {max_tokens}")
    print("-" * 50)

    # Build the tokenizer once, outside the loop; counts are approximate
    # since the model may not use the exact GPT-2 vocabulary.
    enc = tiktoken.get_encoding("gpt2")
    times = []
    token_counts = []

    for i in range(num_runs):
        print(f"Run {i+1}/{num_runs}...", end=" ")
        start_time = time.time()
        result = generate_direct(
            model,
            prompt,
            max_new_tokens=max_tokens,
            temperature=0.8,
            top_k=50,  # a larger top-k is affordable on CUDA
            top_p=0.9
        )
        torch.cuda.synchronize()  # ensure queued kernels finish before stopping the clock
        end_time = time.time()
        generation_time = end_time - start_time

        # Count tokens in the generated text (approximate)
        total_tokens = len(enc.encode(result))
        prompt_tokens = len(enc.encode(prompt))
        generated_tokens = total_tokens - prompt_tokens

        times.append(generation_time)
        token_counts.append(generated_tokens)

        tokens_per_second = generated_tokens / generation_time
        print(f"⚡ {tokens_per_second:.1f} tokens/sec ({generation_time:.2f}s, {generated_tokens} tokens)")

    # Calculate statistics
    avg_time = sum(times) / len(times)
    avg_tokens = sum(token_counts) / len(token_counts)
    avg_tokens_per_sec = avg_tokens / avg_time

    print("\n📊 CUDA Benchmark Results:")
    print("-" * 30)
    print(f"Average generation time: {avg_time:.2f} seconds")
    print(f"Average tokens generated: {avg_tokens:.1f}")
    print(f"Average tokens/second: {avg_tokens_per_sec:.1f}")
    print(f"Best tokens/second: {max(tokens / t for tokens, t in zip(token_counts, times)):.1f}")
    print(f"GPU Memory Usage: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU Memory Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


def benchmark_memory_usage(model):
    """Benchmark memory usage on CUDA."""
    print("\n🧠 CUDA Memory Usage Analysis:")
    print("-" * 30)

    # Clear the cache so the baseline reflects only the loaded model
    torch.cuda.empty_cache()
    baseline_memory = torch.cuda.memory_allocated()
    print(f"Baseline GPU memory: {baseline_memory / 1024**3:.3f} GB")

    # Test different sequence lengths
    test_lengths = [50, 100, 200, 500]
    for length in test_lengths:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()  # so max_memory_allocated() reports this run's peak

        # Generate with a specific output length
        prompt = "Test prompt for memory benchmark " * 5
        start_memory = torch.cuda.memory_allocated()
        generate_direct(
            model,
            prompt,
            max_new_tokens=length,
            temperature=0.8
        )
        peak_memory = torch.cuda.max_memory_allocated()
        memory_increase = peak_memory - start_memory
        print(f"Tokens {length:3d}: +{memory_increase / 1024**2:.1f} MB (Peak: {peak_memory / 1024**3:.3f} GB)")


def benchmark_different_parameters(model):
    """Benchmark different generation parameters on CUDA."""
    print("\n⚙️ CUDA Parameter Performance Comparison:")
    print("-" * 40)

    prompt = "Artificial intelligence is revolutionizing"
    base_params = {"max_new_tokens": 100}
    enc = tiktoken.get_encoding("gpt2")

    test_configs = [
        {"name": "Conservative", "temperature": 0.3, "top_k": 20, "top_p": 0.8},
        {"name": "Balanced", "temperature": 0.7, "top_k": 50, "top_p": 0.9},
        {"name": "Creative", "temperature": 1.0, "top_k": 100, "top_p": 0.95},
        {"name": "High Top-K", "temperature": 0.8, "top_k": 200, "top_p": 0.9},
    ]

    for config in test_configs:
        params = {**base_params, **{k: v for k, v in config.items() if k != "name"}}
        print(f"\n{config['name']} settings:", end=" ")

        start_time = time.time()
        result = generate_direct(model, prompt, **params)
        torch.cuda.synchronize()  # ensure generation has finished before timing
        end_time = time.time()

        # Count generated tokens (approximate)
        generated_tokens = len(enc.encode(result)) - len(enc.encode(prompt))
        tokens_per_sec = generated_tokens / (end_time - start_time)
        print(f"⚡ {tokens_per_sec:.1f} tokens/sec")


def main():
    parser = argparse.ArgumentParser(description="Benchmark Ursa Minor Smashed model on CUDA")
    parser.add_argument("--model", type=str, default="model_optimized.pt",
                        help="Path to model checkpoint")
    parser.add_argument("--runs", type=int, default=5,
                        help="Number of benchmark runs")
    parser.add_argument("--max-tokens", type=int, default=100,
                        help="Maximum tokens to generate")
    parser.add_argument("--prompt", type=str, default="The future of artificial intelligence",
                        help="Prompt for benchmarking")
    parser.add_argument("--memory-test", action="store_true",
                        help="Run memory usage tests")
    parser.add_argument("--param-test", action="store_true",
                        help="Test different parameters")
    args = parser.parse_args()

    if not torch.cuda.is_available():
        print("❌ ERROR: CUDA is not available. Use benchmark_cpu.py for CPU benchmarking.")
        return

    print("🔥 CUDA Benchmark for Ursa Minor Smashed")
    print("=" * 50)
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")
    print()

    # Load model
    print("Loading model on CUDA...")
    model = load_model_direct(args.model)
    print("✅ Model loaded!")

    # Run basic benchmark
    benchmark_generation(model, args.runs, args.prompt, args.max_tokens)

    # Run memory test if requested
    if args.memory_test:
        benchmark_memory_usage(model)

    # Run parameter test if requested
    if args.param_test:
        benchmark_different_parameters(model)

    print("\n🎉 CUDA benchmarking complete!")


if __name__ == "__main__":
    main()