local_copilot/scripts/benchmark.py
"""Benchmark script to measure and validate performance improvements."""
import time
import json
import torch
from pathlib import Path
import sys
sys.path.append('..')
from app.lora_model import OptimizedModelLoader
from app.config import Config
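
# Optional warmup helper (not part of the original script): the first
# generate() call typically pays one-off costs (CUDA context setup, kernel
# selection, cache allocation) that would otherwise skew min/avg latency.
# A minimal sketch, assuming the same model/tokenizer interface used below;
# call it once per loaded model before benchmark_inference().
def warmup(model, tokenizer, prompt="def add(a, b):", steps=2):
    """Run a few untimed generations so timed runs measure steady-state speed."""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=64)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    for _ in range(steps):
        with torch.no_grad():
            model.generate(
                **inputs,
                max_new_tokens=16,
                pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            )
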
def benchmark_inference(model, tokenizer, prompts, num_runs=10):
    """Benchmark model inference speed."""
    latencies = []
    tokens_generated = []

    for prompt in prompts:
        for _ in range(num_runs):
            start = time.time()

            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=1024,
                    do_sample=True,  # required for temperature to take effect
                    temperature=0.3,
                    num_return_sequences=1,
                    pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
                )

            latency = time.time() - start
            latencies.append(latency)
            # outputs includes the prompt tokens; count only newly generated ones,
            # otherwise tokens_per_second is inflated by the prompt length.
            tokens_generated.append(outputs.shape[1] - inputs["input_ids"].shape[1])

    return {
        "avg_latency": sum(latencies) / len(latencies),
        "min_latency": min(latencies),
        "max_latency": max(latencies),
        "avg_tokens": sum(tokens_generated) / len(tokens_generated),
        "tokens_per_second": sum(tokens_generated) / sum(latencies),
    }
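
# Illustrative addition (not in the original benchmark): 8-bit quantization is
# primarily a memory optimization, so peak VRAM is worth recording alongside
# latency. A minimal sketch: call torch.cuda.reset_peak_memory_stats() before
# a timed run, then read the peak afterwards with this helper.
def peak_gpu_memory_mb():
    """Return peak allocated GPU memory in MiB since the last stats reset (0.0 on CPU)."""
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.max_memory_allocated() / (1024 ** 2)
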
def run_benchmark():
    """Run a comprehensive benchmark across optimization configurations."""
    test_prompts = [
        "Fix the bug in this Python function that calculates factorial",
        "Optimize this code for better performance",
        "Add error handling to this database query function",
    ]
    results = {}

    # Benchmark 1: Base model without optimizations
    print("Benchmarking base model (no optimizations)...")
    Config.ENABLE_QUANTIZATION = False
    Config.USE_GPU_OFFLOAD = False
    model_base, tokenizer = OptimizedModelLoader.load_model()
    results["base"] = benchmark_inference(model_base, tokenizer, test_prompts, num_runs=5)
    del model_base
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark 2: With 8-bit quantization
    print("Benchmarking with 8-bit quantization...")
    Config.ENABLE_QUANTIZATION = True
    Config.USE_GPU_OFFLOAD = False
    model_quant, tokenizer = OptimizedModelLoader.load_model()
    results["quantized"] = benchmark_inference(model_quant, tokenizer, test_prompts, num_runs=5)
    del model_quant
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark 3: With quantization + GPU offloading
    print("Benchmarking with quantization + GPU offloading...")
    Config.ENABLE_QUANTIZATION = True
    Config.USE_GPU_OFFLOAD = True
    model_opt, tokenizer = OptimizedModelLoader.load_model()
    results["optimized"] = benchmark_inference(model_opt, tokenizer, test_prompts, num_runs=5)

    # Calculate the latency improvement of the fully optimized setup over baseline
    baseline = results["base"]["avg_latency"]
    optimized = results["optimized"]["avg_latency"]
    improvement = ((baseline - optimized) / baseline) * 100
    results["improvement_percentage"] = improvement

    # Save results
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)

    # Print summary
    print("\n" + "=" * 50)
    print("BENCHMARK RESULTS")
    print("=" * 50)
    print(f"Base Model Latency: {results['base']['avg_latency']:.3f}s")
    print(f"Optimized Latency: {results['optimized']['avg_latency']:.3f}s")
    print(f"Improvement: {improvement:.1f}%")
    print(f"Tokens/sec (base): {results['base']['tokens_per_second']:.1f}")
    print(f"Tokens/sec (optimized): {results['optimized']['tokens_per_second']:.1f}")
    print(f"\n✅ Results saved to {output_file}")


if __name__ == "__main__":
    run_benchmark()