| """Benchmark script to measure and validate performance improvements.""" | |
| import time | |
| import json | |
| import torch | |
| from pathlib import Path | |
| import sys | |
| sys.path.append('..') | |
| from app.lora_model import OptimizedModelLoader | |
| from app.config import Config | |


def benchmark_inference(model, tokenizer, prompts, num_runs=10):
    """Benchmark model inference speed."""
    latencies = []
    tokens_generated = []
    for prompt in prompts:
        for _ in range(num_runs):
            start = time.time()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=1024,
                    do_sample=True,  # temperature has no effect without sampling enabled
                    temperature=0.3,
                    num_return_sequences=1,
                    pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
                )
            latency = time.time() - start
            latencies.append(latency)
            # Count only newly generated tokens, excluding the prompt tokens echoed in the output.
            tokens_generated.append(outputs.shape[1] - inputs["input_ids"].shape[1])
    return {
        "avg_latency": sum(latencies) / len(latencies),
        "min_latency": min(latencies),
        "max_latency": max(latencies),
        "avg_tokens": sum(tokens_generated) / len(tokens_generated),
        "tokens_per_second": sum(tokens_generated) / sum(latencies),
    }


def run_benchmark():
    """Run a comprehensive benchmark across optimization configurations."""
    test_prompts = [
        "Fix the bug in this Python function that calculates factorial",
        "Optimize this code for better performance",
        "Add error handling to this database query function",
    ]

    results = {}

    # Benchmark 1: Base model without optimizations
    print("Benchmarking base model (no optimizations)...")
    Config.ENABLE_QUANTIZATION = False
    Config.USE_GPU_OFFLOAD = False
    model_base, tokenizer = OptimizedModelLoader.load_model()
    results["base"] = benchmark_inference(model_base, tokenizer, test_prompts, num_runs=5)
    del model_base
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark 2: With 8-bit quantization
    print("Benchmarking with 8-bit quantization...")
    Config.ENABLE_QUANTIZATION = True
    Config.USE_GPU_OFFLOAD = False
    model_quant, tokenizer = OptimizedModelLoader.load_model()
    results["quantized"] = benchmark_inference(model_quant, tokenizer, test_prompts, num_runs=5)
    del model_quant
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Benchmark 3: With quantization + GPU offloading
    print("Benchmarking with quantization + GPU offloading...")
    Config.ENABLE_QUANTIZATION = True
    Config.USE_GPU_OFFLOAD = True
    model_opt, tokenizer = OptimizedModelLoader.load_model()
    results["optimized"] = benchmark_inference(model_opt, tokenizer, test_prompts, num_runs=5)

    # Calculate the latency improvement of the fully optimized setup over the baseline
    baseline = results["base"]["avg_latency"]
    optimized = results["optimized"]["avg_latency"]
    improvement = ((baseline - optimized) / baseline) * 100
    results["improvement_percentage"] = improvement

    # Save results
    output_file = "benchmark_results.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    # Print summary
    print("\n" + "=" * 50)
    print("BENCHMARK RESULTS")
    print("=" * 50)
    print(f"Base Model Latency:     {results['base']['avg_latency']:.3f}s")
    print(f"Optimized Latency:      {results['optimized']['avg_latency']:.3f}s")
    print(f"Improvement:            {improvement:.1f}%")
    print(f"Tokens/sec (base):      {results['base']['tokens_per_second']:.1f}")
    print(f"Tokens/sec (optimized): {results['optimized']['tokens_per_second']:.1f}")
    print(f"\n✅ Results saved to {output_file}")


if __name__ == "__main__":
    run_benchmark()
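
# A minimal usage sketch (an illustration, not part of the original script):
# run this file directly, then reload the saved JSON for further analysis.
# The working directory is an assumption; the script writes
# benchmark_results.json wherever it is invoked.
#
#     import json
#     with open("benchmark_results.json") as f:
#         results = json.load(f)
#     print(f"Latency improvement: {results['improvement_percentage']:.1f}%")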