File size: 3,829 Bytes
04653e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""Benchmark script to measure and validate performance improvements."""
import time
import json
import torch
from pathlib import Path
import sys
sys.path.append('..')
from app.lora_model import OptimizedModelLoader
from app.config import Config


def benchmark_inference(model, tokenizer, prompts, num_runs=10):
    """Benchmark model inference speed over a set of prompts.

    Args:
        model: Causal LM exposing ``generate`` and a ``device`` attribute
            (e.g. a Hugging Face / PEFT model, possibly quantized).
        tokenizer: Matching tokenizer; invoked with ``return_tensors="pt"``.
        prompts: Iterable of prompt strings to benchmark.
        num_runs: Number of timed generations per prompt.

    Returns:
        dict with ``avg_latency`` / ``min_latency`` / ``max_latency``
        (seconds), ``avg_tokens`` (newly generated tokens per run), and
        ``tokens_per_second`` over all runs.

    Raises:
        ValueError: If ``prompts`` is empty or ``num_runs`` < 1, which
            would otherwise divide by zero during aggregation.
    """
    prompts = list(prompts)
    if not prompts or num_runs < 1:
        raise ValueError("benchmark_inference needs at least one prompt and num_runs >= 1")

    latencies = []
    tokens_generated = []

    for prompt in prompts:
        for _ in range(num_runs):
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump (NTP adjustments) and has coarse granularity on some OSes.
            start = time.perf_counter()

            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            prompt_len = inputs["input_ids"].shape[1]

            with torch.no_grad():
                # NOTE(review): temperature without do_sample=True is ignored
                # by transformers' generate (greedy decoding) — confirm intent.
                outputs = model.generate(
                    **inputs,
                    max_length=1024,
                    temperature=0.3,
                    num_return_sequences=1,
                    pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
                )

            latency = time.perf_counter() - start
            latencies.append(latency)
            # Count only NEW tokens: outputs includes the prompt, and counting
            # it inflated avg_tokens and tokens_per_second in the old version.
            tokens_generated.append(outputs.shape[1] - prompt_len)

    return {
        "avg_latency": sum(latencies) / len(latencies),
        "min_latency": min(latencies),
        "max_latency": max(latencies),
        "avg_tokens": sum(tokens_generated) / len(tokens_generated),
        "tokens_per_second": sum(tokens_generated) / sum(latencies)
    }


def _empty_cuda_cache():
    """Release cached CUDA memory between model loads.

    ``torch.cuda.empty_cache()`` raises on CPU-only torch builds, so it is
    guarded by an availability check; on CPU this is simply a no-op.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def run_benchmark():
    """Run comprehensive benchmark across three loader configurations.

    Loads the model three times — baseline, 8-bit quantized, and quantized
    with GPU offload — benchmarks each with the same prompt set, writes the
    raw numbers to ``benchmark_results.json``, and prints a summary with the
    latency improvement of the fully optimized configuration over baseline.
    """
    test_prompts = [
        "Fix the bug in this Python function that calculates factorial",
        "Optimize this code for better performance",
        "Add error handling to this database query function"
    ]

    results = {}

    # Benchmark 1: Base model without optimizations
    print("Benchmarking base model (no optimizations)...")
    Config.ENABLE_QUANTIZATION = False
    Config.USE_GPU_OFFLOAD = False
    model_base, tokenizer = OptimizedModelLoader.load_model()
    results["base"] = benchmark_inference(model_base, tokenizer, test_prompts, num_runs=5)
    del model_base
    _empty_cuda_cache()

    # Benchmark 2: With 8-bit quantization
    print("Benchmarking with 8-bit quantization...")
    Config.ENABLE_QUANTIZATION = True
    Config.USE_GPU_OFFLOAD = False
    model_quant, tokenizer = OptimizedModelLoader.load_model()
    results["quantized"] = benchmark_inference(model_quant, tokenizer, test_prompts, num_runs=5)
    del model_quant
    _empty_cuda_cache()

    # Benchmark 3: With quantization + GPU offloading
    print("Benchmarking with quantization + GPU offloading...")
    Config.ENABLE_QUANTIZATION = True
    Config.USE_GPU_OFFLOAD = True
    model_opt, tokenizer = OptimizedModelLoader.load_model()
    results["optimized"] = benchmark_inference(model_opt, tokenizer, test_prompts, num_runs=5)
    # Symmetric with the other configurations: release before summarizing.
    del model_opt
    _empty_cuda_cache()

    # Calculate improvements (positive % = optimized config is faster)
    baseline = results["base"]["avg_latency"]
    optimized = results["optimized"]["avg_latency"]
    improvement = ((baseline - optimized) / baseline) * 100

    results["improvement_percentage"] = improvement

    # Save results
    output_file = "benchmark_results.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    # Print summary
    print("\n" + "="*50)
    print("BENCHMARK RESULTS")
    print("="*50)
    print(f"Base Model Latency: {results['base']['avg_latency']:.3f}s")
    print(f"Optimized Latency: {results['optimized']['avg_latency']:.3f}s")
    print(f"Improvement: {improvement:.1f}%")
    print(f"Tokens/sec (base): {results['base']['tokens_per_second']:.1f}")
    print(f"Tokens/sec (optimized): {results['optimized']['tokens_per_second']:.1f}")
    print(f"\n✅ Results saved to {output_file}")


# Script entry point: execute the full benchmark suite when run directly
# (importing this module performs no work).
if __name__ == "__main__":
    run_benchmark()