File size: 2,372 Bytes
8296d63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# dynamic_gpu_predictor.py
from dynamic_predictor import DynamicPredictor
from batch_size_optimizer import BatchSizeOptimizer
from workload_scheduler import WorkloadScheduler
from performance_monitor import PerformanceMonitor

class DynamicGPUPredictor:
    """Single entry point over prediction, batch sizing, scheduling and monitoring.

    One ``DynamicPredictor`` is created and handed to a ``BatchSizeOptimizer``,
    a ``WorkloadScheduler`` and a ``PerformanceMonitor``. Every public method
    delegates to one or more of those components.
    """

    def __init__(self, model_path='models/gradient_boosting_model.joblib',
                 memory_limit_mb=8000, num_gpus=1):
        """Create the shared predictor and the components built on top of it.

        Args:
            model_path: Passed to ``DynamicPredictor``.
            memory_limit_mb: Passed to ``BatchSizeOptimizer`` together with
                the predictor.
            num_gpus: Passed to ``WorkloadScheduler`` together with the
                predictor.
        """
        shared_predictor = DynamicPredictor(model_path)
        self.predictor = shared_predictor

        # All three components receive the very same predictor instance.
        self.batch_optimizer = BatchSizeOptimizer(shared_predictor, memory_limit_mb)
        self.scheduler = WorkloadScheduler(shared_predictor, num_gpus)
        self.monitor = PerformanceMonitor(shared_predictor)

    def predict_and_optimize(self, model_features):
        """Pick a batch size, then predict execution time at that batch size.

        Args:
            model_features: Feature container supporting ``.copy()`` and item
                assignment (presumably a dict; confirm against callers).

        Returns:
            A dict with the keys ``'optimal_batch_size'``,
            ``'predicted_execution_time'`` and ``'estimated_memory_usage'``.
        """
        chosen_batch = self.batch_optimizer.find_optimal_batch_size(model_features)

        # Set the batch size on a copy; the caller's object is not assigned to here.
        features_at_batch = model_features.copy()
        features_at_batch['batch_size'] = chosen_batch

        execution_time = self.predictor.predict(features_at_batch)

        # The memory estimate is given the caller's original features and the
        # chosen batch size as separate arguments.
        memory_usage = self.batch_optimizer.estimate_memory_usage(
            model_features, chosen_batch)

        return {
            'optimal_batch_size': chosen_batch,
            'predicted_execution_time': execution_time,
            'estimated_memory_usage': memory_usage,
        }

    def schedule_model(self, model_features, job_id, priority=1):
        """Forward a job to the scheduler and return whatever it returns."""
        scheduled = self.scheduler.schedule_job(model_features, job_id, priority)
        return scheduled

    def record_actual_performance(self, model_name, batch_size, predicted_time, actual_time):
        """Record a measurement; ask the scheduler to rebalance on an anomaly.

        Returns:
            The value returned by the monitor's ``record_performance``.
        """
        anomaly_flag = self.monitor.record_performance(
            model_name, batch_size, predicted_time, actual_time)

        if not anomaly_flag:
            return anomaly_flag

        # A truthy result from the monitor triggers a workload rebalance.
        self.scheduler.rebalance_workload()
        return anomaly_flag

    def get_performance_insights(self, model_name, batch_size):
        """Return the monitor's performance trend for a model and batch size."""
        trend = self.monitor.get_performance_trend(model_name, batch_size)
        return trend