File size: 7,375 Bytes
952f689
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.preprocess import FlightDataPreprocessor
from utils.features import FeatureEngineer

class FlightDelayModel:
    """Train, persist, reload and query a flight-delay classifier.

    The class wires the project's ``FlightDataPreprocessor`` and
    ``FeatureEngineer`` together with an ``xgboost.XGBClassifier``. All
    artifacts are stored as pickles (plus one CSV with the feature
    importances) inside ``model_dir``.
    """

    # Class-level fallback so the attribute also exists on instances that were
    # created without running __init__. A relative path is resolved against
    # the current working directory.
    model_dir = 'model'

    def __init__(self, model_dir='model'):
        """Create an untrained model wrapper.

        Args:
            model_dir: Directory that artifacts are written to and read from.
                Defaults to ``'model'``, the location that was previously
                hard-coded.
        """
        self.preprocessor = FlightDataPreprocessor()
        self.feature_engineer = FeatureEngineer()
        # Fitted classifier; stays None until train_model() or load_model().
        self.model = None
        # Column order the classifier was trained with.
        self.feature_names = None
        # Evaluation results of the most recent training run.
        self.model_metrics = {}
        self.model_dir = model_dir

    def train_model(self):
        """Train the flight delay prediction model.

        Pulls the processed data from the preprocessor, lets the feature
        engineer build the train/test matrices, fits an XGBoost classifier,
        evaluates it on the test split, prints a summary and finally persists
        all artifacts through ``save_model``.

        Returns:
            bool: ``False`` when the preprocessor returned no data, ``True``
            once training, evaluation and saving have finished.
        """
        print("Loading and preprocessing data...")

        # The fourth element (weather data) is not needed for training.
        data, airlines_mapping, airports_mapping, _weather_data = (
            self.preprocessor.get_processed_data()
        )

        if data is None:
            print("Failed to load data")
            return False

        print(f"Data shape: {data.shape}")
        print(f"Delay rate: {data['IS_DELAYED'].mean():.2%}")

        # Prepare data for modeling
        print("Engineering features...")
        X_train, X_test, y_train, y_test, feature_names = (
            self.feature_engineer.prepare_data_for_modeling(data)
        )

        self.feature_names = feature_names

        print(f"Training features: {len(feature_names)}")
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Train model - using XGBoost for better performance
        print("Training XGBoost model...")
        self.model = xgb.XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss'
        )

        self.model.fit(X_train, y_train)

        # Evaluate model
        print("Evaluating model...")
        y_pred = self.model.predict(X_test)
        # Column 1 holds the probability of the positive ("delayed") class.
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_pred_proba)

        self.model_metrics = {
            'accuracy': accuracy,
            'auc': auc_score,
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        }

        print(f"Accuracy: {accuracy:.3f}")
        print(f"AUC Score: {auc_score:.3f}")

        # Feature importance, most influential first
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        print("\nTop 10 Important Features:")
        print(feature_importance.head(10))

        # Save model and artifacts
        self.save_model(airlines_mapping, airports_mapping, feature_importance)

        return True

    def save_model(self, airlines_mapping, airports_mapping, feature_importance):
        """Save the trained model and related artifacts.

        Args:
            airlines_mapping: Picklable airline lookup, written to
                ``airlines_mapping.pkl``.
            airports_mapping: Picklable airport lookup, written to
                ``airports_mapping.pkl``.
            feature_importance: DataFrame written to
                ``feature_importance.csv`` without its index.

        Raises:
            OSError: If the directory or one of the files cannot be written.
        """
        model_dir = self.model_dir

        # Without this, the first run in a fresh checkout (or from another
        # working directory) fails with FileNotFoundError on the first open().
        os.makedirs(model_dir, exist_ok=True)

        pickled_artifacts = {
            'model.pkl': self.model,
            'feature_engineer.pkl': self.feature_engineer,
            'airlines_mapping.pkl': airlines_mapping,
            'airports_mapping.pkl': airports_mapping,
            'feature_names.pkl': self.feature_names,
            'metrics.pkl': self.model_metrics,
        }
        for filename, artifact in pickled_artifacts.items():
            with open(os.path.join(model_dir, filename), 'wb') as f:
                pickle.dump(artifact, f)

        # Save feature importance
        feature_importance.to_csv(os.path.join(model_dir, 'feature_importance.csv'), index=False)

        print(f"Model and artifacts saved to {model_dir}/")

    def load_model(self):
        """Load the trained model and its artifacts from ``model_dir``.

        The instance is only updated after every artifact has been read
        successfully, so a failed load never leaves it half-initialised.

        Returns:
            tuple: ``(True, airlines_mapping, airports_mapping)`` on success,
            ``(False, None, None)`` when any artifact could not be loaded.
        """
        model_dir = self.model_dir
        filenames = (
            'model.pkl',
            'feature_engineer.pkl',
            'feature_names.pkl',
            'airlines_mapping.pkl',
            'airports_mapping.pkl',
            'metrics.pkl',
        )

        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only point model_dir at artifacts produced by a trusted training run.
        loaded = {}
        try:
            for filename in filenames:
                with open(os.path.join(model_dir, filename), 'rb') as f:
                    loaded[filename] = pickle.load(f)
        except Exception as e:
            # Deliberately broad: unpickling can fail in many ways (missing
            # file, truncated data, missing class) and callers rely on the
            # boolean status instead of an exception.
            print(f"Error loading model: {e}")
            return False, None, None

        self.model = loaded['model.pkl']
        self.feature_engineer = loaded['feature_engineer.pkl']
        self.feature_names = loaded['feature_names.pkl']
        self.model_metrics = loaded['metrics.pkl']

        print("Model loaded successfully")
        return True, loaded['airlines_mapping.pkl'], loaded['airports_mapping.pkl']

    def predict_delay(self, input_data):
        """Predict flight delay probability for a single flight.

        Args:
            input_data: One record that ``pandas.DataFrame([input_data])`` can
                turn into a single-row frame (presumably a dict of raw flight
                fields; verify the expected keys against ``FeatureEngineer``).

        Returns:
            dict | None: ``None`` when no model is in memory and loading it
            fails. Otherwise a dict with ``delay_probability`` (plain
            ``float``), ``prediction`` (1 if the probability exceeds 0.5,
            else 0) and ``prediction_label`` (``'Delayed'`` / ``'On Time'``).
        """
        if self.model is None:
            success, _, _ = self.load_model()
            if not success:
                return None

        # Prepare input data
        input_df = pd.DataFrame([input_data])

        # Apply same preprocessing
        input_features = self.feature_engineer.create_features(input_df)
        input_encoded = self.feature_engineer.encode_categorical_features(input_features)
        X = self.feature_engineer.select_features(input_encoded)

        # Align with the training columns: features that are absent become 0,
        # extra columns are dropped and remaining NaNs are zero-filled. Using
        # reindex avoids writing into the frame returned by select_features.
        X = X.reindex(columns=self.feature_names, fill_value=0).fillna(0)

        # Scale features
        X_scaled = self.feature_engineer.scaler.transform(X)

        # Predict; float() turns the NumPy scalar into a plain Python float so
        # the result can be serialised (e.g. to JSON) without extra handling.
        delay_probability = float(self.model.predict_proba(X_scaled)[0, 1])
        prediction = 1 if delay_probability > 0.5 else 0

        return {
            'delay_probability': delay_probability,
            'prediction': prediction,
            'prediction_label': 'Delayed' if prediction == 1 else 'On Time'
        }

if __name__ == "__main__":
    # Script entry point: run one full training pass and report the outcome.
    trainer = FlightDelayModel()

    if not trainer.train_model():
        print("\nModel training failed!")
    else:
        print("\nModel training completed successfully!")