# Spaces: Sleeping | File size: 7,375 Bytes | 952f689
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.preprocess import FlightDataPreprocessor
from utils.features import FeatureEngineer
class FlightDelayModel:
    """Train, persist, reload and serve an XGBoost flight-delay classifier.

    The class wires together a ``FlightDataPreprocessor`` (data loading), a
    ``FeatureEngineer`` (feature creation, encoding, selection and scaling)
    and an ``xgboost.XGBClassifier``. Trained artifacts are written to and
    read from ``model_dir`` as pickle files plus one CSV.
    """

    # Default artifact directory (relative to the current working directory).
    # Kept as a class attribute so every instance always has a ``model_dir``.
    model_dir = 'model'

    # Logical names of the pickled artifacts; each is stored as "<name>.pkl".
    _ARTIFACT_NAMES = (
        'model',
        'feature_engineer',
        'feature_names',
        'airlines_mapping',
        'airports_mapping',
        'metrics',
    )

    def __init__(self, model_dir=None):
        """Create an untrained model wrapper.

        Args:
            model_dir: Directory used by ``save_model`` / ``load_model``.
                ``None`` (the default) keeps the historical ``'model'``
                directory.
        """
        self.preprocessor = FlightDataPreprocessor()
        self.feature_engineer = FeatureEngineer()
        self.model = None
        self.feature_names = None
        self.model_metrics = {}
        if model_dir is not None:
            self.model_dir = model_dir

    def train_model(self):
        """Train the flight delay prediction model.

        Loads the processed data, builds the train/test matrices, fits an
        XGBoost classifier, stores accuracy / AUC / classification report /
        confusion matrix in ``self.model_metrics`` and saves all artifacts.

        Returns:
            bool: ``True`` when training and saving finished, ``False`` when
            the preprocessor returned no data.
        """
        print("Loading and preprocessing data...")
        # The fourth element (weather data) is returned by the preprocessor
        # but is not used during training.
        data, airlines_mapping, airports_mapping, _weather_data = (
            self.preprocessor.get_processed_data()
        )
        if data is None:
            print("Failed to load data")
            return False

        print(f"Data shape: {data.shape}")
        print(f"Delay rate: {data['IS_DELAYED'].mean():.2%}")

        # NOTE(review): predict_delay() scales its inputs with
        # feature_engineer.scaler, so prepare_data_for_modeling presumably
        # returns already-scaled matrices -- confirm against FeatureEngineer.
        print("Engineering features...")
        X_train, X_test, y_train, y_test, feature_names = (
            self.feature_engineer.prepare_data_for_modeling(data)
        )
        self.feature_names = feature_names
        print(f"Training features: {len(feature_names)}")
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        print("Training XGBoost model...")
        self.model = xgb.XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss',
        )
        self.model.fit(X_train, y_train)

        print("Evaluating model...")
        y_pred = self.model.predict(X_test)
        # Column 1 holds the probability of the positive ("delayed") class.
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_pred_proba)
        self.model_metrics = {
            'accuracy': accuracy,
            'auc': auc_score,
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
        }
        print(f"Accuracy: {accuracy:.3f}")
        print(f"AUC Score: {auc_score:.3f}")

        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.model.feature_importances_,
        }).sort_values('importance', ascending=False)
        print("\nTop 10 Important Features:")
        print(feature_importance.head(10))

        self.save_model(airlines_mapping, airports_mapping, feature_importance)
        return True

    def save_model(self, airlines_mapping, airports_mapping, feature_importance):
        """Save the trained model and related artifacts to ``model_dir``.

        Args:
            airlines_mapping: Object pickled to ``airlines_mapping.pkl``.
            airports_mapping: Object pickled to ``airports_mapping.pkl``.
            feature_importance: Object with a ``to_csv`` method (a DataFrame
                in ``train_model``) written to ``feature_importance.csv``.
        """
        model_dir = self.model_dir
        # Create the target directory on first use; without this a fresh
        # checkout fails with FileNotFoundError on the first open().
        os.makedirs(model_dir, exist_ok=True)

        artifacts = {
            'model': self.model,
            'feature_engineer': self.feature_engineer,
            'feature_names': self.feature_names,
            'airlines_mapping': airlines_mapping,
            'airports_mapping': airports_mapping,
            'metrics': self.model_metrics,
        }
        for name, payload in artifacts.items():
            with open(os.path.join(model_dir, f'{name}.pkl'), 'wb') as f:
                pickle.dump(payload, f)

        feature_importance.to_csv(
            os.path.join(model_dir, 'feature_importance.csv'), index=False
        )
        print(f"Model and artifacts saved to {model_dir}/")

    def load_model(self):
        """Load the trained model and its artifacts from ``model_dir``.

        Returns:
            tuple: ``(True, airlines_mapping, airports_mapping)`` on success,
            ``(False, None, None)`` when any artifact cannot be read. On
            failure the instance state is left untouched.
        """
        model_dir = self.model_dir
        try:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file. Only load artifacts written by save_model() from a
            # trusted location.
            loaded = {}
            for name in self._ARTIFACT_NAMES:
                with open(os.path.join(model_dir, f'{name}.pkl'), 'rb') as f:
                    loaded[name] = pickle.load(f)
        except Exception as e:
            # Deliberately broad: unpickling can raise many unrelated error
            # types and callers rely on the boolean status instead.
            print(f"Error loading model: {e}")
            return False, None, None

        # Assign only after every artifact was read, so a partial failure
        # cannot leave a model paired with a stale feature engineer or
        # missing feature names.
        self.model = loaded['model']
        self.feature_engineer = loaded['feature_engineer']
        self.feature_names = loaded['feature_names']
        self.model_metrics = loaded['metrics']
        print("Model loaded successfully")
        return True, loaded['airlines_mapping'], loaded['airports_mapping']

    def predict_delay(self, input_data):
        """Predict flight delay probability for a single flight.

        Args:
            input_data: Mapping of raw field name to value; it is wrapped
                into a one-row DataFrame before feature engineering.

        Returns:
            dict | None: ``None`` when no model is in memory and loading
            fails; otherwise a dict with ``delay_probability`` (built-in
            ``float``), ``prediction`` (1 when the probability is strictly
            above 0.5, else 0) and ``prediction_label`` (``'Delayed'`` or
            ``'On Time'``).
        """
        if self.model is None:
            success, _, _ = self.load_model()
            if not success:
                return None

        input_df = pd.DataFrame([input_data])

        # Run the same feature pipeline steps the feature engineer exposes.
        input_features = self.feature_engineer.create_features(input_df)
        input_encoded = self.feature_engineer.encode_categorical_features(input_features)
        X = self.feature_engineer.select_features(input_encoded)

        # Align to the training column order in one step: columns missing
        # from the input are created as 0, extras are dropped, and remaining
        # NaNs become 0. reindex returns a new frame, so the object returned
        # by select_features is never mutated.
        X = X.reindex(columns=self.feature_names, fill_value=0).fillna(0)

        X_scaled = self.feature_engineer.scaler.transform(X)

        # Convert the NumPy scalar to a built-in float so the result can be
        # serialized (e.g. to JSON) without a custom encoder.
        delay_probability = float(self.model.predict_proba(X_scaled)[0, 1])
        prediction = 1 if delay_probability > 0.5 else 0
        return {
            'delay_probability': delay_probability,
            'prediction': prediction,
            'prediction_label': 'Delayed' if prediction == 1 else 'On Time',
        }
if __name__ == "__main__":
    # Script entry point: run one full training pass and report the outcome.
    trainer = FlightDelayModel()
    trained_ok = trainer.train_model()
    status_message = (
        "\nModel training completed successfully!"
        if trained_ok
        else "\nModel training failed!"
    )
    print(status_message)
|