# Flight-delay-Prediction / model/train_model.py
# Zayeemk's picture
# Rename train_model.py to model/train_model.py
# 3029f99 verified
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.preprocess import FlightDataPreprocessor
from utils.features import FeatureEngineer
class FlightDelayModel:
    """Train, persist, reload and query a flight-delay classifier.

    The class combines a ``FlightDataPreprocessor`` (data loading), a
    ``FeatureEngineer`` (feature creation, encoding, selection, scaling) and
    an XGBoost classifier. Artifacts are written to / read from ``MODEL_DIR``,
    which is resolved relative to the current working directory.
    """

    # Directory, relative to the current working directory, holding artifacts.
    MODEL_DIR = 'model'

    def __init__(self):
        self.preprocessor = FlightDataPreprocessor()
        self.feature_engineer = FeatureEngineer()
        self.model = None  # classifier; set by train_model() or load_model()
        self.feature_names = None  # names returned by prepare_data_for_modeling
        self.model_metrics = {}  # evaluation results of the last training run

    def train_model(self):
        """Train the flight delay prediction model.

        Loads the processed data, builds the train/test split through the
        feature engineer, fits an XGBoost classifier, evaluates it on the
        test split, prints a summary and saves all artifacts.

        Returns:
            bool: ``True`` when training and saving finished, ``False`` when
            the preprocessor returned no data. Exceptions raised while
            fitting, evaluating or saving are not caught here.
        """
        print("Loading and preprocessing data...")
        # Get processed data (the weather data is returned but not used here)
        data, airlines_mapping, airports_mapping, _weather_data = (
            self.preprocessor.get_processed_data()
        )
        if data is None:
            print("Failed to load data")
            return False
        print(f"Data shape: {data.shape}")
        print(f"Delay rate: {data['IS_DELAYED'].mean():.2%}")

        # Prepare data for modeling
        print("Engineering features...")
        X_train, X_test, y_train, y_test, feature_names = (
            self.feature_engineer.prepare_data_for_modeling(data)
        )
        self.feature_names = feature_names
        print(f"Training features: {len(feature_names)}")
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Train model - using XGBoost for better performance
        print("Training XGBoost model...")
        self.model = xgb.XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss',
        )
        self.model.fit(X_train, y_train)

        # Evaluate model on the held-out split
        print("Evaluating model...")
        y_pred = self.model.predict(X_test)
        # Column 1 of predict_proba is the probability of the positive class
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_pred_proba)
        self.model_metrics = {
            'accuracy': accuracy,
            'auc': auc_score,
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
        }
        print(f"Accuracy: {accuracy:.3f}")
        print(f"AUC Score: {auc_score:.3f}")

        # Feature importance, most important first
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.model.feature_importances_,
        }).sort_values('importance', ascending=False)
        print("\nTop 10 Important Features:")
        print(feature_importance.head(10))

        # Save model and artifacts
        self.save_model(airlines_mapping, airports_mapping, feature_importance)
        return True

    def _artifact_path(self, filename):
        """Return the path of *filename* inside the artifact directory."""
        return os.path.join(self.MODEL_DIR, filename)

    def _dump_pickle(self, obj, filename):
        """Pickle *obj* into *filename* inside the artifact directory."""
        with open(self._artifact_path(filename), 'wb') as f:
            pickle.dump(obj, f)

    def _load_pickle(self, filename):
        """Unpickle and return the object stored in *filename*."""
        # SECURITY: pickle.load can execute arbitrary code. Only load
        # artifacts from a directory whose contents you trust.
        with open(self._artifact_path(filename), 'rb') as f:
            return pickle.load(f)

    def save_model(self, airlines_mapping, airports_mapping, feature_importance):
        """Save the trained model and related artifacts.

        Args:
            airlines_mapping: Object pickled to ``airlines_mapping.pkl``.
            airports_mapping: Object pickled to ``airports_mapping.pkl``.
            feature_importance: Object with a ``to_csv`` method (a DataFrame
                in ``train_model``), written to ``feature_importance.csv``.

        Also pickles ``self.model``, ``self.feature_engineer``,
        ``self.feature_names`` and ``self.model_metrics``.
        """
        # Create the target directory so a fresh working directory does not
        # make the first open() fail with FileNotFoundError.
        os.makedirs(self.MODEL_DIR, exist_ok=True)

        # Save model
        self._dump_pickle(self.model, 'model.pkl')
        # Save feature engineer
        self._dump_pickle(self.feature_engineer, 'feature_engineer.pkl')
        # Save mappings
        self._dump_pickle(airlines_mapping, 'airlines_mapping.pkl')
        self._dump_pickle(airports_mapping, 'airports_mapping.pkl')
        # Save feature names
        self._dump_pickle(self.feature_names, 'feature_names.pkl')
        # Save feature importance
        feature_importance.to_csv(
            self._artifact_path('feature_importance.csv'), index=False
        )
        # Save metrics
        self._dump_pickle(self.model_metrics, 'metrics.pkl')
        print(f"Model and artifacts saved to {self.MODEL_DIR}/")

    def load_model(self):
        """Load the trained model and its artifacts from ``MODEL_DIR``.

        Returns:
            tuple: ``(True, airlines_mapping, airports_mapping)`` on success,
            ``(False, None, None)`` when any artifact could not be loaded.
            On failure the instance attributes are left untouched.
        """
        # Read everything into locals first so that a failure half-way
        # through never leaves the instance with a partially loaded state.
        try:
            model = self._load_pickle('model.pkl')
            feature_engineer = self._load_pickle('feature_engineer.pkl')
            feature_names = self._load_pickle('feature_names.pkl')
            airlines_mapping = self._load_pickle('airlines_mapping.pkl')
            airports_mapping = self._load_pickle('airports_mapping.pkl')
            model_metrics = self._load_pickle('metrics.pkl')
        except Exception as e:
            # Deliberately broad: callers rely on a False result rather than
            # an exception for every kind of load failure.
            print(f"Error loading model: {e}")
            return False, None, None

        self.model = model
        self.feature_engineer = feature_engineer
        self.feature_names = feature_names
        self.model_metrics = model_metrics
        print("Model loaded successfully")
        return True, airlines_mapping, airports_mapping

    def predict_delay(self, input_data):
        """Predict flight delay probability for a single record.

        Args:
            input_data: One record, passed to ``pd.DataFrame([input_data])``
                (e.g. a dict of column name -> value).

        Returns:
            dict | None: ``None`` when no model is available and loading
            fails; otherwise a dict with ``delay_probability`` (built-in
            ``float``), ``prediction`` (1 when the probability is above 0.5,
            else 0) and ``prediction_label`` (``'Delayed'`` / ``'On Time'``).
        """
        if self.model is None:
            success, _, _ = self.load_model()
            if not success:
                return None

        # Prepare input data
        input_df = pd.DataFrame([input_data])

        # Apply the same feature pipeline steps as the feature engineer offers
        input_features = self.feature_engineer.create_features(input_df)
        input_encoded = self.feature_engineer.encode_categorical_features(input_features)
        X = self.feature_engineer.select_features(input_encoded)

        # Align to the stored feature names: missing columns become 0, extra
        # columns are dropped, remaining NaNs are replaced by 0. reindex
        # returns a new frame, avoiding assignment into a possible slice.
        X = X.reindex(columns=self.feature_names, fill_value=0).fillna(0)

        # Scale features
        X_scaled = self.feature_engineer.scaler.transform(X)

        # Predict; convert the NumPy scalar to a plain float so the result
        # can be serialised (e.g. to JSON) without special handling.
        delay_probability = float(self.model.predict_proba(X_scaled)[0, 1])
        prediction = 1 if delay_probability > 0.5 else 0
        return {
            'delay_probability': delay_probability,
            'prediction': prediction,
            'prediction_label': 'Delayed' if prediction == 1 else 'On Time',
        }
if __name__ == "__main__":
    # Script entry point: run one training pass and report the outcome.
    trained = FlightDelayModel().train_model()
    if not trained:
        print("\nModel training failed!")
    else:
        print("\nModel training completed successfully!")