import os
import pickle
import sys
from typing import Any, Optional, Tuple

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score

# Put the parent of this file's directory on the import path so that the
# ``utils`` package imported below can be resolved when this file is run
# directly as a script.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.preprocess import FlightDataPreprocessor
from utils.features import FeatureEngineer


class FlightDelayModel:
    """Train, persist, reload and query an XGBoost flight-delay classifier.

    Attributes:
        preprocessor: ``FlightDataPreprocessor`` used to obtain training data.
        feature_engineer: ``FeatureEngineer`` used to build, encode, select
            and scale features for both training and prediction.
        model: The fitted ``xgb.XGBClassifier``, or ``None`` until a model
            has been trained or loaded.
        feature_names: Ordered feature names the model was trained on, or
            ``None`` until a model has been trained or loaded.
        model_metrics: Dict of evaluation results from the last training run
            (keys: ``accuracy``, ``auc``, ``classification_report``,
            ``confusion_matrix``).
        model_dir: Directory that artifacts are written to and read from.
    """

    def __init__(self, model_dir: str = 'model'):
        """Create an untrained model wrapper.

        Args:
            model_dir: Directory for saved artifacts. A relative path is
                resolved against the current working directory at the time
                of saving/loading. Defaults to ``'model'``.
        """
        self.preprocessor = FlightDataPreprocessor()
        self.feature_engineer = FeatureEngineer()
        self.model = None
        self.feature_names = None
        self.model_metrics = {}
        self.model_dir = model_dir

    def _artifact_path(self, filename: str) -> str:
        """Return the path of *filename* inside the artifact directory."""
        return os.path.join(self.model_dir, filename)

    def _dump_pickle(self, filename: str, obj: Any) -> None:
        """Pickle *obj* to *filename* inside the artifact directory."""
        with open(self._artifact_path(filename), 'wb') as f:
            pickle.dump(obj, f)

    def _load_pickle(self, filename: str) -> Any:
        """Unpickle and return the object stored in *filename*.

        NOTE(security): ``pickle.load`` can execute arbitrary code. Only
        point ``model_dir`` at artifacts produced by a trusted training run.
        """
        with open(self._artifact_path(filename), 'rb') as f:
            return pickle.load(f)

    def train_model(self) -> bool:
        """Train the flight delay prediction model.

        Loads the processed data, builds the train/test split through the
        feature engineer, fits an XGBoost classifier, prints and stores
        evaluation metrics, and saves the model and related artifacts.

        Returns:
            ``True`` when training and saving completed, ``False`` when the
            preprocessor returned no data.
        """
        print("Loading and preprocessing data...")

        # Get processed data (the weather data is not used by this method).
        data, airlines_mapping, airports_mapping, _weather_data = (
            self.preprocessor.get_processed_data()
        )

        if data is None:
            print("Failed to load data")
            return False

        print(f"Data shape: {data.shape}")
        print(f"Delay rate: {data['IS_DELAYED'].mean():.2%}")

        # Prepare data for modeling
        print("Engineering features...")
        X_train, X_test, y_train, y_test, feature_names = (
            self.feature_engineer.prepare_data_for_modeling(data)
        )
        self.feature_names = feature_names

        print(f"Training features: {len(feature_names)}")
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Train model - using XGBoost for better performance
        print("Training XGBoost model...")
        self.model = xgb.XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss',
        )
        self.model.fit(X_train, y_train)

        # Evaluate model
        print("Evaluating model...")
        y_pred = self.model.predict(X_test)
        # Column 1 of predict_proba is the probability of the positive class.
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_pred_proba)

        self.model_metrics = {
            'accuracy': accuracy,
            'auc': auc_score,
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
        }

        print(f"Accuracy: {accuracy:.3f}")
        print(f"AUC Score: {auc_score:.3f}")

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.model.feature_importances_,
        }).sort_values('importance', ascending=False)

        print("\nTop 10 Important Features:")
        print(feature_importance.head(10))

        # Save model and artifacts
        self.save_model(airlines_mapping, airports_mapping, feature_importance)

        return True

    def save_model(self, airlines_mapping, airports_mapping, feature_importance) -> None:
        """Save the trained model and related artifacts.

        Writes the model, feature engineer, both mappings, the feature names
        and the metrics as pickle files, and the feature importance table as
        ``feature_importance.csv``, all inside ``self.model_dir``.

        Args:
            airlines_mapping: Airline mapping object to persist as-is.
            airports_mapping: Airport mapping object to persist as-is.
            feature_importance: DataFrame written to CSV without its index.
        """
        model_dir = self.model_dir

        # The directory may not exist on a fresh checkout; without this the
        # first open(..., 'wb') below raises FileNotFoundError.
        os.makedirs(model_dir, exist_ok=True)

        # Save model
        self._dump_pickle('model.pkl', self.model)

        # Save feature engineer
        self._dump_pickle('feature_engineer.pkl', self.feature_engineer)

        # Save mappings
        self._dump_pickle('airlines_mapping.pkl', airlines_mapping)
        self._dump_pickle('airports_mapping.pkl', airports_mapping)

        # Save feature names
        self._dump_pickle('feature_names.pkl', self.feature_names)

        # Save feature importance
        feature_importance.to_csv(
            self._artifact_path('feature_importance.csv'), index=False
        )

        # Save metrics
        self._dump_pickle('metrics.pkl', self.model_metrics)

        print(f"Model and artifacts saved to {model_dir}/")

    def load_model(self) -> Tuple[bool, Any, Any]:
        """Load the trained model and its artifacts from ``self.model_dir``.

        All artifacts are read before any attribute is assigned, so a
        failure part-way through leaves the instance exactly as it was
        instead of half-loaded.

        Returns:
            ``(True, airlines_mapping, airports_mapping)`` on success, or
            ``(False, None, None)`` if any artifact could not be loaded.
        """
        try:
            model = self._load_pickle('model.pkl')
            feature_engineer = self._load_pickle('feature_engineer.pkl')
            feature_names = self._load_pickle('feature_names.pkl')
            airlines_mapping = self._load_pickle('airlines_mapping.pkl')
            airports_mapping = self._load_pickle('airports_mapping.pkl')
            model_metrics = self._load_pickle('metrics.pkl')
        except Exception as e:
            # Deliberately broad: unpickling can fail in many ways (missing
            # file, truncated data, missing class/module) and callers rely on
            # the boolean status rather than on an exception.
            print(f"Error loading model: {e}")
            return False, None, None

        # Commit only once everything has loaded successfully.
        self.model = model
        self.feature_engineer = feature_engineer
        self.feature_names = feature_names
        self.model_metrics = model_metrics

        print("Model loaded successfully")
        return True, airlines_mapping, airports_mapping

    def predict_delay(self, input_data) -> Optional[dict]:
        """Predict flight delay probability for a single flight.

        If no model is in memory, one is loaded from ``self.model_dir``
        first.

        Args:
            input_data: A single record; it is wrapped in a one-row
                DataFrame via ``pd.DataFrame([input_data])``.

        Returns:
            A dict with ``delay_probability`` (built-in ``float``),
            ``prediction`` (``1`` if the probability is above 0.5, else
            ``0``) and ``prediction_label`` (``'Delayed'`` or ``'On Time'``),
            or ``None`` if no model could be loaded.
        """
        if self.model is None:
            success, _, _ = self.load_model()
            if not success:
                return None

        # Prepare input data
        input_df = pd.DataFrame([input_data])

        # Apply same preprocessing
        input_features = self.feature_engineer.create_features(input_df)
        input_encoded = self.feature_engineer.encode_categorical_features(input_features)
        X = self.feature_engineer.select_features(input_encoded)

        # Handle missing features: align to the training column order, adding
        # any absent feature as 0, then zero-fill remaining NaNs.
        X = X.reindex(columns=self.feature_names, fill_value=0).fillna(0)

        # Scale features
        # NOTE(review): assumes the feature engineer's scaler was fitted
        # during training and restored by load_model -- confirm in
        # FeatureEngineer.
        X_scaled = self.feature_engineer.scaler.transform(X)

        # Predict; cast the NumPy scalar to a built-in float so the result
        # can be serialised (e.g. with json.dumps).
        delay_probability = float(self.model.predict_proba(X_scaled)[0, 1])
        prediction = 1 if delay_probability > 0.5 else 0

        return {
            'delay_probability': delay_probability,
            'prediction': prediction,
            'prediction_label': 'Delayed' if prediction == 1 else 'On Time',
        }


if __name__ == "__main__":
    # Train the model
    flight_model = FlightDelayModel()
    success = flight_model.train_model()

    if success:
        print("\nModel training completed successfully!")
    else:
        print("\nModel training failed!")