Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| import os | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score | |
| from sklearn.model_selection import cross_val_score | |
| import xgboost as xgb | |
| import sys | |
| import os | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from utils.preprocess import FlightDataPreprocessor | |
| from utils.features import FeatureEngineer | |
class FlightDelayModel:
    """Train, persist, load and serve an XGBoost flight-delay classifier.

    The class ties together the project's ``FlightDataPreprocessor`` (data
    loading) and ``FeatureEngineer`` (feature creation, encoding, selection
    and scaling) with an ``xgboost.XGBClassifier``.

    Attributes:
        preprocessor: ``FlightDataPreprocessor`` used by ``train_model``.
        feature_engineer: ``FeatureEngineer`` used for training and inference;
            replaced by the pickled copy when ``load_model`` succeeds.
        model: The fitted classifier, or ``None`` until trained/loaded.
        feature_names: Ordered feature names the model was trained on.
        model_metrics: Dict with ``accuracy``, ``auc``,
            ``classification_report`` and ``confusion_matrix`` entries.
        model_dir: Directory the artifacts are written to and read from.
    """

    # Default artifact directory. A relative path is resolved against the
    # current working directory, exactly as the hard-coded 'model' was before.
    model_dir = 'model'

    # Pickled artifacts, keyed by a logical name -> file name inside model_dir.
    # The order is the order in which load_model reads them.
    _ARTIFACT_FILES = {
        'model': 'model.pkl',
        'feature_engineer': 'feature_engineer.pkl',
        'feature_names': 'feature_names.pkl',
        'airlines_mapping': 'airlines_mapping.pkl',
        'airports_mapping': 'airports_mapping.pkl',
        'metrics': 'metrics.pkl',
    }

    def __init__(self, model_dir=None):
        """Create an untrained model wrapper.

        Args:
            model_dir: Optional directory for saved artifacts. When omitted
                the class default (``'model'``) is used.
        """
        self.preprocessor = FlightDataPreprocessor()
        self.feature_engineer = FeatureEngineer()
        self.model = None
        self.feature_names = None
        self.model_metrics = {}
        if model_dir is not None:
            self.model_dir = model_dir

    def train_model(self):
        """Train the flight delay prediction model.

        Loads the processed data, builds the train/test split through the
        feature engineer, fits an XGBoost classifier, evaluates it on the
        test split, prints a short report and saves all artifacts.

        Returns:
            ``True`` when training and saving finished, ``False`` when the
            preprocessor returned no data.
        """
        print("Loading and preprocessing data...")

        # Get processed data (the fourth element, weather data, is not used here)
        data, airlines_mapping, airports_mapping, _weather_data = (
            self.preprocessor.get_processed_data()
        )
        if data is None:
            print("Failed to load data")
            return False

        print(f"Data shape: {data.shape}")
        print(f"Delay rate: {data['IS_DELAYED'].mean():.2%}")

        # Prepare data for modeling
        print("Engineering features...")
        X_train, X_test, y_train, y_test, feature_names = (
            self.feature_engineer.prepare_data_for_modeling(data)
        )
        self.feature_names = feature_names
        print(f"Training features: {len(feature_names)}")
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Train model - using XGBoost for better performance
        print("Training XGBoost model...")
        self.model = xgb.XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss',
        )
        self.model.fit(X_train, y_train)

        # Evaluate model
        print("Evaluating model...")
        y_pred = self.model.predict(X_test)
        # Column 1 of predict_proba is used as the positive-class score for AUC.
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_pred_proba)
        self.model_metrics = {
            'accuracy': accuracy,
            'auc': auc_score,
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
        }
        print(f"Accuracy: {accuracy:.3f}")
        print(f"AUC Score: {auc_score:.3f}")

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.model.feature_importances_,
        }).sort_values('importance', ascending=False)
        print("\nTop 10 Important Features:")
        print(feature_importance.head(10))

        # Save model and artifacts
        self.save_model(airlines_mapping, airports_mapping, feature_importance)
        return True

    def save_model(self, airlines_mapping, airports_mapping, feature_importance):
        """Save the trained model and related artifacts.

        Writes one pickle per artifact plus ``feature_importance.csv`` into
        ``self.model_dir``, creating the directory first if it is missing.

        Args:
            airlines_mapping: Picklable airline mapping from the preprocessor.
            airports_mapping: Picklable airport mapping from the preprocessor.
            feature_importance: DataFrame written with ``to_csv(index=False)``.

        Raises:
            OSError: If the directory or a file cannot be created/written.
        """
        model_dir = self.model_dir
        # Without this, a fresh checkout fails with FileNotFoundError only
        # after the (expensive) training has already completed.
        os.makedirs(model_dir, exist_ok=True)

        artifacts = {
            'model': self.model,
            'feature_engineer': self.feature_engineer,
            'feature_names': self.feature_names,
            'airlines_mapping': airlines_mapping,
            'airports_mapping': airports_mapping,
            'metrics': self.model_metrics,
        }
        for key, filename in self._ARTIFACT_FILES.items():
            with open(os.path.join(model_dir, filename), 'wb') as f:
                pickle.dump(artifacts[key], f)

        # Save feature importance
        feature_importance.to_csv(
            os.path.join(model_dir, 'feature_importance.csv'), index=False
        )
        print(f"Model and artifacts saved to {model_dir}/")

    def load_model(self):
        """Load the trained model and its artifacts from ``self.model_dir``.

        All files are read first and the instance is updated only when every
        one of them loaded, so a failed load never leaves a half-initialised
        object behind.

        Returns:
            ``(True, airlines_mapping, airports_mapping)`` on success,
            ``(False, None, None)`` if any artifact is missing or unreadable.
        """
        model_dir = self.model_dir
        loaded = {}
        try:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file. Only point model_dir at artifacts produced by save_model
            # from a trusted source.
            for key, filename in self._ARTIFACT_FILES.items():
                with open(os.path.join(model_dir, filename), 'rb') as f:
                    loaded[key] = pickle.load(f)
        except Exception as e:
            # Deliberate best-effort boundary: callers rely on the False
            # return value rather than on an exception.
            print(f"Error loading model: {e}")
            return False, None, None

        self.model = loaded['model']
        self.feature_engineer = loaded['feature_engineer']
        self.feature_names = loaded['feature_names']
        self.model_metrics = loaded['metrics']
        print("Model loaded successfully")
        return True, loaded['airlines_mapping'], loaded['airports_mapping']

    def predict_delay(self, input_data):
        """Predict flight delay probability for a single flight.

        Args:
            input_data: Mapping of raw input fields for one flight; it is
                wrapped into a one-row DataFrame before feature engineering.

        Returns:
            A dict with ``delay_probability`` (plain ``float``),
            ``prediction`` (``1`` when the probability is above 0.5, else
            ``0``) and ``prediction_label`` (``'Delayed'`` / ``'On Time'``),
            or ``None`` when no model is in memory and loading one failed.
        """
        if self.model is None:
            success, _, _ = self.load_model()
            if not success:
                return None

        # Prepare input data
        input_df = pd.DataFrame([input_data])

        # Apply same preprocessing
        input_features = self.feature_engineer.create_features(input_df)
        input_encoded = self.feature_engineer.encode_categorical_features(input_features)
        X = self.feature_engineer.select_features(input_encoded)

        # Align to the training columns: features absent from this request
        # become 0, extra columns are dropped, and the order matches training.
        # reindex returns a new frame instead of mutating select_features' output.
        X = X.reindex(columns=self.feature_names, fill_value=0).fillna(0)

        # Scale features
        X_scaled = self.feature_engineer.scaler.transform(X)

        # Predict; convert to a built-in float so callers get a plain Python
        # number rather than a library scalar type.
        delay_probability = float(self.model.predict_proba(X_scaled)[0, 1])
        prediction = 1 if delay_probability > 0.5 else 0
        return {
            'delay_probability': delay_probability,
            'prediction': prediction,
            'prediction_label': 'Delayed' if prediction == 1 else 'On Time',
        }
if __name__ == "__main__":
    # Script entry point: run one full training pass and report the outcome.
    trained_ok = FlightDelayModel().train_model()
    if not trained_ok:
        print("\nModel training failed!")
    else:
        print("\nModel training completed successfully!")