import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


class FeatureEngineer:
    """Build, encode, select and scale model features from a flight DataFrame.

    The instance is stateful: it keeps one fitted ``LabelEncoder`` per
    categorical column (``label_encoders``), the ``StandardScaler`` fitted by
    ``prepare_data_for_modeling`` (``scaler``) and the list of columns chosen
    by the most recent ``select_features`` call (``feature_columns``).
    """

    # Weather categories flagged as "bad" by the BAD_WEATHER indicator.
    _BAD_WEATHER = ['rain', 'snow', 'storm']

    # Ordinal severity per weather category; unmapped categories get 1.
    _WEATHER_SEVERITY = {
        'clear': 0,
        'clouds': 1,
        'rain': 2,
        'snow': 3,
        'storm': 4,
        'other': 1,
    }

    # Columns that together identify a route for ROUTE_FREQUENCY.
    _ROUTE_KEYS = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']

    # Columns that are label-encoded when present in the frame.
    _CATEGORICAL_COLUMNS = [
        'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SEASON',
        'WEATHER_CATEGORY', 'DISTANCE_CATEGORY', 'TEMP_CATEGORY',
        'WIND_CATEGORY',
    ]

    _NUMERIC_FEATURES = [
        'MONTH', 'DAY_OF_WEEK', 'DEPARTURE_HOUR', 'DISTANCE', 'TEMP_C',
        'HUMIDITY', 'WIND_SPEED', 'ROUTE_FREQUENCY', 'ORIGIN_BUSYNESS',
        'DESTINATION_BUSYNESS', 'WEATHER_SEVERITY',
    ]

    _BINARY_FEATURES = [
        'IS_WEEKEND', 'IS_MORNING_RUSH', 'IS_EVENING_RUSH', 'IS_NIGHT',
        'BAD_WEATHER',
    ]

    _TARGET_COLUMN = 'IS_DELAYED'

    def __init__(self):
        """Create an engineer with an unfitted scaler and no encoders."""
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.feature_columns = []

    def create_features(self, df):
        """Return a copy of *df* with the engineered feature columns added.

        Reads ``DAY_OF_WEEK``, ``DEPARTURE_HOUR``, ``WEATHER_CATEGORY``,
        ``DISTANCE``, ``AIRLINE``, ``ORIGIN_AIRPORT``,
        ``DESTINATION_AIRPORT``, ``TEMP_C`` and ``WIND_SPEED``; a missing
        column raises ``KeyError``. The input frame is not modified.
        """
        df_features = df.copy()
        hour = df_features['DEPARTURE_HOUR']

        # Time-based indicators.
        # NOTE(review): ``>= 6`` presumably assumes DAY_OF_WEEK runs 1-7 with
        # Monday == 1 -- confirm against the data source.
        df_features['IS_WEEKEND'] = (df_features['DAY_OF_WEEK'] >= 6).astype(int)
        df_features['IS_MORNING_RUSH'] = hour.between(6, 9).astype(int)
        df_features['IS_EVENING_RUSH'] = hour.between(17, 20).astype(int)
        df_features['IS_NIGHT'] = ((hour >= 22) | (hour <= 5)).astype(int)

        # Weather indicator (missing / unknown categories count as 0).
        df_features['BAD_WEATHER'] = (
            df_features['WEATHER_CATEGORY'].isin(self._BAD_WEATHER).astype(int)
        )

        # Distance categories. ``include_lowest`` keeps a value of exactly 0
        # in the first bin instead of turning it into NaN.
        df_features['DISTANCE_CATEGORY'] = pd.cut(
            df_features['DISTANCE'],
            bins=[0, 500, 1500, 3000, np.inf],
            labels=['Short', 'Medium', 'Long', 'Very Long'],
            include_lowest=True,
        )

        # Route frequency: number of rows sharing airline + origin + destination.
        route_counts = df_features.groupby(self._ROUTE_KEYS).size()
        df_features['ROUTE_FREQUENCY'] = (
            df_features.set_index(self._ROUTE_KEYS).index.map(route_counts)
        )
        # Rows whose key was not counted (e.g. a missing key part) default to 1.
        df_features['ROUTE_FREQUENCY'] = df_features['ROUTE_FREQUENCY'].fillna(1)

        # Airport busyness: number of rows per origin / destination airport.
        origin_counts = df_features['ORIGIN_AIRPORT'].value_counts()
        dest_counts = df_features['DESTINATION_AIRPORT'].value_counts()
        df_features['ORIGIN_BUSYNESS'] = df_features['ORIGIN_AIRPORT'].map(origin_counts)
        df_features['DESTINATION_BUSYNESS'] = (
            df_features['DESTINATION_AIRPORT'].map(dest_counts)
        )

        # Weather severity score; categories outside the table get 1.
        df_features['WEATHER_SEVERITY'] = (
            df_features['WEATHER_CATEGORY'].map(self._WEATHER_SEVERITY).fillna(1)
        )

        # Temperature categories.
        df_features['TEMP_CATEGORY'] = pd.cut(
            df_features['TEMP_C'],
            bins=[-np.inf, 0, 15, 25, np.inf],
            labels=['Freezing', 'Cold', 'Mild', 'Hot'],
        )

        # Wind speed categories. ``include_lowest`` puts a wind speed of
        # exactly 0 into 'Calm' rather than NaN.
        df_features['WIND_CATEGORY'] = pd.cut(
            df_features['WIND_SPEED'],
            bins=[0, 5, 15, 25, np.inf],
            labels=['Calm', 'Moderate', 'Strong', 'Severe'],
            include_lowest=True,
        )

        return df_features

    def encode_categorical_features(self, df):
        """Return a copy of *df* with ``<col>_ENCODED`` integer columns added.

        Each known categorical column present in *df* is cast to ``str`` and
        label-encoded. The first time a column is seen its encoder is fitted
        and stored in ``self.label_encoders``; on later calls the stored
        encoder is reused and any label it has not seen is replaced by the
        encoder's first known class before transforming.
        """
        df_encoded = df.copy()

        for col in self._CATEGORICAL_COLUMNS:
            if col not in df_encoded.columns:
                continue

            values = df_encoded[col].astype(str)
            encoder = self.label_encoders.get(col)

            if encoder is None:
                encoder = LabelEncoder()
                df_encoded[col + '_ENCODED'] = encoder.fit_transform(values)
                # Register only after a successful fit so a failure cannot
                # leave an unfitted encoder behind for later calls.
                self.label_encoders[col] = encoder
                continue

            unseen = ~values.isin(encoder.classes_)
            if unseen.any():
                # Map unseen labels to the first known class.
                values = values.mask(unseen, encoder.classes_[0])
            df_encoded[col + '_ENCODED'] = encoder.transform(values)

        return df_encoded

    def select_features(self, df):
        """Return the modeling columns of *df* and record their names.

        The selection is, in order: the numeric features, the binary
        indicator features, then every column whose name ends with
        ``_ENCODED``. Columns absent from *df* are skipped. The chosen names
        are stored in ``self.feature_columns``.
        """
        encoded_categorical = [
            col for col in df.columns
            if isinstance(col, str) and col.endswith('_ENCODED')
        ]
        candidates = self._NUMERIC_FEATURES + self._BINARY_FEATURES + encoded_categorical
        all_features = [col for col in candidates if col in df.columns]

        self.feature_columns = all_features
        return df[all_features]

    def _build_feature_matrix(self, df):
        """Run create -> encode -> select and return ``(X, y)``."""
        df_encoded = self.encode_categorical_features(self.create_features(df))
        X = self.select_features(df_encoded)
        y = df_encoded[self._TARGET_COLUMN]
        return X, y

    def prepare_data_for_modeling(self, df):
        """Build features, split 80/20 (stratified on the target) and scale.

        Returns:
            ``(X_train_scaled, X_test_scaled, y_train, y_test, feature_names)``
            where the two ``X`` values are the scaler's outputs, the two ``y``
            values are slices of the ``IS_DELAYED`` column and
            ``feature_names`` is the list of selected column names.

        Raises:
            KeyError: if a required input column or ``IS_DELAYED`` is missing.

        NOTE: route/airport counts and label encoders are still computed on
        the full frame before the split.
        """
        X, y = self._build_feature_matrix(df)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Impute after splitting, using training statistics only, so no
        # information from the test partition leaks into training.
        train_means = X_train.mean()
        # The trailing fillna(0) covers columns that are entirely missing in
        # the training partition (their mean is NaN).
        X_train = X_train.fillna(train_means).fillna(0)
        X_test = X_test.fillna(train_means).fillna(0)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test, X.columns.tolist()

    def get_feature_importance_data(self, df):
        """Return the unscaled, unsplit feature frame and ``IS_DELAYED`` target."""
        return self._build_feature_matrix(df)