Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
| from sklearn.model_selection import train_test_split | |
class FeatureEngineer:
    """Turn a raw flight table into scaled, model-ready features.

    The pipeline is: ``create_features`` -> ``encode_categorical_features``
    -> ``select_features``; ``prepare_data_for_modeling`` chains the three and
    adds a train/test split, mean imputation and standard scaling.

    Attributes:
        scaler: ``StandardScaler`` fitted on the training partition by
            ``prepare_data_for_modeling``.
        label_encoders: Mapping of column name -> fitted ``LabelEncoder``.
            Filled the first time a column is encoded and reused afterwards.
        feature_columns: Names of the columns picked by the most recent
            ``select_features`` call.
    """

    # Columns that identify a route (airline + origin + destination).
    _ROUTE_KEYS = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']

    # Weather categories flagged by the BAD_WEATHER indicator.
    _BAD_WEATHER = ['rain', 'snow', 'storm']

    # Ordinal severity per weather category; unmapped values fall back to 1.
    _WEATHER_SEVERITY = {
        'clear': 0, 'clouds': 1, 'rain': 2, 'snow': 3, 'storm': 4, 'other': 1,
    }

    # Columns that get a label-encoded "<name>_ENCODED" companion when present.
    _CATEGORICAL_COLUMNS = [
        'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SEASON', 'WEATHER_CATEGORY', 'DISTANCE_CATEGORY',
        'TEMP_CATEGORY', 'WIND_CATEGORY',
    ]

    _NUMERIC_FEATURES = [
        'MONTH', 'DAY_OF_WEEK', 'DEPARTURE_HOUR', 'DISTANCE',
        'TEMP_C', 'HUMIDITY', 'WIND_SPEED', 'ROUTE_FREQUENCY',
        'ORIGIN_BUSYNESS', 'DESTINATION_BUSYNESS', 'WEATHER_SEVERITY',
    ]

    _BINARY_FEATURES = [
        'IS_WEEKEND', 'IS_MORNING_RUSH', 'IS_EVENING_RUSH',
        'IS_NIGHT', 'BAD_WEATHER',
    ]

    def __init__(self):
        """Create an engineer with an unfitted scaler and no encoders."""
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.feature_columns = []

    def create_features(self, df):
        """Return a copy of ``df`` with the engineered columns added.

        Reads ``DAY_OF_WEEK``, ``DEPARTURE_HOUR``, ``WEATHER_CATEGORY``,
        ``DISTANCE``, ``AIRLINE``, ``ORIGIN_AIRPORT``, ``DESTINATION_AIRPORT``,
        ``TEMP_C`` and ``WIND_SPEED``; a missing column raises ``KeyError``.
        The input frame is not modified.

        Args:
            df: Raw flight DataFrame.

        Returns:
            A new DataFrame holding the original columns plus the time flags,
            weather indicators, binned categories and frequency counts.
        """
        df_features = df.copy()

        # Time-based flags.
        # NOTE(review): ">= 6" presumes DAY_OF_WEEK is 1=Mon..7=Sun - confirm.
        hour = df_features['DEPARTURE_HOUR']
        df_features['IS_WEEKEND'] = (df_features['DAY_OF_WEEK'] >= 6).astype(int)
        df_features['IS_MORNING_RUSH'] = hour.between(6, 9).astype(int)
        df_features['IS_EVENING_RUSH'] = hour.between(17, 20).astype(int)
        # The night window wraps around midnight, hence OR instead of AND.
        df_features['IS_NIGHT'] = ((hour >= 22) | (hour <= 5)).astype(int)

        # Weather indicators.
        weather = df_features['WEATHER_CATEGORY']
        df_features['BAD_WEATHER'] = weather.isin(self._BAD_WEATHER).astype(int)
        df_features['WEATHER_SEVERITY'] = weather.map(self._WEATHER_SEVERITY).fillna(1)

        # Binned categories. include_lowest=True keeps a value of exactly 0
        # (zero distance, completely calm wind) inside the first bin instead
        # of turning it into NaN.
        df_features['DISTANCE_CATEGORY'] = pd.cut(
            df_features['DISTANCE'],
            bins=[0, 500, 1500, 3000, np.inf],
            labels=['Short', 'Medium', 'Long', 'Very Long'],
            include_lowest=True,
        )
        df_features['TEMP_CATEGORY'] = pd.cut(
            df_features['TEMP_C'],
            bins=[-np.inf, 0, 15, 25, np.inf],
            labels=['Freezing', 'Cold', 'Mild', 'Hot'],
        )
        df_features['WIND_CATEGORY'] = pd.cut(
            df_features['WIND_SPEED'],
            bins=[0, 5, 15, 25, np.inf],
            labels=['Calm', 'Moderate', 'Strong', 'Severe'],
            include_lowest=True,
        )

        # Route frequency: number of rows sharing airline + origin + destination.
        route_counts = df_features.groupby(self._ROUTE_KEYS).size()
        route_index = df_features.set_index(self._ROUTE_KEYS).index
        df_features['ROUTE_FREQUENCY'] = route_index.map(route_counts)
        # Rows whose route key has no count (e.g. a missing key) default to 1.
        df_features['ROUTE_FREQUENCY'] = df_features['ROUTE_FREQUENCY'].fillna(1)

        # Airport busyness: number of rows per origin / destination airport.
        origin_counts = df_features['ORIGIN_AIRPORT'].value_counts()
        dest_counts = df_features['DESTINATION_AIRPORT'].value_counts()
        df_features['ORIGIN_BUSYNESS'] = df_features['ORIGIN_AIRPORT'].map(origin_counts)
        df_features['DESTINATION_BUSYNESS'] = df_features['DESTINATION_AIRPORT'].map(dest_counts)

        return df_features

    def encode_categorical_features(self, df):
        """Return a copy of ``df`` with ``<col>_ENCODED`` integer columns.

        Only the known categorical columns that exist in ``df`` are encoded.
        Values are converted with ``astype(str)`` first, so missing values are
        encoded as their string form. The first call for a column fits and
        stores a ``LabelEncoder``; later calls reuse it, and labels that
        encoder has never seen are replaced by its first known class so that
        ``transform`` does not raise.

        Args:
            df: DataFrame, normally the output of ``create_features``.

        Returns:
            A new DataFrame with one extra ``_ENCODED`` column per encoded
            categorical column.
        """
        df_encoded = df.copy()
        present = [col for col in self._CATEGORICAL_COLUMNS if col in df_encoded.columns]

        for col in present:
            values = df_encoded[col].astype(str)
            encoder = self.label_encoders.get(col)

            if encoder is None:
                encoder = LabelEncoder()
                encoded = encoder.fit_transform(values)
                # Store the encoder only after a successful fit so a failure
                # never leaves an unfitted encoder behind for the next call.
                self.label_encoders[col] = encoder
            else:
                known = encoder.classes_
                # Collapse unseen labels onto the first known class.
                values = values.where(values.isin(known), known[0])
                encoded = encoder.transform(values)

            df_encoded[col + '_ENCODED'] = encoded

        return df_encoded

    def select_features(self, df):
        """Return the modelling columns of ``df`` and remember their names.

        The selection is, in order: the known numeric features, the known
        binary flags, then every column whose name ends in ``_ENCODED``.
        Columns absent from ``df`` are skipped. The resulting name list is
        stored in ``self.feature_columns``.

        Args:
            df: DataFrame, normally the output of
                ``encode_categorical_features``.

        Returns:
            ``df`` restricted to the selected feature columns.
        """
        encoded_categorical = [col for col in df.columns if col.endswith('_ENCODED')]
        candidates = self._NUMERIC_FEATURES + self._BINARY_FEATURES + encoded_categorical

        selected = [col for col in candidates if col in df.columns]
        self.feature_columns = selected
        return df[selected]

    def prepare_data_for_modeling(self, df):
        """Build features, split 80/20, impute and scale.

        The split is stratified on ``IS_DELAYED`` with ``random_state=42``.
        Missing values are filled with the column means of the *training*
        partition, and the scaler is fitted on the training partition only,
        so no statistic used for imputation or scaling is computed from test
        rows.

        NOTE: the count-based features and the label encoders are still
        computed on the full frame before the split.

        Args:
            df: Raw flight DataFrame containing an ``IS_DELAYED`` column.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test,
            feature_names)`` where the two ``X`` items are the arrays
            produced by the scaler and ``feature_names`` is a list of column
            names.

        Raises:
            KeyError: If ``IS_DELAYED`` or a column needed by
                ``create_features`` is missing.
        """
        df_features = self.create_features(df)
        df_encoded = self.encode_categorical_features(df_features)

        X = self.select_features(df_encoded)
        y = df_encoded['IS_DELAYED']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Impute after the split, using training statistics for both parts,
        # so the test partition does not influence the fill values.
        train_means = X_train.mean()
        X_train = X_train.fillna(train_means)
        X_test = X_test.fillna(train_means)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test, X.columns.tolist()

    def get_feature_importance_data(self, df):
        """Return the unscaled feature frame and the ``IS_DELAYED`` target.

        Runs feature creation, categorical encoding and feature selection
        without splitting, imputing or scaling.

        Args:
            df: Raw flight DataFrame containing an ``IS_DELAYED`` column.

        Returns:
            Tuple ``(X, y)`` of the selected feature DataFrame and the
            ``IS_DELAYED`` Series.
        """
        df_features = self.create_features(df)
        df_encoded = self.encode_categorical_features(df_features)
        X = self.select_features(df_encoded)
        return X, df_encoded['IS_DELAYED']