# Page header captured together with this file (not part of the module):
# Spaces: Sleeping / Sleeping; File size: 7,285 Bytes; a13b550
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
class FeatureEngineer:
    """Derive, encode, select and scale model features from a flight table.

    The pipeline is stateful: label encoders are fitted the first time a
    categorical column is seen and reused afterwards, ``feature_columns``
    is overwritten by every ``select_features`` call, and the scaler is
    fitted inside ``prepare_data_for_modeling``.

    Note that the frequency features (route frequency, airport busyness)
    and the label encoders are computed from whatever frame is passed in,
    i.e. from the full dataset when called via
    ``prepare_data_for_modeling``.
    """

    # Weather categories that set the BAD_WEATHER indicator.
    BAD_WEATHER_CATEGORIES = ['rain', 'snow', 'storm']
    # Ordinal severity per weather category; unknown categories get the
    # default below.
    WEATHER_SEVERITY_SCORES = {
        'clear': 0, 'clouds': 1, 'rain': 2, 'snow': 3, 'storm': 4, 'other': 1,
    }
    DEFAULT_WEATHER_SEVERITY = 1
    # Columns that together identify a route for ROUTE_FREQUENCY.
    ROUTE_KEYS = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
    # Columns that are label-encoded when present in the frame.
    CATEGORICAL_COLUMNS = [
        'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SEASON', 'WEATHER_CATEGORY', 'DISTANCE_CATEGORY',
        'TEMP_CATEGORY', 'WIND_CATEGORY',
    ]
    NUMERIC_FEATURES = [
        'MONTH', 'DAY_OF_WEEK', 'DEPARTURE_HOUR', 'DISTANCE',
        'TEMP_C', 'HUMIDITY', 'WIND_SPEED', 'ROUTE_FREQUENCY',
        'ORIGIN_BUSYNESS', 'DESTINATION_BUSYNESS', 'WEATHER_SEVERITY',
    ]
    BINARY_FEATURES = [
        'IS_WEEKEND', 'IS_MORNING_RUSH', 'IS_EVENING_RUSH',
        'IS_NIGHT', 'BAD_WEATHER',
    ]
    TARGET_COLUMN = 'IS_DELAYED'

    def __init__(self):
        self.scaler = StandardScaler()
        # Column name -> fitted LabelEncoder.
        self.label_encoders = {}
        # Names of the columns returned by the last select_features call.
        self.feature_columns = []

    def create_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the engineered feature columns added.

        Args:
            df: Frame containing DAY_OF_WEEK, DEPARTURE_HOUR,
                WEATHER_CATEGORY, DISTANCE, AIRLINE, ORIGIN_AIRPORT,
                DESTINATION_AIRPORT, TEMP_C and WIND_SPEED.

        Returns:
            A new frame; the input is not modified.

        Raises:
            KeyError: If any of the required columns is missing.
        """
        df_features = df.copy()

        # Time-based indicators (all hour ranges are inclusive).
        hour = df_features['DEPARTURE_HOUR']
        df_features['IS_WEEKEND'] = (df_features['DAY_OF_WEEK'] >= 6).astype(int)
        df_features['IS_MORNING_RUSH'] = hour.between(6, 9).astype(int)
        df_features['IS_EVENING_RUSH'] = hour.between(17, 20).astype(int)
        df_features['IS_NIGHT'] = ((hour >= 22) | (hour <= 5)).astype(int)

        # Weather indicator and ordinal severity.
        weather = df_features['WEATHER_CATEGORY']
        df_features['BAD_WEATHER'] = weather.isin(
            self.BAD_WEATHER_CATEGORIES
        ).astype(int)
        df_features['WEATHER_SEVERITY'] = weather.map(
            self.WEATHER_SEVERITY_SCORES
        ).fillna(self.DEFAULT_WEATHER_SEVERITY)

        # include_lowest=True keeps a value of exactly 0 in the first bin;
        # with the default right-closed bins it would become NaN.
        df_features['DISTANCE_CATEGORY'] = pd.cut(
            df_features['DISTANCE'],
            bins=[0, 500, 1500, 3000, np.inf],
            labels=['Short', 'Medium', 'Long', 'Very Long'],
            include_lowest=True,
        )

        # Number of rows sharing the same airline/origin/destination.
        # Rows with a missing key fall outside every group and default to 1.
        route_sizes = df_features.groupby(self.ROUTE_KEYS)[
            self.ROUTE_KEYS[0]
        ].transform('size')
        df_features['ROUTE_FREQUENCY'] = route_sizes.fillna(1)

        # Airport busyness: how often each airport appears in this frame.
        origin_counts = df_features['ORIGIN_AIRPORT'].value_counts()
        dest_counts = df_features['DESTINATION_AIRPORT'].value_counts()
        df_features['ORIGIN_BUSYNESS'] = df_features['ORIGIN_AIRPORT'].map(
            origin_counts
        )
        df_features['DESTINATION_BUSYNESS'] = df_features[
            'DESTINATION_AIRPORT'
        ].map(dest_counts)

        df_features['TEMP_CATEGORY'] = pd.cut(
            df_features['TEMP_C'],
            bins=[-np.inf, 0, 15, 25, np.inf],
            labels=['Freezing', 'Cold', 'Mild', 'Hot'],
        )

        # A wind speed of exactly 0 must count as 'Calm', hence
        # include_lowest=True.
        df_features['WIND_CATEGORY'] = pd.cut(
            df_features['WIND_SPEED'],
            bins=[0, 5, 15, 25, np.inf],
            labels=['Calm', 'Moderate', 'Strong', 'Severe'],
            include_lowest=True,
        )

        return df_features

    def encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``<COL>_ENCODED`` integer columns.

        Each known categorical column present in ``df`` is cast to ``str``
        and label-encoded. The first time a column is seen its encoder is
        fitted and cached in ``self.label_encoders``; on later calls the
        cached encoder is reused and values it has never seen are mapped
        to the encoder's first known class.

        Args:
            df: Frame holding any subset of the categorical columns.

        Returns:
            A new frame; the input is not modified.
        """
        df_encoded = df.copy()

        for col in self.CATEGORICAL_COLUMNS:
            if col not in df_encoded.columns:
                continue

            values = df_encoded[col].astype(str)
            encoder = self.label_encoders.get(col)

            if encoder is None:
                encoder = LabelEncoder()
                df_encoded[col + '_ENCODED'] = encoder.fit_transform(values)
                # Cache only after a successful fit so a failed fit never
                # leaves an unfitted encoder behind for the next call.
                self.label_encoders[col] = encoder
            else:
                # Replace labels unseen at fit time with the first known
                # class so transform() does not raise on them.
                fallback = encoder.classes_[0]
                values = values.where(values.isin(encoder.classes_), fallback)
                df_encoded[col + '_ENCODED'] = encoder.transform(values)

        return df_encoded

    def select_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the modeling columns of ``df`` and record their names.

        The selection is, in order: the numeric features, the binary
        indicators, then every column whose name ends in ``_ENCODED``.
        Columns absent from ``df`` are skipped. The resulting name list is
        stored in ``self.feature_columns``.

        Args:
            df: Frame produced by the feature-creation/encoding steps.

        Returns:
            ``df`` restricted to the selected columns.
        """
        encoded_categorical = [
            col for col in df.columns if col.endswith('_ENCODED')
        ]
        candidates = (
            self.NUMERIC_FEATURES + self.BINARY_FEATURES + encoded_categorical
        )
        selected = [col for col in candidates if col in df.columns]

        self.feature_columns = selected
        return df[selected]

    def _build_model_frame(self, df: pd.DataFrame):
        """Run create -> encode -> select; return ``(X, encoded_frame)``."""
        df_encoded = self.encode_categorical_features(self.create_features(df))
        return self.select_features(df_encoded), df_encoded

    def prepare_data_for_modeling(self, df: pd.DataFrame):
        """Build features, split 80/20 (stratified) and standard-scale.

        Missing feature values are imputed with column means computed on
        the training partition only, so the test partition does not leak
        into the training data. The scaler is likewise fitted on the
        training partition and then applied to both partitions.

        Args:
            df: Raw frame including the columns required by
                ``create_features`` and the ``IS_DELAYED`` target.

        Returns:
            ``(X_train_scaled, X_test_scaled, y_train, y_test,
            feature_names)`` where the scaled matrices are arrays produced
            by the scaler and ``feature_names`` is a list of column names.

        Raises:
            KeyError: If a required column or the target is missing.
        """
        X, df_encoded = self._build_model_frame(df)
        y = df_encoded[self.TARGET_COLUMN]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Impute after the split using training statistics only. A column
        # that is entirely missing in training has no mean; fall back to 0
        # so no NaN reaches the scaler or a downstream model.
        train_means = X_train.mean().fillna(0)
        X_train = X_train.fillna(train_means)
        X_test = X_test.fillna(train_means)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test, X.columns.tolist()

    def get_feature_importance_data(self, df: pd.DataFrame):
        """Return the unscaled feature frame and the ``IS_DELAYED`` target.

        Args:
            df: Raw frame including the columns required by
                ``create_features`` and the ``IS_DELAYED`` target.

        Returns:
            ``(X, y)`` with ``X`` the selected feature columns (no
            imputation or scaling applied) and ``y`` the target column.
        """
        X, df_encoded = self._build_model_frame(df)
        return X, df_encoded[self.TARGET_COLUMN]
|