# Zayeemk's picture
# Rename utilis/features.py to utils/features.py
# 9d2b93f verified
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
class FeatureEngineer:
    """Create, encode, select and scale model features from a tabular dataset.

    The methods expect flight-style columns such as ``AIRLINE``,
    ``ORIGIN_AIRPORT``, ``DEPARTURE_HOUR`` and the target ``IS_DELAYED``.
    The instance is stateful: label encoders are fitted the first time a
    column is encoded and reused (not refitted) on later calls, and the
    scaler is fitted inside ``prepare_data_for_modeling``.
    """

    def __init__(self):
        # Fitted on the training split in prepare_data_for_modeling.
        self.scaler = StandardScaler()
        # Column name -> LabelEncoder fitted on the first frame that had it.
        self.label_encoders = {}
        # Column names returned by the most recent select_features call.
        self.feature_columns = []

    def create_features(self, df):
        """Return a copy of ``df`` with engineered feature columns added.

        Reads ``DAY_OF_WEEK``, ``DEPARTURE_HOUR``, ``WEATHER_CATEGORY``,
        ``DISTANCE``, ``AIRLINE``, ``ORIGIN_AIRPORT``, ``DESTINATION_AIRPORT``,
        ``TEMP_C`` and ``WIND_SPEED``; a missing column raises ``KeyError``.
        The input frame is not modified.
        """
        df_features = df.copy()

        # Time-based features
        # NOTE(review): assumes DAY_OF_WEEK is numbered so that values >= 6
        # are weekend days -- confirm against the dataset's encoding.
        df_features['IS_WEEKEND'] = (df_features['DAY_OF_WEEK'] >= 6).astype(int)
        hour = df_features['DEPARTURE_HOUR']
        df_features['IS_MORNING_RUSH'] = ((hour >= 6) & (hour <= 9)).astype(int)
        df_features['IS_EVENING_RUSH'] = ((hour >= 17) & (hour <= 20)).astype(int)
        df_features['IS_NIGHT'] = ((hour >= 22) | (hour <= 5)).astype(int)

        # Weather interaction features
        df_features['BAD_WEATHER'] = (
            df_features['WEATHER_CATEGORY'].isin(['rain', 'snow', 'storm']).astype(int)
        )

        # Distance categories. include_lowest=True keeps a value of exactly 0
        # in the first bin; pd.cut's default right-closed bins would
        # otherwise turn it into NaN.
        df_features['DISTANCE_CATEGORY'] = pd.cut(
            df_features['DISTANCE'],
            bins=[0, 500, 1500, 3000, np.inf],
            labels=['Short', 'Medium', 'Long', 'Very Long'],
            include_lowest=True,
        )

        # Airline popularity (route frequency): number of rows in this frame
        # sharing the same airline/origin/destination triple.
        route_keys = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
        route_counts = df_features.groupby(route_keys).size()
        df_features['ROUTE_FREQUENCY'] = (
            df_features.set_index(route_keys).index.map(route_counts)
        )
        # Rows whose key has no group count (e.g. a missing key part) get 1.
        df_features['ROUTE_FREQUENCY'] = df_features['ROUTE_FREQUENCY'].fillna(1)

        # Airport busyness: occurrences of each airport within this frame.
        origin_counts = df_features['ORIGIN_AIRPORT'].value_counts()
        dest_counts = df_features['DESTINATION_AIRPORT'].value_counts()
        df_features['ORIGIN_BUSYNESS'] = df_features['ORIGIN_AIRPORT'].map(origin_counts)
        df_features['DESTINATION_BUSYNESS'] = df_features['DESTINATION_AIRPORT'].map(dest_counts)

        # Weather severity score; categories not listed fall back to 1.
        df_features['WEATHER_SEVERITY'] = df_features['WEATHER_CATEGORY'].map({
            'clear': 0, 'clouds': 1, 'rain': 2, 'snow': 3, 'storm': 4, 'other': 1
        }).fillna(1)

        # Temperature categories
        df_features['TEMP_CATEGORY'] = pd.cut(
            df_features['TEMP_C'],
            bins=[-np.inf, 0, 15, 25, np.inf],
            labels=['Freezing', 'Cold', 'Mild', 'Hot'],
        )

        # Wind speed categories. include_lowest=True so that a wind speed of
        # exactly 0 is 'Calm' instead of NaN.
        df_features['WIND_CATEGORY'] = pd.cut(
            df_features['WIND_SPEED'],
            bins=[0, 5, 15, 25, np.inf],
            labels=['Calm', 'Moderate', 'Strong', 'Severe'],
            include_lowest=True,
        )

        return df_features

    def encode_categorical_features(self, df):
        """Return a copy of ``df`` with ``<col>_ENCODED`` label-encoded columns.

        Only the known categorical columns that are present in ``df`` are
        encoded. Values are converted with ``astype(str)`` first, so missing
        values are encoded as their string form. The first call for a column
        fits and stores its encoder; later calls reuse it and replace labels
        the encoder has never seen with its first known class.
        """
        df_encoded = df.copy()

        categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
                               'SEASON', 'WEATHER_CATEGORY', 'DISTANCE_CATEGORY',
                               'TEMP_CATEGORY', 'WIND_CATEGORY']
        existing_categorical = [col for col in categorical_columns
                                if col in df_encoded.columns]

        for col in existing_categorical:
            values = df_encoded[col].astype(str)
            encoder = self.label_encoders.get(col)

            if encoder is None:
                encoder = LabelEncoder()
                df_encoded[col + '_ENCODED'] = encoder.fit_transform(values)
                # Register only after a successful fit so a failure cannot
                # leave an unfitted encoder behind for the next call.
                self.label_encoders[col] = encoder
            else:
                # transform() raises on unseen labels, so map them to the
                # first known class before encoding.
                is_known = values.isin(encoder.classes_)
                values = values.where(is_known, encoder.classes_[0])
                df_encoded[col + '_ENCODED'] = encoder.transform(values)

        return df_encoded

    def select_features(self, df):
        """Return the modeling columns of ``df`` and remember their names.

        The selection is, in order: the numeric features, the binary flags,
        then every column whose name ends in ``_ENCODED``. Columns absent
        from ``df`` are skipped. The chosen names are stored in
        ``self.feature_columns``.
        """
        numeric_features = [
            'MONTH', 'DAY_OF_WEEK', 'DEPARTURE_HOUR', 'DISTANCE',
            'TEMP_C', 'HUMIDITY', 'WIND_SPEED', 'ROUTE_FREQUENCY',
            'ORIGIN_BUSYNESS', 'DESTINATION_BUSYNESS', 'WEATHER_SEVERITY'
        ]
        binary_features = [
            'IS_WEEKEND', 'IS_MORNING_RUSH', 'IS_EVENING_RUSH',
            'IS_NIGHT', 'BAD_WEATHER'
        ]
        encoded_categorical = [col for col in df.columns if col.endswith('_ENCODED')]

        candidates = numeric_features + binary_features + encoded_categorical
        all_features = [col for col in candidates if col in df.columns]

        self.feature_columns = all_features
        return df[all_features]

    def prepare_data_for_modeling(self, df):
        """Build features and return a scaled, stratified train/test split.

        Returns ``(X_train_scaled, X_test_scaled, y_train, y_test,
        feature_names)``. The target is the ``IS_DELAYED`` column; a missing
        column raises ``KeyError``. The split is 80/20 with
        ``random_state=42`` and stratified on the target.

        Missing feature values are filled with the *training* split's column
        means, and the scaler is fitted on the training split only, so no
        test-set statistics enter imputation or scaling. The count-based
        features and label encoders are still computed on the full frame
        before the split.
        """
        df_features = self.create_features(df)
        df_encoded = self.encode_categorical_features(df_features)

        X = self.select_features(df_encoded)
        y = df_encoded['IS_DELAYED']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Impute after splitting, using training statistics for both parts.
        train_means = X_train.mean()
        X_train = X_train.fillna(train_means)
        X_test = X_test.fillna(train_means)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test, X.columns.tolist()

    def get_feature_importance_data(self, df):
        """Return ``(X, y)`` for feature-importance analysis.

        ``X`` holds the selected features, unscaled and without imputation;
        ``y`` is the ``IS_DELAYED`` column.
        """
        df_features = self.create_features(df)
        df_encoded = self.encode_categorical_features(df_features)
        X = self.select_features(df_encoded)
        return X, df_encoded['IS_DELAYED']