# Flight-delay-Prediction / utils /preprocess.py
# Zayeemk's picture
# Rename utilis/preprocess.py to utils/preprocess.py
# bfdae47 verified
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
class FlightDataPreprocessor:
    """Load, clean and combine the flight, airline, airport and weather CSVs.

    The usual entry point is :meth:`get_processed_data`, which runs every
    step in order. The individual steps are ordinary methods so they can
    also be called one at a time.
    """

    def __init__(self):
        # Raw tables; populated by load_data().
        self.airlines_df = None
        self.airports_df = None
        self.flights_df = None
        self.weather_df = None
        # IATA code -> full name dicts; populated by clean_airlines_data()
        # and clean_airports_data(). None means "not built yet".
        self.airlines_mapping = None
        self.airports_mapping = None
        self.label_encoders = {}

    @staticmethod
    def _append_optional_csv(base_df, path):
        """Return ``base_df`` with the rows of the optional CSV at ``path`` appended.

        The load is best-effort: if the file is missing, or cannot be read or
        concatenated, ``base_df`` is returned unchanged. A missing file is
        silent (it is the expected case); any other failure is printed so a
        corrupt file does not go unnoticed.
        """
        try:
            extra_df = pd.read_csv(path)
            return pd.concat([base_df, extra_df], ignore_index=True)
        except FileNotFoundError:
            return base_df
        except Exception as e:
            print(f"Skipping optional data file {path}: {e}")
            return base_df

    @staticmethod
    def _build_code_mapping(table, name_column):
        """Build an ``IATA_CODE -> name_column`` dict from ``table``.

        Rows where either the code or the name is missing are dropped. When a
        code occurs more than once, the last occurrence wins.
        """
        pairs = table[['IATA_CODE', name_column]].dropna()
        return dict(zip(pairs['IATA_CODE'], pairs[name_column]))

    @staticmethod
    def _most_common_category(categories):
        """Return the most frequent value of ``categories``, or 'clear' if there is none."""
        modes = categories.mode()
        return modes.iloc[0] if len(modes) > 0 else 'clear'

    def load_data(self):
        """Load all datasets from the ``data/`` directory.

        ``airlines.csv``, ``airports.csv``, ``flights.csv`` and ``weather.csv``
        are required. ``indian_airports.csv`` and ``indian_airlines.csv`` are
        optional and, when readable, are appended to the airports and
        airlines tables respectively.

        Returns:
            True if the required files were loaded; False otherwise (the
            error is printed).
        """
        try:
            self.airlines_df = pd.read_csv('data/airlines.csv')
            self.airports_df = pd.read_csv('data/airports.csv')
            self.flights_df = pd.read_csv('data/flights.csv', low_memory=False)
            self.weather_df = pd.read_csv('data/weather.csv')
        except Exception as e:
            print(f"Error loading data: {e}")
            return False

        # Load Indian data if available
        self.airports_df = self._append_optional_csv(
            self.airports_df, 'data/indian_airports.csv')
        self.airlines_df = self._append_optional_csv(
            self.airlines_df, 'data/indian_airlines.csv')
        return True

    def clean_airlines_data(self):
        """Build, store and return the airline ``IATA_CODE -> AIRLINE`` mapping."""
        self.airlines_mapping = self._build_code_mapping(self.airlines_df, 'AIRLINE')
        return self.airlines_mapping

    def clean_airports_data(self):
        """Build, store and return the airport ``IATA_CODE -> AIRPORT`` mapping."""
        self.airports_mapping = self._build_code_mapping(self.airports_df, 'AIRPORT')
        return self.airports_mapping

    def preprocess_flights_data(self):
        """Preprocess the flights dataset.

        Keeps the relevant columns, drops cancelled flights and adds:

        * ``IS_DELAYED``: 1 when ``ARRIVAL_DELAY`` > 15, else 0
        * ``DEPARTURE_HOUR``: ``SCHEDULED_DEPARTURE // 100``, with 24 mapped to 0
        * ``SEASON``: derived from ``MONTH``
        * ``AIRLINE_NAME``: only if the airline mapping has been built
        * ``ORIGIN_AIRPORT_NAME`` / ``DESTINATION_AIRPORT_NAME``: only if the
          airport mapping has been built

        Codes with no entry in a mapping fall back to the code itself.

        Returns:
            A new DataFrame; ``self.flights_df`` is not modified.
        """
        # Select relevant columns
        relevant_cols = ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
                         'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
                         'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'SCHEDULED_ARRIVAL',
                         'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DISTANCE', 'CANCELLED']
        df = self.flights_df[relevant_cols].copy()

        # Remove cancelled flights. The explicit copy makes the result an
        # independent frame so the column assignments below are unambiguous.
        df = df[df['CANCELLED'] == 0].copy()

        # Create target variable (delayed if arrival delay > 15 minutes).
        # A missing ARRIVAL_DELAY compares False, so such rows are labelled 0.
        df['IS_DELAYED'] = (df['ARRIVAL_DELAY'] > 15).astype(int)

        # Fill missing values
        df['DEPARTURE_DELAY'] = df['DEPARTURE_DELAY'].fillna(0)
        df['ARRIVAL_DELAY'] = df['ARRIVAL_DELAY'].fillna(0)

        # Convert scheduled departure to hour
        df['DEPARTURE_HOUR'] = df['SCHEDULED_DEPARTURE'] // 100
        df['DEPARTURE_HOUR'] = df['DEPARTURE_HOUR'].replace(24, 0)  # Handle midnight

        # Create season
        df['SEASON'] = df['MONTH'].apply(self._get_season)

        # Map airline codes to full names
        if self.airlines_mapping is not None:
            df['AIRLINE_NAME'] = df['AIRLINE'].map(self.airlines_mapping).fillna(df['AIRLINE'])

        # Map airport codes to full names
        if self.airports_mapping is not None:
            df['ORIGIN_AIRPORT_NAME'] = (
                df['ORIGIN_AIRPORT'].map(self.airports_mapping).fillna(df['ORIGIN_AIRPORT']))
            df['DESTINATION_AIRPORT_NAME'] = (
                df['DESTINATION_AIRPORT'].map(self.airports_mapping)
                .fillna(df['DESTINATION_AIRPORT']))
        return df

    def preprocess_weather_data(self):
        """Preprocess the weather dataset.

        Parses ``datetime``, converts ``temperature`` from Kelvin to Celsius,
        maps ``weather_description`` to a coarse category and aggregates per
        (``city``, ``DATE``, ``HOUR``): numeric columns are averaged and the
        most frequent category is kept.

        Returns:
            The aggregated DataFrame; ``self.weather_df`` is not modified.
        """
        df = self.weather_df.copy()

        # Convert datetime
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['DATE'] = df['datetime'].dt.date
        df['HOUR'] = df['datetime'].dt.hour

        # Convert temperature from Kelvin to Celsius
        df['TEMP_C'] = df['temperature'] - 273.15

        # Create weather categories
        df['WEATHER_CATEGORY'] = df['weather_description'].apply(self._categorize_weather)

        # Aggregate weather by city and date-hour
        weather_agg = df.groupby(['city', 'DATE', 'HOUR']).agg({
            'TEMP_C': 'mean',
            'humidity': 'mean',
            'pressure': 'mean',
            'wind_speed': 'mean',
            'WEATHER_CATEGORY': self._most_common_category,
        }).reset_index()
        return weather_agg

    def merge_flight_weather(self, flights_df, weather_df):
        """Return a copy of ``flights_df`` with a ``DATE`` column and weather features.

        NOTE: this is a placeholder. ``weather_df`` is currently unused; the
        weather columns (``TEMP_C``, ``HUMIDITY``, ``WIND_SPEED``,
        ``WEATHER_CATEGORY``) are filled with seeded random values for
        demonstration. A real implementation needs an airport-to-city
        mapping to join the observed weather.

        Args:
            flights_df: Flights table with ``YEAR``, ``MONTH`` and ``DAY``
                columns. It is not modified.
            weather_df: Aggregated weather table (ignored for now).

        Returns:
            A new DataFrame with the additional columns.
        """
        # Copy first so the caller's DataFrame is left untouched.
        merged_df = flights_df.copy()

        # Create date column for flights
        merged_df['DATE'] = pd.to_datetime(merged_df[['YEAR', 'MONTH', 'DAY']]).dt.date

        # Add sample weather data (in real implementation, you'd match by nearest city)
        # NOTE: this reseeds NumPy's global random state, so the generated
        # columns are reproducible from run to run.
        n_rows = len(merged_df)
        np.random.seed(42)
        merged_df['TEMP_C'] = np.random.normal(15, 10, n_rows)
        merged_df['HUMIDITY'] = np.random.normal(60, 20, n_rows)
        merged_df['WIND_SPEED'] = np.random.normal(10, 5, n_rows)
        merged_df['WEATHER_CATEGORY'] = np.random.choice(
            ['clear', 'clouds', 'rain', 'snow'], n_rows)
        return merged_df

    def _get_season(self, month):
        """Return the season name for a month number (anything outside 1-8 and 12 is 'Fall')."""
        if month in (12, 1, 2):
            return 'Winter'
        if month in (3, 4, 5):
            return 'Spring'
        if month in (6, 7, 8):
            return 'Summer'
        return 'Fall'

    def _categorize_weather(self, description):
        """Map a free-text weather description to a coarse category.

        Returns one of 'clear', 'clouds', 'storm', 'rain', 'snow' or 'other'.
        The input is stringified and lower-cased first, so non-string values
        (e.g. NaN) end up as 'other'.
        """
        description = str(description).lower()
        if 'clear' in description or 'sunny' in description:
            return 'clear'
        if 'cloud' in description:
            return 'clouds'
        # Storms are tested before rain/snow so that descriptions such as
        # "thunderstorm with light rain" are not reported as plain rain.
        if 'storm' in description or 'thunder' in description:
            return 'storm'
        if 'rain' in description or 'drizzle' in description:
            return 'rain'
        if 'snow' in description:
            return 'snow'
        return 'other'

    def get_processed_data(self):
        """Run the full pipeline and return data ready for modeling.

        Returns:
            A 4-tuple ``(final_data, airlines_mapping, airports_mapping,
            weather_processed)``, or ``(None, None, None, None)`` when the
            required data files could not be loaded.
        """
        if not self.load_data():
            return None, None, None, None

        # Clean mappings
        self.clean_airlines_data()
        self.clean_airports_data()

        # Preprocess datasets
        flights_processed = self.preprocess_flights_data()
        weather_processed = self.preprocess_weather_data()

        # Merge datasets
        final_data = self.merge_flight_weather(flights_processed, weather_processed)
        return final_data, self.airlines_mapping, self.airports_mapping, weather_processed