from datetime import datetime
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')


class FlightDataPreprocessor:
    """Load, clean and combine flight, airline, airport and weather CSV data.

    Typical use is a single call to :meth:`get_processed_data`, which runs the
    whole pipeline and returns the modelling frame plus the lookup mappings.
    """

    # Columns of the raw flights file that the pipeline keeps.
    _FLIGHT_COLUMNS = [
        'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
        'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
        'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'SCHEDULED_ARRIVAL',
        'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DISTANCE', 'CANCELLED',
    ]

    def __init__(self):
        # Raw frames; populated by load_data().
        self.airlines_df = None
        self.airports_df = None
        self.flights_df = None
        self.weather_df = None
        self.label_encoders = {}
        # NOTE: airlines_mapping / airports_mapping are created by the
        # clean_* methods; preprocess_flights_data() treats their absence
        # as "no name mapping available".

    @staticmethod
    def _append_optional_csv(base_df, path):
        """Return ``base_df`` with the rows of the CSV at ``path`` appended.

        The extra file is optional: if it is missing, ``base_df`` is returned
        unchanged without any message. If it exists but cannot be read or
        parsed, a notice is printed and ``base_df`` is returned unchanged.
        """
        try:
            extra = pd.read_csv(path)
            return pd.concat([base_df, extra], ignore_index=True)
        except FileNotFoundError:
            # Optional dataset not shipped; nothing to add.
            return base_df
        except (OSError, ValueError) as exc:
            # pandas parser / empty-data / decoding errors derive from
            # ValueError; keep the load best-effort but make it visible.
            print(f"Skipping optional dataset {path}: {exc}")
            return base_df

    def load_data(self):
        """Load all datasets from the ``data/`` directory.

        Returns:
            bool: True when the four required CSV files were read, False
            otherwise (the error is printed, not raised).
        """
        try:
            self.airlines_df = pd.read_csv('data/airlines.csv')
            self.airports_df = pd.read_csv('data/airports.csv')
            self.flights_df = pd.read_csv('data/flights.csv', low_memory=False)
            self.weather_df = pd.read_csv('data/weather.csv')

            # Load Indian data if available
            self.airports_df = self._append_optional_csv(
                self.airports_df, 'data/indian_airports.csv')
            self.airlines_df = self._append_optional_csv(
                self.airlines_df, 'data/indian_airlines.csv')
            return True
        except Exception as e:
            print(f"Error loading data: {e}")
            return False

    @staticmethod
    def _build_code_mapping(frame, code_col, name_col):
        """Return ``{code: name}`` from the rows where both columns are set."""
        pairs = frame[[code_col, name_col]].dropna()
        return dict(zip(pairs[code_col], pairs[name_col]))

    def clean_airlines_data(self):
        """Build and store the IATA code -> airline name mapping.

        Returns:
            dict: the mapping, also stored as ``self.airlines_mapping``.
        """
        self.airlines_mapping = self._build_code_mapping(
            self.airlines_df, 'IATA_CODE', 'AIRLINE')
        return self.airlines_mapping

    def clean_airports_data(self):
        """Build and store the IATA code -> airport name mapping.

        Returns:
            dict: the mapping, also stored as ``self.airports_mapping``.
        """
        self.airports_mapping = self._build_code_mapping(
            self.airports_df, 'IATA_CODE', 'AIRPORT')
        return self.airports_mapping

    def preprocess_flights_data(self):
        """Preprocess the flights dataset.

        Keeps the relevant columns of non-cancelled flights and adds
        ``IS_DELAYED``, ``DEPARTURE_HOUR`` and ``SEASON``. When the code
        mappings have been built, the ``*_NAME`` columns are added as well,
        falling back to the raw code for unknown entries.

        Returns:
            pandas.DataFrame: a new frame; ``self.flights_df`` is not modified.
        """
        raw = self.flights_df
        # Select relevant columns and drop cancelled flights in one copy.
        df = raw.loc[raw['CANCELLED'] == 0, self._FLIGHT_COLUMNS].copy()

        # Target variable: delayed if arrival delay > 15 minutes.
        # NOTE(review): this is computed before missing delays are filled, so
        # rows with no ARRIVAL_DELAY are labelled 0 (not delayed).
        df['IS_DELAYED'] = (df['ARRIVAL_DELAY'] > 15).astype(int)

        # Fill missing values
        df['DEPARTURE_DELAY'] = df['DEPARTURE_DELAY'].fillna(0)
        df['ARRIVAL_DELAY'] = df['ARRIVAL_DELAY'].fillna(0)

        # SCHEDULED_DEPARTURE is divided by 100 to get the hour; a result of
        # 24 is folded onto hour 0 (midnight).
        df['DEPARTURE_HOUR'] = (df['SCHEDULED_DEPARTURE'] // 100).replace(24, 0)

        # Create season
        df['SEASON'] = df['MONTH'].apply(self._get_season)

        # Map airline codes to full names
        airlines_mapping = getattr(self, 'airlines_mapping', None)
        if airlines_mapping is not None:
            df['AIRLINE_NAME'] = (
                df['AIRLINE'].map(airlines_mapping).fillna(df['AIRLINE']))

        # Map airport codes to full names
        airports_mapping = getattr(self, 'airports_mapping', None)
        if airports_mapping is not None:
            for code_col in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
                df[f'{code_col}_NAME'] = (
                    df[code_col].map(airports_mapping).fillna(df[code_col]))

        return df

    @staticmethod
    def _most_common_category(values):
        """Return the first mode of ``values``, or 'clear' if there is none."""
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else 'clear'

    def preprocess_weather_data(self):
        """Preprocess the weather dataset.

        Parses ``datetime``, derives ``DATE``/``HOUR``, computes ``TEMP_C`` as
        ``temperature - 273.15`` (Kelvin to Celsius), categorises the weather
        description, and aggregates per ``(city, DATE, HOUR)``.

        Returns:
            pandas.DataFrame: one row per city/date/hour with mean numeric
            readings and the most common weather category.
        """
        df = self.weather_df.copy()

        # Convert datetime
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['DATE'] = df['datetime'].dt.date
        df['HOUR'] = df['datetime'].dt.hour

        # Convert temperature from Kelvin to Celsius
        df['TEMP_C'] = df['temperature'] - 273.15

        # Create weather categories
        df['WEATHER_CATEGORY'] = df['weather_description'].apply(
            self._categorize_weather)

        # Aggregate weather by city and date-hour
        weather_agg = df.groupby(['city', 'DATE', 'HOUR']).agg({
            'TEMP_C': 'mean',
            'humidity': 'mean',
            'pressure': 'mean',
            'wind_speed': 'mean',
            'WEATHER_CATEGORY': self._most_common_category,
        }).reset_index()

        return weather_agg

    def merge_flight_weather(self, flights_df, weather_df):
        """Attach weather features to the flights frame.

        This is a placeholder: ``weather_df`` is currently unused and the
        weather columns are filled with reproducible pseudo-random values.
        A real implementation needs an airport-to-city mapping to join on.

        Args:
            flights_df: frame with ``YEAR``, ``MONTH`` and ``DAY`` columns.
                It is not modified.
            weather_df: aggregated weather frame (unused for now).

        Returns:
            pandas.DataFrame: copy of ``flights_df`` with ``DATE``, ``TEMP_C``,
            ``HUMIDITY``, ``WIND_SPEED`` and ``WEATHER_CATEGORY`` added.
        """
        # Copy first so the caller's frame does not gain a DATE column.
        merged_df = flights_df.copy()

        # Create date column for flights
        merged_df['DATE'] = pd.to_datetime(
            merged_df[['YEAR', 'MONTH', 'DAY']]).dt.date

        # A private legacy generator yields the same values that
        # np.random.seed(42) would, without resetting the process-wide
        # NumPy random state for every other consumer.
        rng = np.random.RandomState(42)
        n_rows = len(merged_df)
        merged_df['TEMP_C'] = rng.normal(15, 10, n_rows)
        merged_df['HUMIDITY'] = rng.normal(60, 20, n_rows)
        merged_df['WIND_SPEED'] = rng.normal(10, 5, n_rows)
        merged_df['WEATHER_CATEGORY'] = rng.choice(
            ['clear', 'clouds', 'rain', 'snow'], n_rows)

        return merged_df

    def _get_season(self, month):
        """Get season from month; anything outside Dec-Aug maps to 'Fall'."""
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Fall'

    def _categorize_weather(self, description):
        """Map a free-text weather description to a coarse category.

        The first matching keyword wins, in the order clear, clouds, rain,
        snow, storm; anything else is 'other'.
        """
        # NOTE(review): because 'rain' is tested before 'storm', a description
        # such as "thunderstorm with rain" is categorised as 'rain'. Confirm
        # whether storms should take priority before changing the order.
        description = str(description).lower()

        if 'clear' in description or 'sunny' in description:
            return 'clear'
        elif 'cloud' in description:
            return 'clouds'
        elif 'rain' in description or 'drizzle' in description:
            return 'rain'
        elif 'snow' in description:
            return 'snow'
        elif 'storm' in description or 'thunder' in description:
            return 'storm'
        else:
            return 'other'

    def get_processed_data(self):
        """Run the full pipeline and return data ready for modelling.

        Returns:
            tuple: ``(final_data, airlines_mapping, airports_mapping,
            weather_processed)``, or four ``None`` values when loading fails.
        """
        if not self.load_data():
            return None, None, None, None

        # Clean mappings
        self.clean_airlines_data()
        self.clean_airports_data()

        # Preprocess datasets
        flights_processed = self.preprocess_flights_data()
        weather_processed = self.preprocess_weather_data()

        # Merge datasets
        final_data = self.merge_flight_weather(
            flights_processed, weather_processed)

        return (final_data, self.airlines_mapping, self.airports_mapping,
                weather_processed)