Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
class FlightDataPreprocessor:
    """Load, clean and combine flight, airline, airport and weather CSV data.

    Typical use is a single call to :meth:`get_processed_data`, which runs
    every step below in order. The individual steps can also be called on
    their own once the corresponding ``*_df`` attributes have been set.
    """

    # Columns of the flights table kept by ``preprocess_flights_data``.
    _FLIGHT_COLUMNS = [
        'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
        'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
        'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'SCHEDULED_ARRIVAL',
        'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DISTANCE', 'CANCELLED',
    ]

    # Month number -> season; any month not listed here is treated as 'Fall'.
    _SEASON_BY_MONTH = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }

    def __init__(self):
        # Raw tables, filled in by load_data().
        self.airlines_df = None
        self.airports_df = None
        self.flights_df = None
        self.weather_df = None
        # Code -> name dictionaries, filled in by the clean_*_data() methods.
        self.airlines_mapping = None
        self.airports_mapping = None
        self.label_encoders = {}

    def load_data(self, data_dir='data'):
        """Load all datasets from ``data_dir``.

        ``airlines.csv``, ``airports.csv``, ``flights.csv`` and
        ``weather.csv`` are required. ``indian_airports.csv`` and
        ``indian_airlines.csv`` are optional and, when readable, are appended
        to the airports / airlines tables.

        Args:
            data_dir: Directory containing the CSV files (default ``'data'``).

        Returns:
            True when every required file was loaded; False otherwise (the
            error is printed).
        """
        # Local import: the module-level import block does not provide ``os``.
        import os

        def csv_path(filename):
            return os.path.join(data_dir, filename)

        try:
            self.airlines_df = pd.read_csv(csv_path('airlines.csv'))
            self.airports_df = pd.read_csv(csv_path('airports.csv'))
            self.flights_df = pd.read_csv(csv_path('flights.csv'), low_memory=False)
            self.weather_df = pd.read_csv(csv_path('weather.csv'))
        except Exception as e:
            print(f"Error loading data: {e}")
            return False

        # Load Indian data if available (best effort; never fails the load).
        self.airports_df = self._append_optional_csv(
            self.airports_df, csv_path('indian_airports.csv'))
        self.airlines_df = self._append_optional_csv(
            self.airlines_df, csv_path('indian_airlines.csv'))
        return True

    @staticmethod
    def _append_optional_csv(base_df, path):
        """Return ``base_df`` with the rows of the CSV at ``path`` appended.

        A missing file is silently ignored because the file is optional. Any
        other failure is reported with a printed warning and ``base_df`` is
        returned unchanged, so an unreadable optional file never aborts
        loading.
        """
        try:
            extra_df = pd.read_csv(path)
            return pd.concat([base_df, extra_df], ignore_index=True)
        except FileNotFoundError:
            return base_df
        except Exception as e:
            print(f"Warning: skipping optional file {path}: {e}")
            return base_df

    @staticmethod
    def _build_code_mapping(df, code_col, name_col):
        """Build a ``{code: name}`` dict from two columns of ``df``.

        Rows where either column is missing are dropped. When a code occurs
        more than once, the last occurrence wins.
        """
        pairs = df[[code_col, name_col]].dropna()
        return dict(zip(pairs[code_col], pairs[name_col]))

    def clean_airlines_data(self):
        """Build, store and return the ``IATA_CODE -> AIRLINE`` mapping."""
        self.airlines_mapping = self._build_code_mapping(
            self.airlines_df, 'IATA_CODE', 'AIRLINE')
        return self.airlines_mapping

    def clean_airports_data(self):
        """Build, store and return the ``IATA_CODE -> AIRPORT`` mapping."""
        self.airports_mapping = self._build_code_mapping(
            self.airports_df, 'IATA_CODE', 'AIRPORT')
        return self.airports_mapping

    def preprocess_flights_data(self, delay_threshold=15):
        """Preprocess the flights dataset into a modelling-ready frame.

        Keeps the relevant columns of non-cancelled flights and adds:

        * ``IS_DELAYED``: 1 when ``ARRIVAL_DELAY`` exceeds ``delay_threshold``.
          It is computed before missing delays are filled, so a flight with
          no recorded arrival delay is labelled 0.
        * ``DEPARTURE_HOUR``: ``SCHEDULED_DEPARTURE // 100`` with 24 mapped to 0.
        * ``SEASON``: derived from ``MONTH``.
        * ``AIRLINE_NAME``, ``ORIGIN_AIRPORT_NAME`` and
          ``DESTINATION_AIRPORT_NAME``: added only when the corresponding
          mapping has been built; codes without a mapping entry are kept
          as-is.

        Args:
            delay_threshold: Arrival delay above which a flight counts as
                delayed (default 15, the value previously hard-coded).

        Returns:
            A new DataFrame; ``self.flights_df`` is not modified.
        """
        flights = self.flights_df
        # Filter rows and columns in one step, then copy once so the column
        # assignments below never write into a view of the raw table.
        df = flights.loc[flights['CANCELLED'] == 0, self._FLIGHT_COLUMNS].copy()

        # Create target variable before the delays are filled with 0.
        df['IS_DELAYED'] = (df['ARRIVAL_DELAY'] > delay_threshold).astype(int)

        # Fill missing values
        df['DEPARTURE_DELAY'] = df['DEPARTURE_DELAY'].fillna(0)
        df['ARRIVAL_DELAY'] = df['ARRIVAL_DELAY'].fillna(0)

        # Convert scheduled departure to hour; an hour of 24 is midnight.
        df['DEPARTURE_HOUR'] = (df['SCHEDULED_DEPARTURE'] // 100).replace(24, 0)

        # Vectorised equivalent of applying _get_season to every row.
        df['SEASON'] = df['MONTH'].map(self._SEASON_BY_MONTH).fillna('Fall')

        # Map airline codes to full names
        airlines_mapping = getattr(self, 'airlines_mapping', None)
        if airlines_mapping is not None:
            df['AIRLINE_NAME'] = (
                df['AIRLINE'].map(airlines_mapping).fillna(df['AIRLINE']))

        # Map airport codes to full names
        airports_mapping = getattr(self, 'airports_mapping', None)
        if airports_mapping is not None:
            df['ORIGIN_AIRPORT_NAME'] = (
                df['ORIGIN_AIRPORT'].map(airports_mapping)
                .fillna(df['ORIGIN_AIRPORT']))
            df['DESTINATION_AIRPORT_NAME'] = (
                df['DESTINATION_AIRPORT'].map(airports_mapping)
                .fillna(df['DESTINATION_AIRPORT']))
        return df

    @staticmethod
    def _most_common_category(values):
        """Return the most frequent value of ``values`` ('clear' if none)."""
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else 'clear'

    def preprocess_weather_data(self):
        """Preprocess the weather dataset.

        Parses ``datetime``, converts ``temperature`` to Celsius by
        subtracting 273.15, categorises ``weather_description`` and
        aggregates per ``(city, DATE, HOUR)``: the mean of the numeric
        columns and the most common weather category.

        Returns:
            A new DataFrame with columns ``city``, ``DATE``, ``HOUR``,
            ``TEMP_C``, ``humidity``, ``pressure``, ``wind_speed`` and
            ``WEATHER_CATEGORY``.
        """
        df = self.weather_df.copy()

        # Convert datetime
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['DATE'] = df['datetime'].dt.date
        df['HOUR'] = df['datetime'].dt.hour

        # Convert temperature from Kelvin to Celsius
        df['TEMP_C'] = df['temperature'] - 273.15

        # Create weather categories
        df['WEATHER_CATEGORY'] = df['weather_description'].apply(
            self._categorize_weather)

        # Aggregate weather by city and date-hour
        weather_agg = df.groupby(['city', 'DATE', 'HOUR']).agg({
            'TEMP_C': 'mean',
            'humidity': 'mean',
            'pressure': 'mean',
            'wind_speed': 'mean',
            'WEATHER_CATEGORY': self._most_common_category,
        }).reset_index()
        return weather_agg

    def merge_flight_weather(self, flights_df, weather_df):
        """Return a copy of ``flights_df`` with a date and weather features.

        NOTE: this is a placeholder. ``weather_df`` is currently unused; the
        weather columns (``TEMP_C``, ``HUMIDITY``, ``WIND_SPEED``,
        ``WEATHER_CATEGORY``) are reproducible pseudo-random values, not real
        observations. A real implementation needs an airport-to-city mapping
        to join on. The synthetic values are not clipped, so humidity can
        fall outside 0-100 and wind speed can be negative.

        Args:
            flights_df: Frame with ``YEAR``, ``MONTH`` and ``DAY`` columns.
                It is not modified.
            weather_df: Ignored for now; kept for interface stability.

        Returns:
            A new DataFrame with ``DATE`` and the weather columns added.
        """
        # Work on a copy so the caller's frame does not gain a DATE column.
        merged_df = flights_df.copy()

        # Create date column for flights
        merged_df['DATE'] = pd.to_datetime(
            merged_df[['YEAR', 'MONTH', 'DAY']]).dt.date

        # A private RandomState seeded with 42 yields the same stream as
        # np.random.seed(42) without resetting the process-wide generator.
        rng = np.random.RandomState(42)
        n_rows = len(merged_df)
        merged_df['TEMP_C'] = rng.normal(15, 10, n_rows)
        merged_df['HUMIDITY'] = rng.normal(60, 20, n_rows)
        merged_df['WIND_SPEED'] = rng.normal(10, 5, n_rows)
        merged_df['WEATHER_CATEGORY'] = rng.choice(
            ['clear', 'clouds', 'rain', 'snow'], n_rows)
        return merged_df

    def _get_season(self, month):
        """Get season from month; values outside Dec-Aug map to 'Fall'."""
        return FlightDataPreprocessor._SEASON_BY_MONTH.get(month, 'Fall')

    def _categorize_weather(self, description):
        """Categorize a free-text weather description.

        Returns one of 'storm', 'clear', 'clouds', 'rain', 'snow' or 'other'.
        Storms are tested first so that descriptions such as
        "thunderstorm with rain" are reported as 'storm' rather than 'rain'.
        Non-string input is converted with ``str()`` first.
        """
        description = str(description).lower()
        if 'storm' in description or 'thunder' in description:
            return 'storm'
        if 'clear' in description or 'sunny' in description:
            return 'clear'
        if 'cloud' in description:
            return 'clouds'
        if 'rain' in description or 'drizzle' in description:
            return 'rain'
        if 'snow' in description:
            return 'snow'
        return 'other'

    def get_processed_data(self, data_dir='data'):
        """Get fully processed data ready for modeling.

        Args:
            data_dir: Directory containing the CSV files (default ``'data'``).

        Returns:
            ``(final_data, airlines_mapping, airports_mapping,
            weather_processed)``, or ``(None, None, None, None)`` when the
            required files could not be loaded.
        """
        if not self.load_data(data_dir):
            return None, None, None, None

        # Clean mappings
        self.clean_airlines_data()
        self.clean_airports_data()

        # Preprocess datasets
        flights_processed = self.preprocess_flights_data()
        weather_processed = self.preprocess_weather_data()

        # Merge datasets
        final_data = self.merge_flight_weather(flights_processed, weather_processed)
        return final_data, self.airlines_mapping, self.airports_mapping, weather_processed