File size: 7,798 Bytes
a13b550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class FlightDataPreprocessor:
    """Load, clean and combine flight, airline, airport and weather CSV data.

    The usual entry point is :meth:`get_processed_data`, which runs every
    step in order. The individual steps can also be called directly once the
    corresponding ``*_df`` attributes have been populated.

    Attributes set by this class:
        airlines_df, airports_df, flights_df, weather_df: raw DataFrames
            read by :meth:`load_data` (``None`` until then).
        airlines_mapping, airports_mapping: ``{IATA_CODE: name}`` dicts,
            created only when the matching ``clean_*`` method has run.
        label_encoders: empty dict created in ``__init__``; not used by any
            method of this class.
    """

    def __init__(self):
        self.airlines_df = None
        self.airports_df = None
        # Declared up front so the attributes exist even before load_data().
        self.flights_df = None
        self.weather_df = None
        self.label_encoders = {}

    def load_data(self) -> bool:
        """Read all CSV inputs from the ``data/`` directory.

        The four core files (airlines, airports, flights, weather) are
        required. The two Indian supplements are optional and, when readable,
        are appended to the airports/airlines frames.

        Returns:
            True if every required file was read; False otherwise (the error
            is printed, not raised).
        """
        try:
            self.airlines_df = pd.read_csv('data/airlines.csv')
            self.airports_df = pd.read_csv('data/airports.csv')
            self.flights_df = pd.read_csv('data/flights.csv', low_memory=False)
            self.weather_df = pd.read_csv('data/weather.csv')
        except Exception as e:
            print(f"Error loading data: {e}")
            return False

        # Optional supplements: best-effort, never fatal.
        self.airports_df = self._append_optional_csv(
            self.airports_df, 'data/indian_airports.csv')
        self.airlines_df = self._append_optional_csv(
            self.airlines_df, 'data/indian_airlines.csv')
        return True

    @staticmethod
    def _append_optional_csv(base_df, path):
        """Return ``base_df`` with the rows of the CSV at ``path`` appended.

        If the file is missing, ``base_df`` is returned unchanged without any
        message. Any other failure is reported on stdout and ``base_df`` is
        still returned, so optional data can never abort loading.
        """
        try:
            extra = pd.read_csv(path)
            return pd.concat([base_df, extra], ignore_index=True)
        except FileNotFoundError:
            # Expected case: the supplement simply is not installed.
            return base_df
        except Exception as e:
            # Deliberately broad (but not bare, so Ctrl-C still works):
            # a corrupt optional file must not break the core pipeline.
            print(f"Skipping optional data file {path}: {e}")
            return base_df

    @staticmethod
    def _build_code_mapping(df, value_col) -> dict:
        """Build ``{IATA_CODE: value_col}`` from rows where both are present."""
        pairs = df[['IATA_CODE', value_col]].dropna()
        return dict(zip(pairs['IATA_CODE'], pairs[value_col]))

    def clean_airlines_data(self) -> dict:
        """Build and store the airline code -> airline name mapping.

        Rows of ``airlines_df`` with a missing ``IATA_CODE`` or ``AIRLINE``
        are skipped. The result is stored on ``self.airlines_mapping``.

        Returns:
            The mapping dict.
        """
        self.airlines_mapping = self._build_code_mapping(self.airlines_df, 'AIRLINE')
        return self.airlines_mapping

    def clean_airports_data(self) -> dict:
        """Build and store the airport code -> airport name mapping.

        Rows of ``airports_df`` with a missing ``IATA_CODE`` or ``AIRPORT``
        are skipped. The result is stored on ``self.airports_mapping``.

        Returns:
            The mapping dict.
        """
        self.airports_mapping = self._build_code_mapping(self.airports_df, 'AIRPORT')
        return self.airports_mapping

    def preprocess_flights_data(self):
        """Derive modelling features from ``self.flights_df``.

        Steps: keep a fixed set of columns, drop cancelled flights, add the
        binary target ``IS_DELAYED`` (arrival delay > 15), zero-fill missing
        delays, and add ``DEPARTURE_HOUR`` and ``SEASON``. If the airline /
        airport mappings have been built, human-readable ``*_NAME`` columns
        are added too, falling back to the raw code when it is not mapped.

        Returns:
            A new DataFrame; ``self.flights_df`` is not modified.
        """
        relevant_cols = ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
                         'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
                         'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'SCHEDULED_ARRIVAL',
                         'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DISTANCE', 'CANCELLED']

        df = self.flights_df[relevant_cols]

        # Explicit copy after filtering: the columns assigned below must land
        # on an independent frame, not on a slice of the source data.
        df = df[df['CANCELLED'] == 0].copy()

        # Target variable. A missing ARRIVAL_DELAY compares False here, so
        # such rows are labelled "not delayed".
        df['IS_DELAYED'] = (df['ARRIVAL_DELAY'] > 15).astype(int)

        # Fill missing delays with zero.
        df['DEPARTURE_DELAY'] = df['DEPARTURE_DELAY'].fillna(0)
        df['ARRIVAL_DELAY'] = df['ARRIVAL_DELAY'].fillna(0)

        # Scheduled departure is treated as an HHMM number; 2400 wraps to hour 0.
        df['DEPARTURE_HOUR'] = (df['SCHEDULED_DEPARTURE'] // 100).replace(24, 0)

        df['SEASON'] = df['MONTH'].apply(self._get_season)

        # The mappings only exist once clean_*_data() has been called.
        airlines_mapping = getattr(self, 'airlines_mapping', None)
        if airlines_mapping is not None:
            df['AIRLINE_NAME'] = df['AIRLINE'].map(airlines_mapping).fillna(df['AIRLINE'])

        airports_mapping = getattr(self, 'airports_mapping', None)
        if airports_mapping is not None:
            for code_col in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
                df[code_col + '_NAME'] = df[code_col].map(airports_mapping).fillna(df[code_col])

        return df

    @staticmethod
    def _dominant_category(values):
        """Return the most frequent value of a Series, or 'clear' if none."""
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else 'clear'

    def preprocess_weather_data(self):
        """Aggregate ``self.weather_df`` per city, date and hour.

        Numeric readings are averaged and the most frequent weather category
        is kept for each (city, DATE, HOUR) group.

        Returns:
            A new DataFrame with columns ``city``, ``DATE``, ``HOUR``,
            ``TEMP_C``, ``humidity``, ``pressure``, ``wind_speed`` and
            ``WEATHER_CATEGORY``.
        """
        df = self.weather_df.copy()

        df['datetime'] = pd.to_datetime(df['datetime'])
        df['DATE'] = df['datetime'].dt.date
        df['HOUR'] = df['datetime'].dt.hour

        # Assumes the 'temperature' column is in Kelvin -- TODO confirm
        # against the weather data source.
        df['TEMP_C'] = df['temperature'] - 273.15

        df['WEATHER_CATEGORY'] = df['weather_description'].apply(self._categorize_weather)

        weather_agg = df.groupby(['city', 'DATE', 'HOUR']).agg({
            'TEMP_C': 'mean',
            'humidity': 'mean',
            'pressure': 'mean',
            'wind_speed': 'mean',
            'WEATHER_CATEGORY': self._dominant_category,
        }).reset_index()

        return weather_agg

    def merge_flight_weather(self, flights_df, weather_df):
        """Attach weather features to each flight (placeholder implementation).

        NOTE: ``weather_df`` is currently unused. Real matching would need an
        airport-to-city mapping, so synthetic weather values are generated
        instead. The values are reproducible (fixed seed 42) and are drawn
        from a private generator, so NumPy's global random state is left
        untouched.

        Args:
            flights_df: output of :meth:`preprocess_flights_data`; must have
                ``YEAR``, ``MONTH`` and ``DAY`` columns. It is not modified.
            weather_df: accepted for interface compatibility; ignored.

        Returns:
            A copy of ``flights_df`` with added ``DATE``, ``TEMP_C``,
            ``HUMIDITY``, ``WIND_SPEED`` and ``WEATHER_CATEGORY`` columns.
        """
        # Work on a copy so the caller's frame does not silently gain columns.
        merged_df = flights_df.copy()
        merged_df['DATE'] = pd.to_datetime(merged_df[['YEAR', 'MONTH', 'DAY']]).dt.date

        # RandomState(42) yields the same stream as np.random.seed(42) without
        # reseeding the process-wide generator. Draw order must stay fixed.
        rng = np.random.RandomState(42)
        n_rows = len(merged_df)
        merged_df['TEMP_C'] = rng.normal(15, 10, n_rows)
        merged_df['HUMIDITY'] = rng.normal(60, 20, n_rows)
        merged_df['WIND_SPEED'] = rng.normal(10, 5, n_rows)
        merged_df['WEATHER_CATEGORY'] = rng.choice(['clear', 'clouds', 'rain', 'snow'], n_rows)

        return merged_df

    def _get_season(self, month):
        """Return the season name for a month number.

        Dec-Feb is 'Winter', Mar-May 'Spring', Jun-Aug 'Summer'; every other
        value (including anything unrecognised) is 'Fall'.
        """
        if month in (12, 1, 2):
            return 'Winter'
        if month in (3, 4, 5):
            return 'Spring'
        if month in (6, 7, 8):
            return 'Summer'
        return 'Fall'

    def _categorize_weather(self, description):
        """Map a free-text weather description to a coarse category.

        Matching is case-insensitive substring search. Returns one of
        'storm', 'clear', 'clouds', 'rain', 'snow' or 'other'.
        """
        description = str(description).lower()
        # Storm is tested first: descriptions such as "thunderstorm with
        # rain" also contain "rain" and would otherwise never reach it.
        if 'storm' in description or 'thunder' in description:
            return 'storm'
        if 'clear' in description or 'sunny' in description:
            return 'clear'
        if 'cloud' in description:
            return 'clouds'
        if 'rain' in description or 'drizzle' in description:
            return 'rain'
        if 'snow' in description:
            return 'snow'
        return 'other'

    def get_processed_data(self):
        """Run the full pipeline: load, clean, preprocess and merge.

        Returns:
            A 4-tuple ``(final_data, airlines_mapping, airports_mapping,
            weather_processed)``, or ``(None, None, None, None)`` if the
            required CSV files could not be loaded.
        """
        if not self.load_data():
            return None, None, None, None

        # Build code -> name mappings first; flight preprocessing uses them.
        self.clean_airlines_data()
        self.clean_airports_data()

        flights_processed = self.preprocess_flights_data()
        weather_processed = self.preprocess_weather_data()

        final_data = self.merge_flight_weather(flights_processed, weather_processed)

        return final_data, self.airlines_mapping, self.airports_mapping, weather_processed