File size: 7,976 Bytes
eb51ab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import pandas as pd
import numpy as np
import os

# Paths for the merged raw track observations (input) and the
# feature-engineered CSV consumed by the downstream modelling step (output).
INPUT_FILE = "data/cyclone_tracks_merged_full.csv"
OUTPUT_FILE = "data/preprocessed.csv"

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two lat/lon points (haversine).

    Accepts scalars or numpy/pandas array-likes; broadcasts elementwise.
    """
    earth_radius_km = 6371
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)
    half_dlat = (lat2_rad - lat1_rad) / 2
    half_dlon = (lon2_rad - lon1_rad) / 2
    # Haversine of the central angle between the two points
    hav = np.sin(half_dlat) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2
    return earth_radius_km * 2 * np.arcsin(np.sqrt(hav))

def preprocess():
    """Build a model-ready cyclone dataset from merged track observations.

    Reads INPUT_FILE, engineers time / storm-level / geographic / lag /
    intensity-change features per cyclone track, attaches a 24-hour-ahead
    wind target, performs a temporal train/test split, and writes OUTPUT_FILE.

    Assumptions:
      - Observations are nominally 6-hourly; the lag features (shift 1..4)
        and TARGET_24H (shift -4) rely on that cadence.  Storm age, duration
        and movement speed are derived from the actual timestamps, so they
        remain correct even when a track has gaps or irregular fixes.
    """
    print("Loading data...")
    df = pd.read_csv(INPUT_FILE)
    print(f"Initial shape: {df.shape}")

    # Drop rows missing values we cannot impute or work around
    essential_cols = ["MAX_WIND", "MIN_PRESSURE", "LAT", "LON", "CYCLONE_NUMBER"]
    df = df.dropna(subset=essential_cols)
    print(f"After dropping missing essentials: {df.shape}")

    # Parse timestamps and sort by cyclone then time so every group-wise
    # shift/diff below operates on chronologically ordered tracks.
    df["DATE_TIME"] = pd.to_datetime(df["DATE_TIME"])
    df = df.sort_values(["CYCLONE_NUMBER", "DATE_TIME"]).reset_index(drop=True)

    print(f"Unique cyclones: {df['CYCLONE_NUMBER'].nunique()}")

    # ========================================================================
    # TIME-BASED FEATURES
    # ========================================================================
    df["YEAR"] = df["DATE_TIME"].dt.year
    df["MONTH"] = df["DATE_TIME"].dt.month
    df["HOUR"] = df["DATE_TIME"].dt.hour
    df["DAY_OF_YEAR"] = df["DATE_TIME"].dt.dayofyear

    # Cyclical encodings so Dec 31 / Jan 1 and 23h / 0h end up adjacent
    df["SIN_DOY"] = np.sin(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["COS_DOY"] = np.cos(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["SIN_HOUR"] = np.sin(2 * np.pi * df["HOUR"] / 24)
    df["COS_HOUR"] = np.cos(2 * np.pi * df["HOUR"] / 24)

    # ========================================================================
    # STORM-LEVEL FEATURES
    # ========================================================================
    # Age/duration from real timestamps instead of `row index * 6`, which
    # silently mislabels any track with a missing 6-hourly fix.
    first_obs = df.groupby("CYCLONE_NUMBER")["DATE_TIME"].transform("min")
    last_obs = df.groupby("CYCLONE_NUMBER")["DATE_TIME"].transform("max")
    df["STORM_AGE_HOURS"] = (df["DATE_TIME"] - first_obs).dt.total_seconds() / 3600.0
    df["STORM_DURATION_HOURS"] = (last_obs - first_obs).dt.total_seconds() / 3600.0

    # ========================================================================
    # GEOGRAPHIC FEATURES
    # ========================================================================
    # Distance to Odisha coast (approximate center: 20°N, 86°E)
    ODISHA_LAT, ODISHA_LON = 20.0, 86.0
    df["DIST_TO_ODISHA_KM"] = haversine_distance(
        df["LAT"], df["LON"], ODISHA_LAT, ODISHA_LON
    )

    # Movement speed in km/h, fully vectorised.  (Replaces groupby().apply(),
    # which is slow, fragile to index alignment, and deprecated with the
    # grouping columns in pandas >= 2.2.)  The first fix of each track has
    # no predecessor and stays NaN, as before.
    grp = df.groupby("CYCLONE_NUMBER")
    step_km = haversine_distance(
        grp["LAT"].shift(1), grp["LON"].shift(1), df["LAT"], df["LON"]
    )
    elapsed_h = grp["DATE_TIME"].diff().dt.total_seconds() / 3600.0
    # Guard: duplicate timestamps would otherwise divide by zero -> inf
    df["MOVEMENT_SPEED_KPH"] = step_km / elapsed_h.replace(0, np.nan)

    # ========================================================================
    # LAG FEATURES (WITHIN EACH CYCLONE)
    # ========================================================================
    print("Creating lag features...")

    # One lag per 6-hour step back; group-wise shift never leaks values
    # across cyclone boundaries.
    for i in [1, 2, 3, 4]:
        df[f"WIND_t-{i*6}"] = grp["MAX_WIND"].shift(i)
        df[f"PRESSURE_t-{i*6}"] = grp["MIN_PRESSURE"].shift(i)

    # ========================================================================
    # INTENSITY CHANGE FEATURES
    # ========================================================================
    # Rate of intensity change over one and two 6-hour steps
    df["WIND_CHANGE_6H"] = grp["MAX_WIND"].diff()
    df["PRESSURE_CHANGE_6H"] = grp["MIN_PRESSURE"].diff()
    df["INTENSIFICATION_RATE"] = df["WIND_CHANGE_6H"] / 6  # knots per hour

    df["WIND_CHANGE_12H"] = grp["MAX_WIND"].diff(2)
    df["PRESSURE_CHANGE_12H"] = grp["MIN_PRESSURE"].diff(2)

    # ========================================================================
    # TARGET VARIABLE (24-HOUR AHEAD, WITHIN CYCLONE)
    # ========================================================================
    # 4 steps x 6 h = 24 h ahead; NaN near the end of each track
    df["TARGET_24H"] = grp["MAX_WIND"].shift(-4)

    # ========================================================================
    # CATEGORICAL FEATURES
    # ========================================================================
    # One-hot encode STORM_TYPE; guarded so a source file without the column
    # does not crash (consistent with the radius handling below)
    if "STORM_TYPE" in df.columns:
        storm_dummies = pd.get_dummies(df["STORM_TYPE"], prefix="STORM")
        df = pd.concat([df, storm_dummies], axis=1)

    # ========================================================================
    # WIND RADII FEATURES
    # ========================================================================
    # Fill NaN with 0 (0 means no wind at that radius)
    radius_features = [
        "RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW",
        "RAD50_NE", "RAD50_SE", "RAD50_SW", "RAD50_NW",
        "RAD64_NE", "RAD64_SE", "RAD64_SW", "RAD64_NW",
    ]

    for col in radius_features:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Average radius per wind threshold and max 34-kt extent.  Guarded the
    # same way as the fillna loop above so a missing radius column cannot
    # raise KeyError.
    rad34 = [c for c in ("RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW") if c in df.columns]
    rad50 = [c for c in ("RAD50_NE", "RAD50_SE", "RAD50_SW", "RAD50_NW") if c in df.columns]
    rad64 = [c for c in ("RAD64_NE", "RAD64_SE", "RAD64_SW", "RAD64_NW") if c in df.columns]
    if rad34:
        df["AVG_RAD34"] = df[rad34].mean(axis=1)
        # Storm size (maximum extent of the 34-kt wind field)
        df["STORM_SIZE"] = df[rad34].max(axis=1)
    if rad50:
        df["AVG_RAD50"] = df[rad50].mean(axis=1)
    if rad64:
        df["AVG_RAD64"] = df[rad64].mean(axis=1)

    # ========================================================================
    # TRAIN/TEST SPLIT
    # ========================================================================
    # Temporal split avoids leakage: train on earlier seasons, test on recent.
    # Adjust the cutoff year as needed.
    TRAIN_END_YEAR = 2020
    df["SPLIT"] = "train"
    df.loc[df["YEAR"] > TRAIN_END_YEAR, "SPLIT"] = "test"

    print(f"\nSplit distribution:")
    print(df["SPLIT"].value_counts())

    # ========================================================================
    # DROP ROWS WITH MISSING CRITICAL VALUES
    # ========================================================================
    # Rows at the start (no lag) or end (no target) of each track are
    # unusable for supervised training.
    critical_for_training = ["WIND_t-6", "PRESSURE_t-6", "TARGET_24H"]

    print(f"\nBefore dropping NaN lags/target: {df.shape}")
    df = df.dropna(subset=critical_for_training)
    print(f"After dropping NaN lags/target: {df.shape}")

    # ========================================================================
    # SAVE OUTPUT
    # ========================================================================
    # Ensure output folder exists, then persist the preprocessed CSV
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    df.to_csv(OUTPUT_FILE, index=False)

    print("\n" + "="*60)
    print("✅ PREPROCESSING COMPLETE")
    print("="*60)
    print(f"Output file: {OUTPUT_FILE}")
    print(f"Final shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print(f"\nFeature categories:")
    print(f"  - Time features: YEAR, MONTH, HOUR, DAY_OF_YEAR, SIN/COS encodings")
    print(f"  - Storm features: AGE, DURATION, SIZE")
    print(f"  - Geographic: LAT, LON, DIST_TO_ODISHA, MOVEMENT_SPEED")
    print(f"  - Lag features: WIND_t-6/12/18/24, PRESSURE_t-6/12/18/24")
    print(f"  - Change features: WIND_CHANGE, INTENSIFICATION_RATE")
    print(f"  - Wind radii: {len(radius_features)} radius features + averages")
    print(f"  - Target: TARGET_24H (wind speed 24 hours ahead)")
    print(f"\nTrain/Test split: {(df['SPLIT']=='train').sum()} train, {(df['SPLIT']=='test').sum()} test")

if __name__ == "__main__":
    # Script entry point: run the full preprocessing pipeline end to end.
    preprocess()