import os

import numpy as np
import pandas as pd

INPUT_FILE = "data/cyclone_tracks_merged_full.csv"
OUTPUT_FILE = "data/preprocessed.csv"

# Best-track fixes are assumed to arrive at a fixed 6-hour cadence (the
# standard best-track interval).  Storm age/duration, the lag features, and
# the 24-h-ahead target all hinge on this assumption — TODO confirm against
# the merged input file.
OBS_INTERVAL_HOURS = 6


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in km between two lat/lon points.

    Accepts scalars or array-likes (NumPy broadcasting applies), so it can
    be evaluated element-wise over whole pandas Series in one call.

    Parameters are in decimal degrees; the result is in kilometres.
    """
    R = 6371  # mean Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    return 2 * R * np.arcsin(np.sqrt(a))


def preprocess():
    """Load the merged cyclone-track CSV, engineer model features, and save
    a model-ready CSV including a temporal train/test SPLIT column.

    Reads INPUT_FILE, writes OUTPUT_FILE; returns nothing.
    """
    print("Loading data...")
    df = pd.read_csv(INPUT_FILE)
    print(f"Initial shape: {df.shape}")

    # Drop rows with missing essential values
    essential_cols = ["MAX_WIND", "MIN_PRESSURE", "LAT", "LON", "CYCLONE_NUMBER"]
    df = df.dropna(subset=essential_cols)
    print(f"After dropping missing essentials: {df.shape}")

    # Convert DATE_TIME to datetime and sort by cyclone then time, so every
    # per-cyclone shift/diff below operates on chronological order.
    df["DATE_TIME"] = pd.to_datetime(df["DATE_TIME"])
    df = df.sort_values(["CYCLONE_NUMBER", "DATE_TIME"])
    df = df.reset_index(drop=True)
    print(f"Unique cyclones: {df['CYCLONE_NUMBER'].nunique()}")

    # One grouper, reused by every per-cyclone feature below (the original
    # rebuilt df.groupby('CYCLONE_NUMBER') for each feature).  Safe because
    # subsequent column assignments mutate df in place without changing its
    # row index, and we only rebind df (pd.concat) after the last use of g.
    g = df.groupby("CYCLONE_NUMBER")

    # ========================================================================
    # TIME-BASED FEATURES
    # ========================================================================
    df["YEAR"] = df["DATE_TIME"].dt.year
    df["MONTH"] = df["DATE_TIME"].dt.month
    df["HOUR"] = df["DATE_TIME"].dt.hour
    df["DAY_OF_YEAR"] = df["DATE_TIME"].dt.dayofyear

    # Seasonal encoding (cyclical).  Dividing by 365 ignores leap years — a
    # ~0.3% phase error, acceptable for a seasonality feature.
    df["SIN_DOY"] = np.sin(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["COS_DOY"] = np.cos(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["SIN_HOUR"] = np.sin(2 * np.pi * df["HOUR"] / 24)
    df["COS_HOUR"] = np.cos(2 * np.pi * df["HOUR"] / 24)

    # ========================================================================
    # STORM-LEVEL FEATURES
    # ========================================================================
    # Age of storm: hours since first observation (position in track * 6 h).
    df["STORM_AGE_HOURS"] = g.cumcount() * OBS_INTERVAL_HOURS

    # Total storm duration (observation count * 6 h per observation).
    df["STORM_DURATION_HOURS"] = g["DATE_TIME"].transform("count") * OBS_INTERVAL_HOURS

    # ========================================================================
    # GEOGRAPHIC FEATURES
    # ========================================================================
    # Distance to Odisha coast (approximate center: 20 N, 86 E)
    ODISHA_LAT, ODISHA_LON = 20.0, 86.0
    df["DIST_TO_ODISHA_KM"] = haversine_distance(
        df["LAT"], df["LON"], ODISHA_LAT, ODISHA_LON
    )

    # Movement speed in km/h: great-circle distance from the previous fix of
    # the SAME cyclone divided by the 6-hour interval.  Vectorized with
    # groupby.shift instead of groupby.apply — index-aligned by construction
    # (no reset_index realignment needed), faster, and immune to the
    # pandas >= 2.2 GroupBy.apply `include_groups` deprecation.  The first
    # fix of each cyclone has no predecessor and stays NaN, as before.
    prev_lat = g["LAT"].shift(1)
    prev_lon = g["LON"].shift(1)
    df["MOVEMENT_SPEED_KPH"] = (
        haversine_distance(prev_lat, prev_lon, df["LAT"], df["LON"])
        / OBS_INTERVAL_HOURS
    )

    # ========================================================================
    # LAG FEATURES (WITHIN EACH CYCLONE)
    # ========================================================================
    print("Creating lag features...")

    # Lag features for MAX_WIND (within each cyclone): WIND_t-6 ... WIND_t-24
    for i in [1, 2, 3, 4]:
        df[f"WIND_t-{i*6}"] = g["MAX_WIND"].shift(i)

    # Lag features for MIN_PRESSURE (within each cyclone)
    for i in [1, 2, 3, 4]:
        df[f"PRESSURE_t-{i*6}"] = g["MIN_PRESSURE"].shift(i)

    # ========================================================================
    # INTENSITY CHANGE FEATURES
    # ========================================================================
    # Rate of intensity change over one 6-h step
    df["WIND_CHANGE_6H"] = g["MAX_WIND"].diff()
    df["PRESSURE_CHANGE_6H"] = g["MIN_PRESSURE"].diff()
    df["INTENSIFICATION_RATE"] = df["WIND_CHANGE_6H"] / OBS_INTERVAL_HOURS  # knots per hour

    # 12-hour change (two 6-h steps back)
    df["WIND_CHANGE_12H"] = g["MAX_WIND"].diff(2)
    df["PRESSURE_CHANGE_12H"] = g["MIN_PRESSURE"].diff(2)

    # ========================================================================
    # TARGET VARIABLE (24-HOUR AHEAD, WITHIN CYCLONE)
    # ========================================================================
    # shift(-4): 4 steps * 6 h = 24 h ahead.  The last 4 fixes of each
    # cyclone have no future value and become NaN (dropped below).
    df["TARGET_24H"] = g["MAX_WIND"].shift(-4)

    # ========================================================================
    # CATEGORICAL FEATURES
    # ========================================================================
    # One-hot encode STORM_TYPE
    storm_dummies = pd.get_dummies(df["STORM_TYPE"], prefix="STORM")
    df = pd.concat([df, storm_dummies], axis=1)

    # ========================================================================
    # WIND RADII FEATURES
    # ========================================================================
    # Fill NaN with 0 (0 means no wind at that radius)
    radius_features = [
        "RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW",
        "RAD50_NE", "RAD50_SE", "RAD50_SW", "RAD50_NW",
        "RAD64_NE", "RAD64_SE", "RAD64_SW", "RAD64_NW",
    ]
    for col in radius_features:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Average radius at each wind threshold.  NOTE(review): these selections
    # assume all 12 radius columns exist in the input (the fillna loop above
    # tolerates missing ones, but this would raise KeyError) — confirm
    # against the merged CSV schema.
    df["AVG_RAD34"] = df[["RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW"]].mean(axis=1)
    df["AVG_RAD50"] = df[["RAD50_NE", "RAD50_SE", "RAD50_SW", "RAD50_NW"]].mean(axis=1)
    df["AVG_RAD64"] = df[["RAD64_NE", "RAD64_SE", "RAD64_SW", "RAD64_NW"]].mean(axis=1)

    # Storm size (maximum extent of the outermost wind field)
    df["STORM_SIZE"] = df[["RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW"]].max(axis=1)

    # ========================================================================
    # TRAIN/TEST SPLIT
    # ========================================================================
    # Temporal split: train on earlier years, test on recent ones, so the
    # model is never evaluated on data from years it trained on.
    # Adjust the cutoff year as needed.
    TRAIN_END_YEAR = 2020
    df["SPLIT"] = "train"
    df.loc[df["YEAR"] > TRAIN_END_YEAR, "SPLIT"] = "test"

    print(f"\nSplit distribution:")
    print(df["SPLIT"].value_counts())

    # ========================================================================
    # DROP ROWS WITH MISSING CRITICAL VALUES
    # ========================================================================
    # Only drop if lag features or target are missing
    # (This happens at start/end of each cyclone track)
    critical_for_training = ["WIND_t-6", "PRESSURE_t-6", "TARGET_24H"]

    print(f"\nBefore dropping NaN lags/target: {df.shape}")
    df = df.dropna(subset=critical_for_training)
    print(f"After dropping NaN lags/target: {df.shape}")

    # ========================================================================
    # SAVE OUTPUT
    # ========================================================================
    # Ensure output folder exists
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    # Save preprocessed CSV
    df.to_csv(OUTPUT_FILE, index=False)

    print("\n" + "="*60)
    print("✅ PREPROCESSING COMPLETE")
    print("="*60)
    print(f"Output file: {OUTPUT_FILE}")
    print(f"Final shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print(f"\nFeature categories:")
    print(f" - Time features: YEAR, MONTH, HOUR, DAY_OF_YEAR, SIN/COS encodings")
    print(f" - Storm features: AGE, DURATION, SIZE")
    print(f" - Geographic: LAT, LON, DIST_TO_ODISHA, MOVEMENT_SPEED")
    print(f" - Lag features: WIND_t-6/12/18/24, PRESSURE_t-6/12/18/24")
    print(f" - Change features: WIND_CHANGE, INTENSIFICATION_RATE")
    print(f" - Wind radii: {len(radius_features)} radius features + averages")
    print(f" - Target: TARGET_24H (wind speed 24 hours ahead)")
    print(f"\nTrain/Test split: {(df['SPLIT']=='train').sum()} train, {(df['SPLIT']=='test').sum()} test")


if __name__ == "__main__":
    preprocess()