import pandas as pd
import numpy as np
import os

INPUT_FILE = "data/cyclone_tracks_merged_full.csv"
OUTPUT_FILE = "data/preprocessed.csv"


def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the great-circle distance in km between two lat/lon points."""
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    return 2 * R * np.arcsin(np.sqrt(a))
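
# Quick sanity check for the formula above: one degree of latitude is
# ~111 km everywhere on the sphere, so haversine_distance(20.0, 86.0, 21.0, 86.0)
# should return roughly 6371 * pi / 180 ≈ 111.19 km.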


def preprocess():
    print("Loading data...")
    df = pd.read_csv(INPUT_FILE)
    print(f"Initial shape: {df.shape}")

    # Drop rows with missing essential values
    essential_cols = ["MAX_WIND", "MIN_PRESSURE", "LAT", "LON", "CYCLONE_NUMBER"]
    df = df.dropna(subset=essential_cols)
    print(f"After dropping missing essentials: {df.shape}")

    # Convert DATE_TIME to datetime and sort by cyclone, then time
    df["DATE_TIME"] = pd.to_datetime(df["DATE_TIME"])
    df = df.sort_values(["CYCLONE_NUMBER", "DATE_TIME"])
    df = df.reset_index(drop=True)
    print(f"Unique cyclones: {df['CYCLONE_NUMBER'].nunique()}")

    # ========================================================================
    # TIME-BASED FEATURES
    # ========================================================================
    df["YEAR"] = df["DATE_TIME"].dt.year
    df["MONTH"] = df["DATE_TIME"].dt.month
    df["HOUR"] = df["DATE_TIME"].dt.hour
    df["DAY_OF_YEAR"] = df["DATE_TIME"].dt.dayofyear

    # Seasonal encoding (cyclical)
    df["SIN_DOY"] = np.sin(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["COS_DOY"] = np.cos(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["SIN_HOUR"] = np.sin(2 * np.pi * df["HOUR"] / 24)
    df["COS_HOUR"] = np.cos(2 * np.pi * df["HOUR"] / 24)

    # ========================================================================
    # STORM-LEVEL FEATURES
    # ========================================================================
    # Age of storm (hours since first observation)
    df['STORM_AGE_HOURS'] = df.groupby('CYCLONE_NUMBER').cumcount() * 6

    # Total storm duration
    df['STORM_DURATION_HOURS'] = df.groupby('CYCLONE_NUMBER')['DATE_TIME'].transform('count') * 6
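
    # Both features assume a regular 6-hourly best-track cadence. If the
    # merged input also carries off-synoptic fixes, an elapsed-time version
    # is a safer sketch:
    #   t0 = df.groupby('CYCLONE_NUMBER')['DATE_TIME'].transform('min')
    #   df['STORM_AGE_HOURS'] = (df['DATE_TIME'] - t0).dt.total_seconds() / 3600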

    # ========================================================================
    # GEOGRAPHIC FEATURES
    # ========================================================================
    # Distance to Odisha coast (approximate center: 20°N, 86°E)
    ODISHA_LAT, ODISHA_LON = 20.0, 86.0
    df['DIST_TO_ODISHA_KM'] = haversine_distance(
        df['LAT'], df['LON'], ODISHA_LAT, ODISHA_LON
    )

    # Movement speed in km/h (distance between consecutive 6-hourly fixes / 6)
    df['MOVEMENT_SPEED_KPH'] = df.groupby('CYCLONE_NUMBER').apply(
        lambda x: haversine_distance(
            x['LAT'].shift(1), x['LON'].shift(1), x['LAT'], x['LON']
        ) / 6
    ).reset_index(level=0, drop=True)
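
    # The first fix of each track has no previous position, so its
    # MOVEMENT_SPEED_KPH is NaN. Optional sanity check:
    #   assert df['MOVEMENT_SPEED_KPH'].isna().sum() == df['CYCLONE_NUMBER'].nunique()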

    # ========================================================================
    # LAG FEATURES (WITHIN EACH CYCLONE)
    # ========================================================================
    print("Creating lag features...")

    # Lag features for MAX_WIND (within each cyclone)
    for i in [1, 2, 3, 4]:
        df[f"WIND_t-{i*6}"] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].shift(i)

    # Lag features for MIN_PRESSURE (within each cyclone)
    for i in [1, 2, 3, 4]:
        df[f"PRESSURE_t-{i*6}"] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].shift(i)

    # ========================================================================
    # INTENSITY CHANGE FEATURES
    # ========================================================================
    # Rate of intensity change
    df['WIND_CHANGE_6H'] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].diff()
    df['PRESSURE_CHANGE_6H'] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].diff()
    df['INTENSIFICATION_RATE'] = df['WIND_CHANGE_6H'] / 6  # knots per hour

    # 12-hour change
    df['WIND_CHANGE_12H'] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].diff(2)
    df['PRESSURE_CHANGE_12H'] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].diff(2)

    # ========================================================================
    # TARGET VARIABLE (24-HOUR AHEAD, WITHIN CYCLONE)
    # ========================================================================
    df["TARGET_24H"] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].shift(-4)

    # ========================================================================
    # CATEGORICAL FEATURES
    # ========================================================================
    # One-hot encode STORM_TYPE
    storm_dummies = pd.get_dummies(df["STORM_TYPE"], prefix="STORM")
    df = pd.concat([df, storm_dummies], axis=1)

    # ========================================================================
    # WIND RADII FEATURES
    # ========================================================================
    # Fill NaN with 0 (0 means no wind at that radius)
    radius_features = [
        "RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW",
        "RAD50_NE", "RAD50_SE", "RAD50_SW", "RAD50_NW",
        "RAD64_NE", "RAD64_SE", "RAD64_SW", "RAD64_NW",
    ]
    for col in radius_features:
        if col in df.columns:
            df[col] = df[col].fillna(0)
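
    # Caveat: this conflates "radius not reported" with "no wind of that
    # strength". If the source distinguishes the two, computing a flag such
    # as df[col].isna() inside the loop above (before the fillna) would
    # preserve the missingness signal.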

    # Average radius at each wind threshold
    df['AVG_RAD34'] = df[['RAD_NE', 'RAD_SE', 'RAD_SW', 'RAD_NW']].mean(axis=1)
    df['AVG_RAD50'] = df[['RAD50_NE', 'RAD50_SE', 'RAD50_SW', 'RAD50_NW']].mean(axis=1)
    df['AVG_RAD64'] = df[['RAD64_NE', 'RAD64_SE', 'RAD64_SW', 'RAD64_NW']].mean(axis=1)

    # Storm size (maximum extent)
    df['STORM_SIZE'] = df[['RAD_NE', 'RAD_SE', 'RAD_SW', 'RAD_NW']].max(axis=1)
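
    # The RAD_NE..RAD_NW columns are taken here to be the 34-kt quadrant
    # radii (matching the AVG_RAD34 name); the quadrant mean gives a
    # symmetric size proxy, while the max captures the furthest gale extent.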

    # ========================================================================
    # TRAIN/TEST SPLIT
    # ========================================================================
    # Temporal split: train on earlier years, test on recent.
    # Adjust the cutoff year as needed.
    TRAIN_END_YEAR = 2020
    df['SPLIT'] = 'train'
    df.loc[df['YEAR'] > TRAIN_END_YEAR, 'SPLIT'] = 'test'
    print("\nSplit distribution:")
    print(df['SPLIT'].value_counts())
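
    # Note: a cyclone active across the cutoff year contributes fixes to
    # both splits, which lets test-row lag features draw on train-period
    # fixes. A stricter variant assigns whole tracks by their first year
    # (sketch, same column names):
    #   first_year = df.groupby('CYCLONE_NUMBER')['YEAR'].transform('min')
    #   df['SPLIT'] = np.where(first_year > TRAIN_END_YEAR, 'test', 'train')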

    # ========================================================================
    # DROP ROWS WITH MISSING CRITICAL VALUES
    # ========================================================================
    # Only drop if lag features or target are missing
    # (this happens at the start/end of each cyclone track)
    critical_for_training = ["WIND_t-6", "PRESSURE_t-6", "TARGET_24H"]
    print(f"\nBefore dropping NaN lags/target: {df.shape}")
    df = df.dropna(subset=critical_for_training)
    print(f"After dropping NaN lags/target: {df.shape}")

    # ========================================================================
    # SAVE OUTPUT
    # ========================================================================
    # Ensure the output folder exists
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    # Save preprocessed CSV
    df.to_csv(OUTPUT_FILE, index=False)

    print("\n" + "=" * 60)
    print("✅ PREPROCESSING COMPLETE")
    print("=" * 60)
    print(f"Output file: {OUTPUT_FILE}")
    print(f"Final shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print("\nFeature categories:")
    print(" - Time features: YEAR, MONTH, HOUR, DAY_OF_YEAR, SIN/COS encodings")
    print(" - Storm features: AGE, DURATION, SIZE")
    print(" - Geographic: LAT, LON, DIST_TO_ODISHA, MOVEMENT_SPEED")
    print(" - Lag features: WIND_t-6/12/18/24, PRESSURE_t-6/12/18/24")
    print(" - Change features: WIND_CHANGE, INTENSIFICATION_RATE")
    print(f" - Wind radii: {len(radius_features)} radius features + averages")
    print(" - Target: TARGET_24H (wind speed 24 hours ahead)")
    print(f"\nTrain/Test split: {(df['SPLIT'] == 'train').sum()} train, "
          f"{(df['SPLIT'] == 'test').sum()} test")


if __name__ == "__main__":
    preprocess()