import pandas as pd
import numpy as np
import os
INPUT_FILE = "data/cyclone_tracks_merged_full.csv"
OUTPUT_FILE = "data/preprocessed.csv"
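
# Overview of the preprocessing pipeline implemented below:
#   1. load the merged best-track CSV and drop rows missing essential fields,
#   2. sort observations by cyclone and then by time,
#   3. derive time, storm-level, geographic, lag, intensity-change, wind-radii
#      and target (MAX_WIND 24 hours ahead) features,
#   4. split temporally into train/test, drop rows lacking the first lag or the
#      target, and write the result to OUTPUT_FILE.
# The feature engineering assumes the track records are regularly spaced at
# 6-hourly intervals (hence the "* 6" and shift arithmetic further down).
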
def haversine_distance(lat1, lon1, lat2, lon2):
"""Calculate distance in km between two lat/lon points"""
R = 6371 # Earth radius in km
lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
dlat = lat2 - lat1
dlon = lon2 - lon1
a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
return 2 * R * np.arcsin(np.sqrt(a))
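
# Quick sanity check for the helper above (illustrative values only, not used by
# the pipeline): one degree of latitude spans roughly 111 km, so
#   haversine_distance(20.0, 86.0, 21.0, 86.0)  ->  approximately 111
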
def preprocess():
print("Loading data...")
df = pd.read_csv(INPUT_FILE)
print(f"Initial shape: {df.shape}")
# Drop rows with missing essential values
essential_cols = ["MAX_WIND", "MIN_PRESSURE", "LAT", "LON", "CYCLONE_NUMBER"]
df = df.dropna(subset=essential_cols)
print(f"After dropping missing essentials: {df.shape}")
# Convert DATE_TIME to datetime and sort BY CYCLONE THEN TIME
df["DATE_TIME"] = pd.to_datetime(df["DATE_TIME"])
df = df.sort_values(["CYCLONE_NUMBER", "DATE_TIME"])
df = df.reset_index(drop=True)
print(f"Unique cyclones: {df['CYCLONE_NUMBER'].nunique()}")

    # ========================================================================
    # TIME-BASED FEATURES
    # ========================================================================
    df["YEAR"] = df["DATE_TIME"].dt.year
    df["MONTH"] = df["DATE_TIME"].dt.month
    df["HOUR"] = df["DATE_TIME"].dt.hour
    df["DAY_OF_YEAR"] = df["DATE_TIME"].dt.dayofyear

    # Cyclical encodings of season and time of day (365-day year; leap days ignored)
    df["SIN_DOY"] = np.sin(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["COS_DOY"] = np.cos(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["SIN_HOUR"] = np.sin(2 * np.pi * df["HOUR"] / 24)
    df["COS_HOUR"] = np.cos(2 * np.pi * df["HOUR"] / 24)

    # ========================================================================
    # STORM-LEVEL FEATURES
    # ========================================================================
    # Age of storm in hours since its first observation (records are 6-hourly)
    df['STORM_AGE_HOURS'] = df.groupby('CYCLONE_NUMBER').cumcount() * 6

    # Total storm duration in hours (observation count x 6 h per record)
    df['STORM_DURATION_HOURS'] = df.groupby('CYCLONE_NUMBER')['DATE_TIME'].transform('count') * 6

    # ========================================================================
    # GEOGRAPHIC FEATURES
    # ========================================================================
    # Distance to Odisha coast (approximate center: 20°N, 86°E)
    ODISHA_LAT, ODISHA_LON = 20.0, 86.0
    df['DIST_TO_ODISHA_KM'] = haversine_distance(
        df['LAT'], df['LON'], ODISHA_LAT, ODISHA_LON
    )

    # Movement speed in km/h: distance travelled since the previous 6-hourly fix,
    # divided by 6. The first fix of each cyclone has no predecessor, so it is NaN.
    df['MOVEMENT_SPEED_KPH'] = df.groupby('CYCLONE_NUMBER').apply(
        lambda x: haversine_distance(
            x['LAT'].shift(1), x['LON'].shift(1), x['LAT'], x['LON']
        ) / 6
    ).reset_index(level=0, drop=True)

    # ========================================================================
    # LAG FEATURES (WITHIN EACH CYCLONE)
    # ========================================================================
    print("Creating lag features...")

    # Lag features for MAX_WIND (within each cyclone)
    for i in [1, 2, 3, 4]:
        df[f"WIND_t-{i*6}"] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].shift(i)

    # Lag features for MIN_PRESSURE (within each cyclone)
    for i in [1, 2, 3, 4]:
        df[f"PRESSURE_t-{i*6}"] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].shift(i)

    # ========================================================================
    # INTENSITY CHANGE FEATURES
    # ========================================================================
    # Rate of intensity change
    df['WIND_CHANGE_6H'] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].diff()
    df['PRESSURE_CHANGE_6H'] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].diff()
    df['INTENSIFICATION_RATE'] = df['WIND_CHANGE_6H'] / 6  # knots per hour

    # 12-hour change
    df['WIND_CHANGE_12H'] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].diff(2)
    df['PRESSURE_CHANGE_12H'] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].diff(2)

    # ========================================================================
    # TARGET VARIABLE (24-HOUR AHEAD, WITHIN CYCLONE)
    # ========================================================================
    # MAX_WIND four 6-hourly steps (24 h) ahead within the same cyclone
    df["TARGET_24H"] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].shift(-4)

    # ========================================================================
    # CATEGORICAL FEATURES
    # ========================================================================
    # One-hot encode STORM_TYPE
    storm_dummies = pd.get_dummies(df["STORM_TYPE"], prefix="STORM")
    df = pd.concat([df, storm_dummies], axis=1)

    # ========================================================================
    # WIND RADII FEATURES
    # ========================================================================
    # Fill NaN with 0 (0 means no wind at that radius). The unlabelled RAD_*
    # quadrant columns are treated as the 34-kt radii, matching AVG_RAD34 below.
    radius_features = [
        "RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW",
        "RAD50_NE", "RAD50_SE", "RAD50_SW", "RAD50_NW",
        "RAD64_NE", "RAD64_SE", "RAD64_SW", "RAD64_NW"
    ]
    for col in radius_features:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Average radius at each wind threshold
    df['AVG_RAD34'] = df[['RAD_NE', 'RAD_SE', 'RAD_SW', 'RAD_NW']].mean(axis=1)
    df['AVG_RAD50'] = df[['RAD50_NE', 'RAD50_SE', 'RAD50_SW', 'RAD50_NW']].mean(axis=1)
    df['AVG_RAD64'] = df[['RAD64_NE', 'RAD64_SE', 'RAD64_SW', 'RAD64_NW']].mean(axis=1)

    # Storm size (maximum extent)
    df['STORM_SIZE'] = df[['RAD_NE', 'RAD_SE', 'RAD_SW', 'RAD_NW']].max(axis=1)

    # ========================================================================
    # TRAIN/TEST SPLIT
    # ========================================================================
    # Use a temporal split: train on earlier years, test on recent ones.
    # Adjust the cutoff year as needed.
    TRAIN_END_YEAR = 2020
    df['SPLIT'] = 'train'
    df.loc[df['YEAR'] > TRAIN_END_YEAR, 'SPLIT'] = 'test'
    print("\nSplit distribution:")
    print(df['SPLIT'].value_counts())

    # ========================================================================
    # DROP ROWS WITH MISSING CRITICAL VALUES
    # ========================================================================
    # Only drop if lag features or target are missing
    # (this happens at the start/end of each cyclone track)
    critical_for_training = ["WIND_t-6", "PRESSURE_t-6", "TARGET_24H"]
    print(f"\nBefore dropping NaN lags/target: {df.shape}")
    df = df.dropna(subset=critical_for_training)
    print(f"After dropping NaN lags/target: {df.shape}")

    # ========================================================================
    # SAVE OUTPUT
    # ========================================================================
    # Ensure output folder exists
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    # Save preprocessed CSV
    df.to_csv(OUTPUT_FILE, index=False)

    print("\n" + "="*60)
    print("✅ PREPROCESSING COMPLETE")
    print("="*60)
    print(f"Output file: {OUTPUT_FILE}")
    print(f"Final shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print("\nFeature categories:")
    print(" - Time features: YEAR, MONTH, HOUR, DAY_OF_YEAR, SIN/COS encodings")
    print(" - Storm features: AGE, DURATION, SIZE")
    print(" - Geographic: LAT, LON, DIST_TO_ODISHA, MOVEMENT_SPEED")
    print(" - Lag features: WIND_t-6/12/18/24, PRESSURE_t-6/12/18/24")
    print(" - Change features: WIND_CHANGE, INTENSIFICATION_RATE")
    print(f" - Wind radii: {len(radius_features)} radius features + averages")
    print(" - Target: TARGET_24H (wind speed 24 hours ahead)")
    print(f"\nTrain/Test split: {(df['SPLIT']=='train').sum()} train, {(df['SPLIT']=='test').sum()} test")
if __name__ == "__main__":
    preprocess()
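
# Typical invocation (run from the project root so the relative "data/" paths
# resolve; the script filename below is illustrative):
#   python preprocess.py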