# cyclone-pred-api/src/preprocess.py

import pandas as pd
import numpy as np
import os
INPUT_FILE = "data/cyclone_tracks_merged_full.csv"
OUTPUT_FILE = "data/preprocessed.csv"


def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate distance in km between two lat/lon points"""
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    return 2 * R * np.arcsin(np.sqrt(a))


def preprocess():
    print("Loading data...")
    df = pd.read_csv(INPUT_FILE)
    print(f"Initial shape: {df.shape}")

    # Drop rows with missing essential values
    essential_cols = ["MAX_WIND", "MIN_PRESSURE", "LAT", "LON", "CYCLONE_NUMBER"]
    df = df.dropna(subset=essential_cols)
    print(f"After dropping missing essentials: {df.shape}")

    # Convert DATE_TIME to datetime and sort by cyclone, then by time
    df["DATE_TIME"] = pd.to_datetime(df["DATE_TIME"])
    df = df.sort_values(["CYCLONE_NUMBER", "DATE_TIME"])
    df = df.reset_index(drop=True)
    print(f"Unique cyclones: {df['CYCLONE_NUMBER'].nunique()}")

    # ========================================================================
    # TIME-BASED FEATURES
    # ========================================================================
    df["YEAR"] = df["DATE_TIME"].dt.year
    df["MONTH"] = df["DATE_TIME"].dt.month
    df["HOUR"] = df["DATE_TIME"].dt.hour
    df["DAY_OF_YEAR"] = df["DATE_TIME"].dt.dayofyear

    # Seasonal encoding (cyclical)
    df["SIN_DOY"] = np.sin(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["COS_DOY"] = np.cos(2 * np.pi * df["DAY_OF_YEAR"] / 365)
    df["SIN_HOUR"] = np.sin(2 * np.pi * df["HOUR"] / 24)
    df["COS_HOUR"] = np.cos(2 * np.pi * df["HOUR"] / 24)

    # ========================================================================
    # STORM-LEVEL FEATURES
    # ========================================================================
    # Note: both features below assume fixed 6-hourly observations per track.
    # Age of storm (hours since first observation)
    df['STORM_AGE_HOURS'] = df.groupby('CYCLONE_NUMBER').cumcount() * 6
    # Total storm duration
    df['STORM_DURATION_HOURS'] = df.groupby('CYCLONE_NUMBER')['DATE_TIME'].transform('count') * 6

    # ========================================================================
    # GEOGRAPHIC FEATURES
    # ========================================================================
    # Distance to Odisha coast (approximate center: 20°N, 86°E)
    ODISHA_LAT, ODISHA_LON = 20.0, 86.0
    df['DIST_TO_ODISHA_KM'] = haversine_distance(
        df['LAT'], df['LON'], ODISHA_LAT, ODISHA_LON
    )

    # Movement speed in km/h (distance covered since the previous 6-hourly fix
    # of the same cyclone, divided by the 6-hour interval)
    prev_lat = df.groupby('CYCLONE_NUMBER')['LAT'].shift(1)
    prev_lon = df.groupby('CYCLONE_NUMBER')['LON'].shift(1)
    df['MOVEMENT_SPEED_KPH'] = haversine_distance(prev_lat, prev_lon, df['LAT'], df['LON']) / 6
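    # MOVEMENT_SPEED_KPH is NaN at the first fix of every track (no previous
    # position). Those rows also lack WIND_t-6, so they are removed by the
    # lag/target dropna near the end of this function.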

    # ========================================================================
    # LAG FEATURES (WITHIN EACH CYCLONE)
    # ========================================================================
    print("Creating lag features...")

    # Lag features for MAX_WIND (within each cyclone)
    for i in [1, 2, 3, 4]:
        df[f"WIND_t-{i*6}"] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].shift(i)

    # Lag features for MIN_PRESSURE (within each cyclone)
    for i in [1, 2, 3, 4]:
        df[f"PRESSURE_t-{i*6}"] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].shift(i)

    # ========================================================================
    # INTENSITY CHANGE FEATURES
    # ========================================================================
    # Rate of intensity change
    df['WIND_CHANGE_6H'] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].diff()
    df['PRESSURE_CHANGE_6H'] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].diff()
    df['INTENSIFICATION_RATE'] = df['WIND_CHANGE_6H'] / 6  # knots per hour

    # 12-hour change
    df['WIND_CHANGE_12H'] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].diff(2)
    df['PRESSURE_CHANGE_12H'] = df.groupby('CYCLONE_NUMBER')['MIN_PRESSURE'].diff(2)

    # ========================================================================
    # TARGET VARIABLE (24-HOUR AHEAD, WITHIN CYCLONE)
    # ========================================================================
    df["TARGET_24H"] = df.groupby('CYCLONE_NUMBER')['MAX_WIND'].shift(-4)

    # ========================================================================
    # CATEGORICAL FEATURES
    # ========================================================================
    # One-hot encode STORM_TYPE
    storm_dummies = pd.get_dummies(df["STORM_TYPE"], prefix="STORM")
    df = pd.concat([df, storm_dummies], axis=1)

    # ========================================================================
    # WIND RADII FEATURES
    # ========================================================================
    # Fill NaN with 0 (no wind of that threshold reported in that quadrant)
    radius_features = [
        "RAD_NE", "RAD_SE", "RAD_SW", "RAD_NW",
        "RAD50_NE", "RAD50_SE", "RAD50_SW", "RAD50_NW",
        "RAD64_NE", "RAD64_SE", "RAD64_SW", "RAD64_NW"
    ]
    for col in radius_features:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Average radius at each wind threshold
    df['AVG_RAD34'] = df[['RAD_NE', 'RAD_SE', 'RAD_SW', 'RAD_NW']].mean(axis=1)
    df['AVG_RAD50'] = df[['RAD50_NE', 'RAD50_SE', 'RAD50_SW', 'RAD50_NW']].mean(axis=1)
    df['AVG_RAD64'] = df[['RAD64_NE', 'RAD64_SE', 'RAD64_SW', 'RAD64_NW']].mean(axis=1)

    # Storm size (maximum extent of the 34-kt wind radii)
    df['STORM_SIZE'] = df[['RAD_NE', 'RAD_SE', 'RAD_SW', 'RAD_NW']].max(axis=1)
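    # Note: the fill loop above skips radius columns that are absent, but the
    # averages and STORM_SIZE index them directly and will raise a KeyError if
    # any listed column is missing from the merged input file.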

    # ========================================================================
    # TRAIN/TEST SPLIT
    # ========================================================================
    # Use a temporal split: train on earlier years, test on recent ones.
    # Adjust the cutoff year as needed.
    TRAIN_END_YEAR = 2020
    df['SPLIT'] = 'train'
    df.loc[df['YEAR'] > TRAIN_END_YEAR, 'SPLIT'] = 'test'

    print("\nSplit distribution:")
    print(df['SPLIT'].value_counts())
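    # A year-based split keeps the 6-hourly fixes of a given cyclone out of
    # train and test at the same time (except for the rare storm that straddles
    # a year boundary), which a random row-level split would not guarantee.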

    # ========================================================================
    # DROP ROWS WITH MISSING CRITICAL VALUES
    # ========================================================================
    # Only drop if lag features or target are missing
    # (This happens at start/end of each cyclone track)
    critical_for_training = ["WIND_t-6", "PRESSURE_t-6", "TARGET_24H"]
    print(f"\nBefore dropping NaN lags/target: {df.shape}")
    df = df.dropna(subset=critical_for_training)
    print(f"After dropping NaN lags/target: {df.shape}")

    # ========================================================================
    # SAVE OUTPUT
    # ========================================================================
    # Ensure output folder exists
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    # Save preprocessed CSV
    df.to_csv(OUTPUT_FILE, index=False)

    print("\n" + "="*60)
    print("✅ PREPROCESSING COMPLETE")
    print("="*60)
    print(f"Output file: {OUTPUT_FILE}")
    print(f"Final shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print("\nFeature categories:")
    print(" - Time features: YEAR, MONTH, HOUR, DAY_OF_YEAR, SIN/COS encodings")
    print(" - Storm features: AGE, DURATION, SIZE")
    print(" - Geographic: LAT, LON, DIST_TO_ODISHA, MOVEMENT_SPEED")
    print(" - Lag features: WIND_t-6/12/18/24, PRESSURE_t-6/12/18/24")
    print(" - Change features: WIND_CHANGE, INTENSIFICATION_RATE")
    print(f" - Wind radii: {len(radius_features)} radius features + averages")
    print(" - Target: TARGET_24H (wind speed 24 hours ahead)")
    print(f"\nTrain/Test split: {(df['SPLIT'] == 'train').sum()} train, {(df['SPLIT'] == 'test').sum()} test")
if __name__ == "__main__":
    preprocess()