# Source: wildfirez/scripts/04_feature_engineering.py
# Uploaded by: zakaneki
# Commit: 64e892b ("first commit", verified)
"""
Script 04: Feature Engineering
This script creates additional features for the model:
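- Temporal features (month, day of week, weekend flag, season, fire-season flag)
- Geospatial features (lat/lon bins, K-Means clusters, quadratic and interaction
  terms, distance from the center of the continental US)
- Cyclical (sin/cos) encodings of month, day of year, and day of week
- Year features (normalized year, years since 1992)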
Usage:
python scripts/04_feature_engineering.py
"""
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from config.config import (
TRAIN_PARQUET,
TEST_PARQUET,
FEATURES_PARQUET,
PROCESSED_DATA_DIR,
TARGET_COLUMN,
N_GEO_CLUSTERS,
LAT_BINS,
LON_BINS,
RANDOM_STATE
)
def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test splits from disk.

    Returns:
        A ``(train_df, test_df)`` pair read from ``TRAIN_PARQUET`` and
        ``TEST_PARQUET`` respectively.
    """
    print("Loading data...")

    # Both files are read before anything is reported, train first.
    train_df, test_df = [
        pd.read_parquet(path) for path in (TRAIN_PARQUET, TEST_PARQUET)
    ]

    print(f" Train: {len(train_df):,} rows")
    print(f" Test: {len(test_df):,} rows")
    return train_df, test_df
def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create temporal features from DISCOVERY_DOY (and FIRE_YEAR if present).

    The discovery date is rebuilt as "January 1st of the fire year plus
    (day of year - 1) days". Using the row's own year matters for two
    features:

    * ``day_of_week`` / ``is_weekend`` depend on the actual calendar year;
      with a single fixed reference year they would only be a function of
      the day of year.
    * Day 366 is a valid date in leap years and must not be treated as
      invalid.

    When the frame has no ``FIRE_YEAR`` column (or a row's year is missing),
    the non-leap reference year 2001 is used instead.
    NOTE(review): this assumes FIRE_YEAR is the calendar year that
    DISCOVERY_DOY refers to - confirm against the dataset documentation.

    Rows whose date cannot be built (missing or out-of-range day of year,
    day 366 in a non-leap year, unparseable year) are reported and filled
    with the median valid day of year, placed in the reference year.

    Args:
        df: Frame with a ``DISCOVERY_DOY`` column and, optionally, a
            ``FIRE_YEAR`` column. It is not modified.

    Returns:
        A new frame with the added columns ``month``, ``day_of_week``
        (0=Monday, 6=Sunday), ``is_weekend``, ``season`` (1=Winter,
        2=Spring, 3=Summer, 4=Fall) and ``is_fire_season`` (June-October).

    Raises:
        ValueError: If some dates are invalid and there is no valid day of
            year to derive a fallback from.
    """
    print("\nCreating temporal features...")

    # Fallback year for frames/rows without a usable FIRE_YEAR (non-leap).
    reference_year = 2001

    doy = pd.to_numeric(df['DISCOVERY_DOY'], errors='coerce')
    # Mask impossible values up front so the timedelta arithmetic below
    # cannot overflow on garbage input.
    doy = doy.where(doy.between(1, 366))

    if 'FIRE_YEAR' in df.columns:
        years = pd.to_numeric(df['FIRE_YEAR'], errors='coerce').fillna(reference_year)
    else:
        years = pd.Series(reference_year, index=df.index)
    years = years.astype(int)

    jan_first = pd.to_datetime(
        years.astype(str) + '-01-01',
        format='%Y-%m-%d',
        errors='coerce'
    )
    dates = jan_first + pd.to_timedelta(doy - 1, unit='D')
    # Day 366 of a non-leap year rolls over into the next year: reject it.
    dates = dates.where(dates.dt.year == years)

    # Handle invalid dates
    invalid_dates = int(dates.isna().sum())
    if invalid_dates > 0:
        print(f" Warning: {invalid_dates} invalid day of year values")
        # Fill with the median of the days that did produce a valid date
        median_doy = doy[dates.notna()].median()
        if pd.isna(median_doy):
            raise ValueError(
                "DISCOVERY_DOY contains no valid day-of-year values to "
                "derive a fallback date from"
            )
        fallback = (
            pd.Timestamp(year=reference_year, month=1, day=1)
            + pd.Timedelta(days=int(median_doy) - 1)
        )
        dates = dates.fillna(fallback)

    month = dates.dt.month
    day_of_week = dates.dt.dayofweek  # 0=Monday, 6=Sunday

    # assign() returns a new frame, so the caller's frame is left untouched
    # and no temporary date column ever reaches it.
    result = df.assign(
        month=month,
        day_of_week=day_of_week,
        is_weekend=(day_of_week >= 5).astype(int),
        # Dec/Jan/Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
        season=((month % 12) // 3 + 1).astype(int),
        # Peak fire months: June-October
        is_fire_season=month.between(6, 10).astype(int),
    )

    print(" Created: month, day_of_week, is_weekend, season, is_fire_season")
    return result
def create_geospatial_features(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, KMeans]:
    """Create geospatial features from the LATITUDE/LONGITUDE columns.

    Both frames are modified in place and also returned. Added columns:

    * ``lat_bin`` / ``lon_bin``: index of the equal-width cell (``LAT_BINS``
      and ``LON_BINS`` cells) over a continental-US bounding box.
      Coordinates outside the box are clipped to it first, so such points
      land in the nearest edge bin.
    * ``geo_cluster``: K-Means label (``N_GEO_CLUSTERS`` clusters). The
      scaler and the clustering are fitted on the train coordinates only
      and then applied to the test coordinates.
    * ``lat_squared``, ``lon_squared``, ``lat_lon_interaction``: quadratic
      and interaction terms.
    * ``dist_from_center``: Euclidean distance, in degrees, from
      (39.8, -98.6).

    Args:
        train_df: Training frame with ``LATITUDE`` and ``LONGITUDE``.
        test_df: Test frame with the same two columns.

    Returns:
        ``(train_df, test_df, kmeans)`` where ``kmeans`` is the fitted
        clustering model.
    """
    print("\nCreating geospatial features...")
    frames = (train_df, test_df)

    # 1. Latitude/Longitude bins
    print(" Creating coordinate bins...")
    # Define bin edges based on continental US bounds
    lat_min, lat_max = 24.0, 50.0
    lon_min, lon_max = -125.0, -66.0
    lat_edges = np.linspace(lat_min, lat_max, LAT_BINS + 1)
    lon_edges = np.linspace(lon_min, lon_max, LON_BINS + 1)

    for df in frames:
        # Clip before cutting: a location outside the bounding box then
        # falls into the nearest edge bin instead of becoming NaN.
        df['lat_bin'] = pd.cut(
            df['LATITUDE'].clip(lat_min, lat_max),
            bins=lat_edges, labels=False, include_lowest=True
        )
        df['lon_bin'] = pd.cut(
            df['LONGITUDE'].clip(lon_min, lon_max),
            bins=lon_edges, labels=False, include_lowest=True
        )

    # After clipping, only missing coordinates can still yield NaN bins.
    # Fill those with the train median bin so no statistic is derived from
    # the test split, then make the columns integer.
    for bin_col in ('lat_bin', 'lon_bin'):
        fallback_bin = train_df[bin_col].median()
        for df in frames:
            df[bin_col] = df[bin_col].fillna(fallback_bin).astype(int)

    # 2. Geographic clustering using K-Means
    print(f" Fitting K-Means clustering (k={N_GEO_CLUSTERS})...")
    # Prepare coordinates for clustering
    train_coords = train_df[['LATITUDE', 'LONGITUDE']].values
    test_coords = test_df[['LATITUDE', 'LONGITUDE']].values

    # Scale coordinates (statistics come from the train split only)
    scaler = StandardScaler()
    train_coords_scaled = scaler.fit_transform(train_coords)
    test_coords_scaled = scaler.transform(test_coords)

    # Fit K-Means on train data, then label the test data with the same model
    kmeans = KMeans(n_clusters=N_GEO_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
    train_df['geo_cluster'] = kmeans.fit_predict(train_coords_scaled)
    test_df['geo_cluster'] = kmeans.predict(test_coords_scaled)

    print(f" Cluster distribution (train):")
    cluster_dist = train_df['geo_cluster'].value_counts().sort_index()
    for cluster, count in cluster_dist.items():
        pct = count / len(train_df) * 100
        if pct >= 3:  # Only show clusters with >= 3%
            print(f" Cluster {cluster}: {count:,} ({pct:.1f}%)")

    # 3. Coordinate interactions
    print(" Creating coordinate interactions...")
    # Approximate geographic center of the continental US: 39.8°N, 98.6°W
    center_lat, center_lon = 39.8, -98.6
    for df in frames:
        # Quadratic terms (captures non-linear patterns)
        df['lat_squared'] = df['LATITUDE'] ** 2
        df['lon_squared'] = df['LONGITUDE'] ** 2
        df['lat_lon_interaction'] = df['LATITUDE'] * df['LONGITUDE']
        # Distance from the center, measured in degrees
        df['dist_from_center'] = np.sqrt(
            (df['LATITUDE'] - center_lat) ** 2 +
            (df['LONGITUDE'] - center_lon) ** 2
        )

    print(" Created: lat_bin, lon_bin, geo_cluster, lat_squared, lon_squared, lat_lon_interaction, dist_from_center")
    return train_df, test_df, kmeans
def create_cyclical_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add sin/cos encodings for the periodic columns.

    Each source column is mapped onto a circle so that the end of a period
    sits next to its start (e.g. December next to January).

    Args:
        df: Frame with ``month``, ``DISCOVERY_DOY`` and ``day_of_week``
            columns. It is modified in place.

    Returns:
        The same frame with ``month_sin/cos``, ``doy_sin/cos`` and
        ``dow_sin/cos`` added.
    """
    print("\nCreating cyclical features...")

    # (output prefix, source column, period length)
    encodings = (
        ('month', 'month', 12),
        ('doy', 'DISCOVERY_DOY', 365),
        ('dow', 'day_of_week', 7),
    )
    for prefix, source_col, period in encodings:
        angle = 2 * np.pi * df[source_col] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    print(" Created: month_sin/cos, doy_sin/cos, dow_sin/cos")
    return df
def create_year_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add year-derived columns computed from FIRE_YEAR.

    Args:
        df: Frame with a ``FIRE_YEAR`` column. It is modified in place.

    Returns:
        The same frame with ``year_normalized`` (0 at 1992, 1 at 2015) and
        ``years_since_1992`` added.
    """
    print("\nCreating year features...")

    first_year, last_year = 1992, 2015
    # Offset from the first year; shared by both derived columns.
    elapsed = df['FIRE_YEAR'] - first_year

    df['year_normalized'] = elapsed / (last_year - first_year)
    df['years_since_1992'] = elapsed

    print(" Created: year_normalized, years_since_1992")
    return df
def get_feature_columns(df: pd.DataFrame) -> list:
    """Return the columns of *df* that are used as model features.

    Every column is kept, in its original order, except the target column
    and the raw categorical text columns listed below.
    """
    non_features = (
        TARGET_COLUMN,
        'NWCG_REPORTING_AGENCY',
        'STAT_CAUSE_DESCR',
        'STATE',
        'OWNER_DESCR',
        'COUNTY',  # If present
    )
    return [name for name in df.columns if name not in non_features]
def save_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    """Write the feature-engineered frames to parquet.

    The train and test files are overwritten in place (``TRAIN_PARQUET`` /
    ``TEST_PARQUET``); a row-wise concatenation of both is additionally
    written to ``FEATURES_PARQUET`` for reference.
    """
    print("\nSaving feature-engineered data...")

    # Both splits are written before either path is reported.
    for frame, destination in ((train_df, TRAIN_PARQUET), (test_df, TEST_PARQUET)):
        frame.to_parquet(destination, index=False)
    print(f" Train data: {TRAIN_PARQUET}")
    print(f" Test data: {TEST_PARQUET}")

    # Train rows first, then test rows, with a fresh index.
    pd.concat([train_df, test_df], ignore_index=True).to_parquet(
        FEATURES_PARQUET, index=False
    )
    print(f" Combined data: {FEATURES_PARQUET}")
def print_summary(train_df: pd.DataFrame) -> None:
    """Print the feature count and the features grouped by type.

    Args:
        train_df: Feature-engineered training frame; only its column names
            are inspected.
    """
    rule = "="*60
    print("\n" + rule)
    print("FEATURE ENGINEERING SUMMARY")
    print(rule)

    feature_cols = get_feature_columns(train_df)
    print(f"\nTotal features: {len(feature_cols)}")
    print("\nFeature list:")

    # Known feature names per group; a group lists only those present.
    temporal_names = (
        'month', 'day_of_week', 'is_weekend', 'season', 'is_fire_season',
        'month_sin', 'month_cos', 'doy_sin', 'doy_cos', 'dow_sin', 'dow_cos',
    )
    geospatial_names = (
        'lat_bin', 'lon_bin', 'geo_cluster', 'lat_squared', 'lon_squared',
        'lat_lon_interaction', 'dist_from_center', 'LATITUDE', 'LONGITUDE',
    )
    year_names = ('FIRE_YEAR', 'year_normalized', 'years_since_1992', 'DISCOVERY_DOY')

    def present(names):
        # Keep the order in which the features appear in the frame.
        return [col for col in feature_cols if col in names]

    temporal = present(temporal_names)
    geospatial = present(geospatial_names)
    year_feats = present(year_names)
    encoded = [col for col in feature_cols if col.endswith('_encoded')]

    print(f"\n Temporal ({len(temporal)}): {temporal}")
    print(f"\n Geospatial ({len(geospatial)}): {geospatial}")
    print(f"\n Year-based ({len(year_feats)}): {year_feats}")
    print(f"\n Encoded categorical ({len(encoded)}): {encoded}")
def main():
    """Run the feature engineering pipeline end to end.

    Steps, in order: load the splits, add temporal, geospatial, cyclical
    and year features (train before test at every per-frame step), save
    the results, and print a summary of the training features.
    """
    rule = "="*60
    print("\n" + rule)
    print("FEATURE ENGINEERING")
    print(rule)

    train_df, test_df = load_data()

    # Temporal features come first: the cyclical step reads the columns
    # they add.
    train_df, test_df = [
        create_temporal_features(frame) for frame in (train_df, test_df)
    ]

    # The fitted clustering model is returned as well but not used here.
    train_df, test_df, _kmeans = create_geospatial_features(train_df, test_df)

    train_df, test_df = [
        create_cyclical_features(frame) for frame in (train_df, test_df)
    ]
    train_df, test_df = [
        create_year_features(frame) for frame in (train_df, test_df)
    ]

    save_data(train_df, test_df)
    print_summary(train_df)

    print("\n" + rule)
    print("✓ Feature Engineering Complete!")
    print(rule + "\n")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()