|
|
"""
|
|
|
Script 04: Feature Engineering
|
|
|
|
|
|
This script creates additional features for the model:
|
|
|
- Temporal features (month, season, day of week)
|
|
|
- Geospatial features (lat/lon bins, clustering, interactions)
|
|
|
- Coordinate transformations
|
|
|
|
|
|
Usage:
|
|
|
python scripts/04_feature_engineering.py
|
|
|
"""
|
|
|
|
|
|
import sys
|
|
|
from pathlib import Path
|
|
|
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from sklearn.cluster import KMeans
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
|
|
|
|
project_root = Path(__file__).parent.parent
|
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
|
|
from config.config import (
|
|
|
TRAIN_PARQUET,
|
|
|
TEST_PARQUET,
|
|
|
FEATURES_PARQUET,
|
|
|
PROCESSED_DATA_DIR,
|
|
|
TARGET_COLUMN,
|
|
|
N_GEO_CLUSTERS,
|
|
|
LAT_BINS,
|
|
|
LON_BINS,
|
|
|
RANDOM_STATE
|
|
|
)
|
|
|
|
|
|
|
|
|
def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the pre-split train/test parquet files and report their sizes.

    Returns:
        (train_df, test_df) as loaded from TRAIN_PARQUET / TEST_PARQUET.
    """
    print("Loading data...")
    train_df, test_df = (pd.read_parquet(p) for p in (TRAIN_PARQUET, TEST_PARQUET))
    print(f" Train: {len(train_df):,} rows")
    print(f" Test: {len(test_df):,} rows")
    return train_df, test_df
|
|
|
|
|
|
|
|
|
def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
"""Create temporal features from DISCOVERY_DOY."""
|
|
|
print("\nCreating temporal features...")
|
|
|
|
|
|
|
|
|
|
|
|
reference_year = 2001
|
|
|
df['temp_date'] = pd.to_datetime(
|
|
|
df['DISCOVERY_DOY'].astype(int).astype(str) + f'-{reference_year}',
|
|
|
format='%j-%Y',
|
|
|
errors='coerce'
|
|
|
)
|
|
|
|
|
|
|
|
|
invalid_dates = df['temp_date'].isna().sum()
|
|
|
if invalid_dates > 0:
|
|
|
print(f" Warning: {invalid_dates} invalid day of year values")
|
|
|
|
|
|
median_doy = df['DISCOVERY_DOY'].median()
|
|
|
df.loc[df['temp_date'].isna(), 'temp_date'] = pd.to_datetime(
|
|
|
f'{int(median_doy)}-{reference_year}', format='%j-%Y'
|
|
|
)
|
|
|
|
|
|
|
|
|
df['month'] = df['temp_date'].dt.month
|
|
|
df['day_of_week'] = df['temp_date'].dt.dayofweek
|
|
|
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
|
|
|
|
|
|
|
|
|
df['season'] = df['month'].apply(lambda m:
|
|
|
1 if m in [12, 1, 2] else
|
|
|
2 if m in [3, 4, 5] else
|
|
|
3 if m in [6, 7, 8] else 4
|
|
|
)
|
|
|
|
|
|
|
|
|
df['is_fire_season'] = df['month'].isin([6, 7, 8, 9, 10]).astype(int)
|
|
|
|
|
|
|
|
|
df = df.drop(columns=['temp_date'])
|
|
|
|
|
|
print(" Created: month, day_of_week, is_weekend, season, is_fire_season")
|
|
|
|
|
|
return df
|
|
|
|
|
|
|
|
|
def create_geospatial_features(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, KMeans]:
    """Create geospatial features from coordinates.

    Adds lat/lon grid bins, a K-Means cluster id (fit on train, applied to
    test), polynomial coordinate interactions, and a radial distance feature.
    Both frames are mutated in place; the fitted KMeans model is returned.
    """
    print("\nCreating geospatial features...")

    print(" Creating coordinate bins...")

    # Fixed bounding box (presumably the contiguous US — confirm against the
    # dataset) so train and test share identical bin edges.
    lat_lo, lat_hi = 24.0, 50.0
    lon_lo, lon_hi = -125.0, -66.0

    lat_edges = np.linspace(lat_lo, lat_hi, LAT_BINS + 1)
    lon_edges = np.linspace(lon_lo, lon_hi, LON_BINS + 1)

    for frame in (train_df, test_df):
        frame['lat_bin'] = pd.cut(frame['LATITUDE'], bins=lat_edges, labels=False, include_lowest=True)
        frame['lon_bin'] = pd.cut(frame['LONGITUDE'], bins=lon_edges, labels=False, include_lowest=True)
        # Coordinates outside the box come back NaN from pd.cut; impute with
        # that frame's median bin before casting to int.
        frame['lat_bin'] = frame['lat_bin'].fillna(frame['lat_bin'].median()).astype(int)
        frame['lon_bin'] = frame['lon_bin'].fillna(frame['lon_bin'].median()).astype(int)

    print(f" Fitting K-Means clustering (k={N_GEO_CLUSTERS})...")

    # Scale coordinates on train only, then apply the same transform to test
    # to avoid leakage.
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_df[['LATITUDE', 'LONGITUDE']].values)
    scaled_test = scaler.transform(test_df[['LATITUDE', 'LONGITUDE']].values)

    kmeans = KMeans(n_clusters=N_GEO_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
    train_df['geo_cluster'] = kmeans.fit_predict(scaled_train)
    test_df['geo_cluster'] = kmeans.predict(scaled_test)

    print(" Cluster distribution (train):")
    for cluster_id, n_points in train_df['geo_cluster'].value_counts().sort_index().items():
        share = n_points / len(train_df) * 100
        # Only report clusters holding at least 3% of the training rows.
        if share >= 3:
            print(f" Cluster {cluster_id}: {n_points:,} ({share:.1f}%)")

    print(" Creating coordinate interactions...")

    for frame in (train_df, test_df):
        frame['lat_squared'] = frame['LATITUDE'] ** 2
        frame['lon_squared'] = frame['LONGITUDE'] ** 2
        frame['lat_lon_interaction'] = frame['LATITUDE'] * frame['LONGITUDE']

        # Euclidean distance in degrees from a fixed reference point
        # (presumably the approximate US centroid — confirm).
        ref_lat, ref_lon = 39.8, -98.6
        frame['dist_from_center'] = np.sqrt(
            (frame['LATITUDE'] - ref_lat) ** 2 +
            (frame['LONGITUDE'] - ref_lon) ** 2
        )

    print(" Created: lat_bin, lon_bin, geo_cluster, lat_squared, lon_squared, lat_lon_interaction, dist_from_center")

    return train_df, test_df, kmeans
|
|
|
|
|
|
|
|
|
def create_cyclical_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create cyclical encoding for periodic features.

    Projects month, day-of-year, and day-of-week onto the unit circle so the
    model sees that the ends of each cycle are adjacent (e.g. Dec next to Jan).
    Mutates *df* in place and also returns it.
    """
    print("\nCreating cyclical features...")

    # (output prefix, source column, cycle length)
    cycles = (
        ('month', 'month', 12),
        ('doy', 'DISCOVERY_DOY', 365),
        ('dow', 'day_of_week', 7),
    )
    for prefix, column, period in cycles:
        angle = 2 * np.pi * df[column] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    print(" Created: month_sin/cos, doy_sin/cos, dow_sin/cos")

    return df
|
|
|
|
|
|
|
|
|
def create_year_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create year-based features.

    Adds year_normalized (0..1 over the fixed 1992-2015 span) and
    years_since_1992. The span is deliberately hard-coded so train and test
    are normalized on the same scale. Mutates *df* in place and returns it.
    """
    print("\nCreating year features...")

    min_year, max_year = 1992, 2015
    years_elapsed = df['FIRE_YEAR'] - min_year
    df['year_normalized'] = years_elapsed / (max_year - min_year)
    df['years_since_1992'] = years_elapsed

    print(" Created: year_normalized, years_since_1992")

    return df
|
|
|
|
|
|
|
|
|
def get_feature_columns(df: pd.DataFrame) -> list:
    """Get list of feature columns for modeling.

    Returns every column of *df* except the target and the raw (un-encoded)
    categorical columns, preserving the frame's column order.
    """
    excluded = {
        TARGET_COLUMN,
        'NWCG_REPORTING_AGENCY', 'STAT_CAUSE_DESCR', 'STATE', 'OWNER_DESCR',
        'COUNTY',
    }
    return [column for column in df.columns if column not in excluded]
|
|
|
|
|
|
|
|
|
def save_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    """Save feature-engineered data.

    Writes each split back to its own parquet path (overwriting the input
    files) plus a combined train+test parquet at FEATURES_PARQUET.
    """
    print("\nSaving feature-engineered data...")

    train_df.to_parquet(TRAIN_PARQUET, index=False)
    test_df.to_parquet(TEST_PARQUET, index=False)
    print(f" Train data: {TRAIN_PARQUET}")
    print(f" Test data: {TEST_PARQUET}")

    # Single file with train stacked above test, index reset.
    pd.concat([train_df, test_df], ignore_index=True).to_parquet(
        FEATURES_PARQUET, index=False
    )
    print(f" Combined data: {FEATURES_PARQUET}")
|
def print_summary(train_df: pd.DataFrame) -> None:
    """Print feature engineering summary.

    Groups the modeling columns into temporal / geospatial / year-based /
    encoded-categorical buckets and prints each group.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("FEATURE ENGINEERING SUMMARY")
    print(banner)

    feature_cols = get_feature_columns(train_df)

    print(f"\nTotal features: {len(feature_cols)}")
    print("\nFeature list:")

    # Known members of each group; filtering feature_cols keeps frame order.
    temporal_names = {
        'month', 'day_of_week', 'is_weekend', 'season', 'is_fire_season',
        'month_sin', 'month_cos', 'doy_sin', 'doy_cos', 'dow_sin', 'dow_cos',
    }
    geospatial_names = {
        'lat_bin', 'lon_bin', 'geo_cluster', 'lat_squared', 'lon_squared',
        'lat_lon_interaction', 'dist_from_center', 'LATITUDE', 'LONGITUDE',
    }
    year_names = {'FIRE_YEAR', 'year_normalized', 'years_since_1992', 'DISCOVERY_DOY'}

    temporal = [c for c in feature_cols if c in temporal_names]
    geospatial = [c for c in feature_cols if c in geospatial_names]
    year_feats = [c for c in feature_cols if c in year_names]
    encoded = [c for c in feature_cols if c.endswith('_encoded')]

    print(f"\n Temporal ({len(temporal)}): {temporal}")
    print(f"\n Geospatial ({len(geospatial)}): {geospatial}")
    print(f"\n Year-based ({len(year_feats)}): {year_feats}")
    print(f"\n Encoded categorical ({len(encoded)}): {encoded}")
|
|
def main():
    """Main feature engineering pipeline.

    Loads the pre-split data, applies each feature group in sequence, saves
    the enriched frames, and prints a summary.
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING")
    print("="*60)

    train_df, test_df = load_data()

    # Row-wise temporal features: each split processed independently.
    train_df = create_temporal_features(train_df)
    test_df = create_temporal_features(test_df)

    # Geospatial features fit a scaler/KMeans on train and apply to test, so
    # both frames go through one call. The fitted model is not persisted
    # here, hence the throwaway binding (previously an unused local).
    train_df, test_df, _kmeans = create_geospatial_features(train_df, test_df)

    train_df = create_cyclical_features(train_df)
    test_df = create_cyclical_features(test_df)

    train_df = create_year_features(train_df)
    test_df = create_year_features(test_df)

    save_data(train_df, test_df)

    print_summary(train_df)

    print("\n" + "="*60)
    print("✓ Feature Engineering Complete!")
    print("="*60 + "\n")
|
|
|
|
|
|
|
|
|
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|
|