File size: 10,489 Bytes

64e892b

"""

Script 04: Feature Engineering



This script creates additional features for the model:

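- Temporal features (month, day of week, weekend flag, season, fire-season flag)

- Geospatial features (lat/lon bins, K-Means clustering, interactions)

- Coordinate transformations (squared terms, lat*lon interaction, distance from the US center)

- Cyclical sin/cos encodings (month, day of year, day of week)

- Year features (normalized year, years since 1992)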



Usage:

    python scripts/04_feature_engineering.py

"""

import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Put the directory two levels above this file at the front of sys.path so
# that the `config` package imported below resolves when the script is run
# directly (e.g. `python scripts/04_feature_engineering.py`).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from config.config import (
    TRAIN_PARQUET,
    TEST_PARQUET,
    FEATURES_PARQUET,
    PROCESSED_DATA_DIR,
    TARGET_COLUMN,
    N_GEO_CLUSTERS,
    LAT_BINS,
    LON_BINS,
    RANDOM_STATE
)


def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test splits from their parquet files.

    Returns:
        A ``(train_df, test_df)`` pair read from ``TRAIN_PARQUET`` and
        ``TEST_PARQUET`` respectively.
    """
    print("Loading data...")

    # Read both splits first, then report their sizes.
    splits = {
        "Train": pd.read_parquet(TRAIN_PARQUET),
        "Test": pd.read_parquet(TEST_PARQUET),
    }
    for label, frame in splits.items():
        print(f"  {label}: {len(frame):,} rows")

    return splits["Train"], splits["Test"]


def _dates_from_year_and_doy(years: pd.Series, days: pd.Series) -> pd.Series:
    """Combine calendar years and day-of-year numbers into timestamps.

    Args:
        years: Integer calendar years.
        days: Day-of-year numbers (1-based); NaN marks an unusable entry.

    Returns:
        A datetime Series on the same index. Entries are NaT when the day is
        NaN or does not exist in that year (e.g. day 366 of a non-leap year).
    """
    year_start = pd.to_datetime(years.astype(str), format='%Y', errors='coerce')
    dates = year_start + pd.to_timedelta(days - 1, unit='D')
    # A day number past the end of its year rolls over into the next year;
    # treat that as invalid instead of silently shifting the date.
    return dates.where(dates.dt.year == years)


def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create temporal features from DISCOVERY_DOY.

    Adds ``month``, ``day_of_week`` (0=Monday, 6=Sunday), ``is_weekend``,
    ``season`` (1=Winter, 2=Spring, 3=Summer, 4=Fall) and ``is_fire_season``
    (June-October).

    The date is built from the row's own ``FIRE_YEAR`` when that column is
    present, so the weekday is the real one and day 366 of a leap year maps
    to December 31. Frames without ``FIRE_YEAR`` (and rows whose year is
    missing or unusable) fall back to the non-leap reference year 2001.

    Rows whose day of year is missing, non-numeric, outside 1-366, or not a
    real day of their year are reported and replaced by the median of the
    valid days.

    Args:
        df: Frame with a ``DISCOVERY_DOY`` column and, optionally,
            ``FIRE_YEAR``. It is not modified.

    Returns:
        A copy of ``df`` with the five feature columns appended.

    Raises:
        KeyError: If ``DISCOVERY_DOY`` is missing.
        ValueError: If rows need imputing but no row has a valid day of year.
    """
    print("\nCreating temporal features...")

    # Non-leap fallback year for frames/rows without a usable FIRE_YEAR
    reference_year = 2001

    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Coerce instead of casting: astype(int) would raise on missing values.
    # Truncation keeps the integer-cast semantics for fractional inputs.
    doy = pd.to_numeric(df['DISCOVERY_DOY'], errors='coerce').astype('float64')
    doy = np.trunc(doy)
    doy = doy.where(doy.between(1, 366))

    if 'FIRE_YEAR' in df.columns:
        year = pd.to_numeric(df['FIRE_YEAR'], errors='coerce').astype('float64')
        # Keep only years that fit pandas' nanosecond timestamp range
        year = year.where(year.between(1678, 2261), reference_year)
    else:
        year = pd.Series(float(reference_year), index=df.index)
    year = year.astype('int64')

    dates = _dates_from_year_and_doy(year, doy)

    # Handle invalid dates
    invalid = dates.isna()
    invalid_dates = int(invalid.sum())
    if invalid_dates > 0:
        print(f"  Warning: {invalid_dates} invalid day of year values")
        usable_days = doy[~invalid]
        if usable_days.empty:
            raise ValueError(
                "DISCOVERY_DOY has no valid values to impute invalid days from"
            )
        # Fill with median day; cap at 365 so it exists in every year
        median_doy = min(int(usable_days.median()), 365)
        dates = _dates_from_year_and_doy(
            year, doy.where(~invalid, float(median_doy))
        )

    # Extract features
    df['month'] = dates.dt.month
    df['day_of_week'] = dates.dt.dayofweek  # 0=Monday, 6=Sunday
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Season (1=Winter, 2=Spring, 3=Summer, 4=Fall):
    # Dec/Jan/Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
    df['season'] = (((df['month'] % 12) // 3) + 1).astype('int64')

    # Fire season indicator (peak fire months: June-October)
    df['is_fire_season'] = df['month'].isin([6, 7, 8, 9, 10]).astype(int)

    print("  Created: month, day_of_week, is_weekend, season, is_fire_season")

    return df


def _nearest_bin(values: pd.Series, edges: np.ndarray) -> pd.Series:
    """Map coordinates to integer bin indices, snapping outliers to the edge bins.

    Args:
        values: Coordinate values (latitude or longitude).
        edges: Monotonically increasing bin edges.

    Returns:
        An integer Series of bin indices in ``0 .. len(edges) - 2``.
    """
    # Clip first so a coordinate outside the edge range lands in the nearest
    # (first or last) bin instead of becoming NaN.
    clipped = values.clip(lower=edges[0], upper=edges[-1])
    bins = pd.cut(clipped, bins=edges, labels=False, include_lowest=True)
    # Only missing coordinates are still unbinned here; they keep the
    # median-bin fallback.
    return bins.fillna(bins.median()).astype(int)


def create_geospatial_features(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, KMeans]:
    """Create geospatial features from coordinates.

    Adds, to both frames in place:

    - ``lat_bin`` / ``lon_bin``: equal-width bins over 24..50 degrees latitude
      and -125..-66 degrees longitude (``LAT_BINS`` / ``LON_BINS`` bins).
      Coordinates outside that box are assigned to the nearest edge bin.
    - ``geo_cluster``: K-Means cluster id on standardized coordinates. The
      scaler and the clustering are fitted on the train frame only and then
      applied to the test frame.
    - ``lat_squared``, ``lon_squared``, ``lat_lon_interaction``: quadratic terms.
    - ``dist_from_center``: Euclidean distance, in degrees, from (39.8, -98.6).

    Args:
        train_df: Training frame with ``LATITUDE`` and ``LONGITUDE`` columns.
        test_df: Test frame with the same columns.

    Returns:
        ``(train_df, test_df, kmeans)``: the two input frames (modified in
        place) and the fitted K-Means model.
    """
    print("\nCreating geospatial features...")

    # 1. Latitude/Longitude bins
    print("  Creating coordinate bins...")

    # Define bin edges based on continental US bounds
    lat_min, lat_max = 24.0, 50.0
    lon_min, lon_max = -125.0, -66.0

    lat_edges = np.linspace(lat_min, lat_max, LAT_BINS + 1)
    lon_edges = np.linspace(lon_min, lon_max, LON_BINS + 1)

    for df in [train_df, test_df]:
        # Locations outside the continental US box go to the nearest bin
        df['lat_bin'] = _nearest_bin(df['LATITUDE'], lat_edges)
        df['lon_bin'] = _nearest_bin(df['LONGITUDE'], lon_edges)

    # 2. Geographic clustering using K-Means
    print(f"  Fitting K-Means clustering (k={N_GEO_CLUSTERS})...")

    # Prepare coordinates for clustering
    train_coords = train_df[['LATITUDE', 'LONGITUDE']].values
    test_coords = test_df[['LATITUDE', 'LONGITUDE']].values

    # Scale coordinates using statistics from the train split only
    scaler = StandardScaler()
    train_coords_scaled = scaler.fit_transform(train_coords)
    test_coords_scaled = scaler.transform(test_coords)

    # Fit K-Means on train data, then label the test data with the same model
    kmeans = KMeans(n_clusters=N_GEO_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
    train_df['geo_cluster'] = kmeans.fit_predict(train_coords_scaled)
    test_df['geo_cluster'] = kmeans.predict(test_coords_scaled)

    print("  Cluster distribution (train):")
    cluster_dist = train_df['geo_cluster'].value_counts().sort_index()
    for cluster, count in cluster_dist.items():
        pct = count / len(train_df) * 100
        if pct >= 3:  # Only show clusters with >= 3%
            print(f"    Cluster {cluster}: {count:,} ({pct:.1f}%)")

    # 3. Coordinate interactions
    print("  Creating coordinate interactions...")

    # Approximate geographic center of the continental US: 39.8°N, 98.6°W
    center_lat, center_lon = 39.8, -98.6

    for df in [train_df, test_df]:
        # Quadratic terms (captures non-linear patterns)
        df['lat_squared'] = df['LATITUDE'] ** 2
        df['lon_squared'] = df['LONGITUDE'] ** 2
        df['lat_lon_interaction'] = df['LATITUDE'] * df['LONGITUDE']

        # Straight-line distance in degrees from the center point
        df['dist_from_center'] = np.sqrt(
            (df['LATITUDE'] - center_lat) ** 2 +
            (df['LONGITUDE'] - center_lon) ** 2
        )

    print("  Created: lat_bin, lon_bin, geo_cluster, lat_squared, lon_squared, lat_lon_interaction, dist_from_center")

    return train_df, test_df, kmeans


def create_cyclical_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add sine/cosine encodings of the periodic columns.

    Each source column is mapped to an angle ``2*pi*value/period`` and stored
    as a ``<prefix>_sin`` / ``<prefix>_cos`` pair, so values at either end of
    a cycle (e.g. December and January) end up close together.

    Args:
        df: Frame with ``month``, ``DISCOVERY_DOY`` and ``day_of_week``
            columns. It is modified in place.

    Returns:
        The same frame, with six new columns.
    """
    print("\nCreating cyclical features...")

    full_turn = 2 * np.pi
    # (output prefix, source column, length of one cycle)
    encodings = (
        ('month', 'month', 12),
        ('doy', 'DISCOVERY_DOY', 365),
        ('dow', 'day_of_week', 7),
    )
    for prefix, source, period in encodings:
        angle = full_turn * df[source] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    print("  Created: month_sin/cos, doy_sin/cos, dow_sin/cos")

    return df


def create_year_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add features derived from ``FIRE_YEAR``.

    Args:
        df: Frame with a ``FIRE_YEAR`` column. It is modified in place.

    Returns:
        The same frame with ``year_normalized`` (1992 maps to 0, 2015 to 1)
        and ``years_since_1992`` added.
    """
    print("\nCreating year features...")

    first_year, last_year = 1992, 2015
    span = last_year - first_year

    # Both features are views of the same offset from the first year.
    elapsed = df['FIRE_YEAR'] - first_year
    df['year_normalized'] = elapsed / span
    df['years_since_1992'] = elapsed

    print("  Created: year_normalized, years_since_1992")

    return df


def get_feature_columns(df: pd.DataFrame) -> list:
    """Return the columns of ``df`` that are usable as model features.

    The target column and the raw categorical text columns are left out;
    every other column is kept, in its original order.
    """
    non_features = (
        TARGET_COLUMN,
        'NWCG_REPORTING_AGENCY',
        'STAT_CAUSE_DESCR',
        'STATE',
        'OWNER_DESCR',
        'COUNTY',  # If present
    )

    selected = []
    for name in df.columns:
        if name in non_features:
            continue
        selected.append(name)
    return selected


def save_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    """Write the feature-engineered frames to parquet.

    The train and test files at ``TRAIN_PARQUET`` / ``TEST_PARQUET`` are
    overwritten with the enriched frames, and a row-wise concatenation of
    both is written to ``FEATURES_PARQUET``.
    """
    print("\nSaving feature-engineered data...")

    outputs = (
        ("Train", train_df, TRAIN_PARQUET),
        ("Test", test_df, TEST_PARQUET),
    )

    # Write both splits before reporting either path.
    for _, frame, path in outputs:
        frame.to_parquet(path, index=False)
    for label, _, path in outputs:
        print(f"  {label} data: {path}")

    # Also keep a single combined file for reference
    merged = pd.concat([train_df, test_df], ignore_index=True)
    merged.to_parquet(FEATURES_PARQUET, index=False)
    print(f"  Combined data: {FEATURES_PARQUET}")


def print_summary(train_df: pd.DataFrame) -> None:
    """Print the number of features and list them grouped by type.

    Args:
        train_df: Feature-engineered training frame; its columns are
            filtered through ``get_feature_columns``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("FEATURE ENGINEERING SUMMARY")
    print(rule)

    feature_cols = get_feature_columns(train_df)

    print(f"\nTotal features: {len(feature_cols)}")
    print("\nFeature list:")

    # Known feature names per group; only those present are reported.
    known_groups = (
        ("Temporal", ['month', 'day_of_week', 'is_weekend', 'season', 'is_fire_season',
                      'month_sin', 'month_cos', 'doy_sin', 'doy_cos', 'dow_sin', 'dow_cos']),
        ("Geospatial", ['lat_bin', 'lon_bin', 'geo_cluster', 'lat_squared', 'lon_squared',
                        'lat_lon_interaction', 'dist_from_center', 'LATITUDE', 'LONGITUDE']),
        ("Year-based", ['FIRE_YEAR', 'year_normalized', 'years_since_1992', 'DISCOVERY_DOY']),
    )

    # Build every group before printing anything.
    report = []
    for title, members in known_groups:
        present = [name for name in feature_cols if name in members]
        report.append((title, present))
    encoded = [name for name in feature_cols if name.endswith('_encoded')]
    report.append(("Encoded categorical", encoded))

    for title, present in report:
        print(f"\n  {title} ({len(present)}): {present}")


def main():
    """Run the feature engineering pipeline end to end.

    Loads the train/test splits, adds temporal, geospatial, cyclical and
    year features to both, saves the results and prints a summary.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("FEATURE ENGINEERING")
    print(rule)

    train_df, test_df = load_data()

    # Temporal features come first: the cyclical step reads their columns.
    train_df, test_df = [
        create_temporal_features(split) for split in (train_df, test_df)
    ]

    # Geospatial features need both splits at once (fit on train, apply to test).
    train_df, test_df, _kmeans = create_geospatial_features(train_df, test_df)

    # Remaining per-split steps, each applied to train and then test.
    for step in (create_cyclical_features, create_year_features):
        train_df, test_df = step(train_df), step(test_df)

    save_data(train_df, test_df)
    print_summary(train_df)

    print("\n" + rule)
    print("✓ Feature Engineering Complete!")
    print(rule + "\n")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()