File size: 11,616 Bytes

64e892b

"""

Script 07: Prediction Pipeline



This script provides inference capabilities:

- Load trained model

- Preprocess new data

- Generate predictions with probabilities

- Can be used as a module or standalone script



Usage:

    # Single prediction

    python scripts/07_predict.py --lat 34.05 --lon -118.24 --state CA --cause "Debris Burning" --doy 200

    

    # Batch prediction from CSV

    python scripts/07_predict.py --input new_fires.csv --output predictions.csv

"""

import argparse
import sys
import zlib
from pathlib import Path
from typing import Optional

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from config.config import (
    MODELS_DIR,
    TARGET_CLASS_NAMES,
    FIRE_SIZE_CLASS_MAPPING,
    CATEGORICAL_FEATURES,
    N_GEO_CLUSTERS,
    LAT_BINS,
    LON_BINS
)


class WildfirePredictor:
    """Wildfire size class predictor.

    Loads a trained LightGBM booster and its metadata from ``model_dir`` and
    turns raw fire records into a predicted size class with per-class
    probabilities.

    Known limitations (visible in this class):
        * ``geo_cluster`` is a constant placeholder (0); no clustering model
          is loaded here.
        * Categorical columns are encoded with a deterministic hash bucket,
          not with encoders fitted at training time.

    Attributes:
        model_dir: Directory holding ``wildfire_model.txt`` and
            ``model_metadata.joblib``.
        model: The loaded ``lgb.Booster``.
        metadata: Object loaded from ``model_metadata.joblib``.
        feature_names: Ordered feature list read from
            ``metadata['feature_names']``.
        encoders: Reserved for categorical encoders; never populated by this
            class.
    """

    # Number of buckets used by the fallback categorical encoding.
    _HASH_BUCKETS = 100

    def __init__(self, model_dir: Path = MODELS_DIR):
        """Initialize predictor with trained model.

        Args:
            model_dir: Directory containing the model artifacts. A plain
                string path is accepted as well.

        Raises:
            FileNotFoundError: If the model or metadata file is missing.
        """
        # Coerce so string paths also work with the `/` joins in _load_model.
        self.model_dir = Path(model_dir)
        self.model = None
        self.metadata = None
        self.feature_names = None
        self.encoders = {}

        self._load_model()

    def _load_model(self) -> None:
        """Load trained model and metadata.

        Raises:
            FileNotFoundError: If ``wildfire_model.txt`` or
                ``model_metadata.joblib`` does not exist in ``model_dir``.
        """
        model_path = self.model_dir / 'wildfire_model.txt'
        metadata_path = self.model_dir / 'model_metadata.joblib'

        if not model_path.exists():
            raise FileNotFoundError(f"Model not found at {model_path}. Run training first.")
        if not metadata_path.exists():
            raise FileNotFoundError(
                f"Model metadata not found at {metadata_path}. Run training first."
            )

        self.model = lgb.Booster(model_file=str(model_path))
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only point model_dir at artifacts you produced or trust.
        self.metadata = joblib.load(metadata_path)
        self.feature_names = self.metadata['feature_names']

        print(f"Loaded model with {len(self.feature_names)} features")

    @staticmethod
    def _stable_bucket(value: str) -> int:
        """Map a string to a bucket in ``[0, _HASH_BUCKETS)`` deterministically.

        CRC32 is used instead of the built-in ``hash()`` because string hashes
        are salted per interpreter process, which made the encoded value (and
        therefore the prediction) change from one run to the next.
        """
        return zlib.crc32(value.encode('utf-8')) % WildfirePredictor._HASH_BUCKETS

    def _create_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create features for prediction.

        Args:
            df: Input records. Must contain ``LATITUDE``, ``LONGITUDE``,
                ``FIRE_YEAR`` and ``DISCOVERY_DOY``.

        Returns:
            A copy of ``df`` with temporal, cyclical, year, geospatial and
            coordinate feature columns added.

        Raises:
            ValueError: If a required column is missing.
        """
        df = df.copy()

        # Ensure required columns exist
        required = ['LATITUDE', 'LONGITUDE', 'FIRE_YEAR', 'DISCOVERY_DOY']
        for col in required:
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")

        # Temporal features.
        # The day-of-year is always resolved against a fixed reference year,
        # so month/day_of_week do not depend on FIRE_YEAR.
        # NOTE(review): presumably this mirrors the training pipeline - confirm.
        # Values that do not parse (e.g. 366, as 2001 is not a leap year)
        # become NaT because of errors='coerce'.
        reference_year = 2001
        df['temp_date'] = pd.to_datetime(
            df['DISCOVERY_DOY'].astype(int).astype(str) + f'-{reference_year}',
            format='%j-%Y',
            errors='coerce'
        )

        df['month'] = df['temp_date'].dt.month
        df['day_of_week'] = df['temp_date'].dt.dayofweek
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        # 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = everything else
        # (including a missing month).
        df['season'] = df['month'].apply(lambda m:
            1 if m in [12, 1, 2] else
            2 if m in [3, 4, 5] else
            3 if m in [6, 7, 8] else 4
        )
        df['is_fire_season'] = df['month'].isin([6, 7, 8, 9, 10]).astype(int)

        # Cyclical features
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
        df['doy_sin'] = np.sin(2 * np.pi * df['DISCOVERY_DOY'] / 365)
        df['doy_cos'] = np.cos(2 * np.pi * df['DISCOVERY_DOY'] / 365)
        df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
        df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

        # Year features
        min_year, max_year = 1992, 2015
        df['year_normalized'] = (df['FIRE_YEAR'] - min_year) / (max_year - min_year)
        df['years_since_1992'] = df['FIRE_YEAR'] - min_year

        # Geospatial features: equal-width bins over a fixed bounding box.
        lat_min, lat_max = 24.0, 50.0
        lon_min, lon_max = -125.0, -66.0
        lat_edges = np.linspace(lat_min, lat_max, LAT_BINS + 1)
        lon_edges = np.linspace(lon_min, lon_max, LON_BINS + 1)

        df['lat_bin'] = pd.cut(df['LATITUDE'], bins=lat_edges, labels=False, include_lowest=True)
        df['lon_bin'] = pd.cut(df['LONGITUDE'], bins=lon_edges, labels=False, include_lowest=True)
        # Coordinates outside the bounding box get no bin from pd.cut; they
        # are assigned bin 5.
        df['lat_bin'] = df['lat_bin'].fillna(5).astype(int)
        df['lon_bin'] = df['lon_bin'].fillna(5).astype(int)

        # Coordinate features
        df['lat_squared'] = df['LATITUDE'] ** 2
        df['lon_squared'] = df['LONGITUDE'] ** 2
        df['lat_lon_interaction'] = df['LATITUDE'] * df['LONGITUDE']

        # Distance in degrees from a fixed reference point.
        center_lat, center_lon = 39.8, -98.6
        df['dist_from_center'] = np.sqrt(
            (df['LATITUDE'] - center_lat) ** 2 +
            (df['LONGITUDE'] - center_lon) ** 2
        )

        # Placeholder for geo_cluster (would need kmeans model)
        df['geo_cluster'] = 0

        # Drop temporary columns
        df = df.drop(columns=['temp_date'], errors='ignore')

        return df

    def _encode_categoricals(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode categorical variables.

        For every name in ``CATEGORICAL_FEATURES`` a ``<name>_encoded`` column
        is added. Present columns are stringified and mapped to a stable hash
        bucket; absent columns are encoded as 0.

        Args:
            df: Frame that may contain the raw categorical columns.

        Returns:
            A copy of ``df`` with the encoded columns added.
        """
        df = df.copy()

        # Fallback encoding for inference.
        # In production, would need to use same encoders as training.
        for col in CATEGORICAL_FEATURES:
            encoded_col = f'{col}_encoded'
            if col in df.columns:
                # Deterministic across processes, unlike built-in hash().
                df[encoded_col] = df[col].astype(str).apply(self._stable_bucket)
            else:
                df[encoded_col] = 0

        return df

    def preprocess(self, df: pd.DataFrame) -> np.ndarray:
        """Preprocess data for prediction.

        Args:
            df: Raw input records.

        Returns:
            A 2-D array whose columns follow ``self.feature_names``.

        Raises:
            ValueError: If a required input column is missing.
        """
        df = self._create_features(df)
        df = self._encode_categoricals(df)

        # Select and order features to match training
        missing_features = [f for f in self.feature_names if f not in df.columns]
        if missing_features:
            print(f"Warning: Missing features (filled with 0): {missing_features}")
            for f in missing_features:
                df[f] = 0

        X = df[self.feature_names].values
        return X

    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate predictions for input data.

        Args:
            df: Raw input records; not modified.

        Returns:
            A copy of ``df`` with ``predicted_class``, ``predicted_label``,
            ``prob_small``, ``prob_medium``, ``prob_large`` and ``confidence``
            columns appended.

        Raises:
            ValueError: If a required input column is missing (non-empty
                input only).
        """
        results = df.copy()

        if len(df) == 0:
            # Nothing to score: skip preprocessing and the model call and
            # return the expected output columns with zero rows.
            results['predicted_class'] = np.array([], dtype=int)
            results['predicted_label'] = np.array([], dtype=object)
            for col in ('prob_small', 'prob_medium', 'prob_large', 'confidence'):
                results[col] = np.array([], dtype=float)
            return results

        X = self.preprocess(df)

        # Get probabilities (one column per class)
        proba = self.model.predict(X)
        pred_class = np.argmax(proba, axis=1)

        results['predicted_class'] = pred_class
        results['predicted_label'] = [TARGET_CLASS_NAMES[int(c)] for c in pred_class]
        results['prob_small'] = proba[:, 0]
        results['prob_medium'] = proba[:, 1]
        results['prob_large'] = proba[:, 2]
        results['confidence'] = np.max(proba, axis=1)

        return results

    def predict_single(self, latitude: float, longitude: float,
                       fire_year: int, discovery_doy: int,
                       state: str = 'Unknown',
                       cause: str = 'Unknown',
                       agency: str = 'Unknown',
                       owner: str = 'Unknown') -> dict:
        """Predict for a single fire event.

        Args:
            latitude: Value for the ``LATITUDE`` column.
            longitude: Value for the ``LONGITUDE`` column.
            fire_year: Value for the ``FIRE_YEAR`` column.
            discovery_doy: Value for the ``DISCOVERY_DOY`` column.
            state: Value for the ``STATE`` column.
            cause: Value for the ``STAT_CAUSE_DESCR`` column.
            agency: Value for the ``NWCG_REPORTING_AGENCY`` column.
            owner: Value for the ``OWNER_DESCR`` column.

        Returns:
            A dict with ``predicted_class`` (int), ``predicted_label``,
            ``probabilities`` (keys ``Small``/``Medium``/``Large``) and
            ``confidence`` (float).
        """
        df = pd.DataFrame([{
            'LATITUDE': latitude,
            'LONGITUDE': longitude,
            'FIRE_YEAR': fire_year,
            'DISCOVERY_DOY': discovery_doy,
            'STATE': state,
            'STAT_CAUSE_DESCR': cause,
            'NWCG_REPORTING_AGENCY': agency,
            'OWNER_DESCR': owner
        }])

        result = self.predict(df).iloc[0]

        # Convert numpy scalars to plain Python types for the caller.
        return {
            'predicted_class': int(result['predicted_class']),
            'predicted_label': result['predicted_label'],
            'probabilities': {
                'Small': float(result['prob_small']),
                'Medium': float(result['prob_medium']),
                'Large': float(result['prob_large'])
            },
            'confidence': float(result['confidence'])
        }


def main():
    """Main prediction script.

    Parses command-line arguments and runs one of three modes:

    * ``--input`` given: batch prediction from a CSV, written to ``--output``
      (default ``predictions.csv``).
    * ``--lat`` and ``--lon`` both given: a single prediction printed to
      stdout.
    * neither coordinate given: a demo prediction plus usage hints.

    Supplying only one of ``--lat`` / ``--lon`` (without ``--input``) exits
    with a usage error instead of silently running the demo.
    """
    parser = argparse.ArgumentParser(description='Wildfire size prediction')

    # Single prediction arguments
    parser.add_argument('--lat', type=float, help='Latitude')
    parser.add_argument('--lon', type=float, help='Longitude')
    parser.add_argument('--year', type=int, default=2015, help='Fire year')
    parser.add_argument('--doy', type=int, default=200, help='Day of year')
    parser.add_argument('--state', type=str, default='Unknown', help='State code')
    parser.add_argument('--cause', type=str, default='Unknown', help='Fire cause')

    # Batch prediction arguments
    parser.add_argument('--input', type=str, help='Input CSV file for batch prediction')
    parser.add_argument('--output', type=str, help='Output CSV file for predictions')

    args = parser.parse_args()

    # A lone coordinate used to fall through to the demo branch, which prints
    # a prediction for an unrelated location. Reject it before loading the
    # model so the mistake is obvious.
    if not args.input and (args.lat is None) != (args.lon is None):
        parser.error('--lat and --lon must be provided together')

    # Initialize predictor
    predictor = WildfirePredictor()

    if args.input:
        # Batch prediction
        print(f"\nProcessing batch predictions from: {args.input}")
        df = pd.read_csv(args.input)
        results = predictor.predict(df)

        output_path = args.output or 'predictions.csv'
        results.to_csv(output_path, index=False)
        print(f"Predictions saved to: {output_path}")

    elif args.lat is not None and args.lon is not None:
        # Single prediction
        print("\n" + "="*60)
        print("SINGLE FIRE PREDICTION")
        print("="*60)

        result = predictor.predict_single(
            latitude=args.lat,
            longitude=args.lon,
            fire_year=args.year,
            discovery_doy=args.doy,
            state=args.state,
            cause=args.cause
        )

        print("\nInput:")
        print(f"  Location: ({args.lat}, {args.lon})")
        print(f"  Year: {args.year}, Day of Year: {args.doy}")
        print(f"  State: {args.state}, Cause: {args.cause}")

        print("\nPrediction:")
        print(f"  Class: {result['predicted_class']} ({result['predicted_label']})")
        print(f"  Confidence: {result['confidence']:.1%}")

        print("\nProbabilities:")
        for label, prob in result['probabilities'].items():
            # 20-character bar, one block per 5 percentage points.
            bar = '█' * int(prob * 20)
            print(f"  {label:>6}: {prob:>6.1%} {bar}")

    else:
        # Demo prediction
        print("\n" + "="*60)
        print("DEMO PREDICTION")
        print("="*60)

        # Example: Summer fire in California
        result = predictor.predict_single(
            latitude=34.05,
            longitude=-118.24,
            fire_year=2015,
            discovery_doy=200,  # Mid-July
            state='CA',
            cause='Debris Burning'
        )

        print("\nExample: Summer fire in Los Angeles area")
        print(f"  Predicted: {result['predicted_label']} (confidence: {result['confidence']:.1%})")
        print(f"  Probabilities: Small={result['probabilities']['Small']:.1%}, "
              f"Medium={result['probabilities']['Medium']:.1%}, "
              f"Large={result['probabilities']['Large']:.1%}")

        print("\nUsage:")
        print("  Single: python 07_predict.py --lat 34.05 --lon -118.24 --state CA --cause 'Lightning'")
        print("  Batch:  python 07_predict.py --input fires.csv --output predictions.csv")

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()