zakaneki committed on
Commit 64e892b · verified · 1 Parent(s): bdcfc4e

first commit
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ FPA_FOD_20170508.sqlite filter=lfs diff=lfs merge=lfs -text
+ models/wildfire_model.txt filter=lfs diff=lfs merge=lfs -text
+ reports/figures/geographic_distribution.png filter=lfs diff=lfs merge=lfs -text
+ reports/figures/shap_importance_summary.png filter=lfs diff=lfs merge=lfs -text
+ reports/figures/temporal_patterns.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,47 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Virtual environments
+ venv/
+ env/
+ .venv/
+ .env/
+
+ # IDE settings
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Jupyter Notebooks
+ .ipynb_checkpoints/
+
+ # Data files
+ *.sqlite
+ *.db
+ *.parquet
+ data/processed/
+ data/raw/
+
+ # Model artifacts
+ models/*.txt
+ models/*.joblib
+ models/*.json
+ models/*.pkl
+
+ # Reports and figures
+ reports/figures/*.png
+ reports/figures/*.csv
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Logs
+ *.log
+
+ # Environment variables
+ .env
+ .env.local
FPA_FOD_20170508.sqlite ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f04b23a24989770ce05fa354662b03e597ad164ddf5b7932b8a53b46d0ed428b
+ size 795785216
README.md CHANGED
@@ -1,3 +1,156 @@
- ---
- license: apache-2.0
- ---
+ # Wildfire Size Classification Project
+
+ Predicting wildfire size classes using machine learning on the FPA FOD (Fire Program Analysis Fire-Occurrence Database), which contains 1.88 million US wildfire records from 1992-2015.
+
+ ## Project Overview
+
+ This project builds an **ordinal classification model** to predict fire size categories:
+ - **Small** (0-9.9 acres): Original classes A + B
+ - **Medium** (10-299 acres): Original classes C + D
+ - **Large** (300+ acres): Original classes E + F + G
+
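The grouping above is a plain letter-to-ordinal mapping; a minimal sketch, mirroring `FIRE_SIZE_CLASS_MAPPING` in `config/config.py` (the `to_ordinal` helper is illustrative, not a function from the repo):

```python
# Ordinal target: 0 = Small (A, B), 1 = Medium (C, D), 2 = Large (E, F, G)
FIRE_SIZE_CLASS_MAPPING = {
    'A': 0, 'B': 0,
    'C': 1, 'D': 1,
    'E': 2, 'F': 2, 'G': 2,
}

def to_ordinal(size_class: str) -> int:
    """Map an FPA FOD fire size class letter to the 3-class ordinal target."""
    return FIRE_SIZE_CLASS_MAPPING[size_class.upper()]
```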
+ ### Key Features
+ - **Ordinal-aware classification**: Leverages the natural ordering of fire size classes
+ - **Geospatial features**: Coordinate clustering, regional binning, distance metrics
+ - **Temporal features**: Cyclical encoding of month/day, fire season indicators
+ - **Class imbalance handling**: Balanced class weights for rare large fire events
+ - **Interpretable results**: SHAP feature importance analysis
+
+ ## Project Structure
+
+ ```
+ wildfires/
+ ├── config/
+ │   ├── __init__.py                # Package init
+ │   └── config.py                  # Configuration settings
+ ├── data/
+ │   └── processed/                 # Processed parquet files (train/test splits)
+ ├── models/                        # Saved model artifacts
+ │   ├── best_params.json           # Tuned hyperparameters
+ │   ├── model_metadata.joblib      # Feature names and metrics
+ │   └── wildfire_model.txt         # Trained LightGBM model
+ ├── reports/
+ │   └── figures/                   # Visualizations and metrics
+ ├── scripts/
+ │   ├── 01_extract_data.py         # Extract SQLite → Parquet
+ │   ├── 02_eda.py                  # Exploratory data analysis
+ │   ├── 03_preprocess.py           # Data preprocessing
+ │   ├── 04_feature_engineering.py  # Feature creation
+ │   ├── 05_train_model.py          # Model training
+ │   ├── 06_evaluate.py             # Model evaluation
+ │   └── 07_predict.py              # Prediction pipeline
+ ├── run_pipeline.py                # Run full or partial pipeline
+ ├── requirements.txt               # Dependencies
+ ├── .gitignore                     # Git ignore rules
+ └── README.md
+ ```
+
+ ## Getting Started
+
+ ### Prerequisites
+ - Python 3.9+
+ - SQLite database file (`FPA_FOD_20170508.sqlite`)
+
+ ### Installation
+
+ 1. Clone/download the repository
+ 2. Create a virtual environment:
+ ```bash
+ python -m venv venv
+ venv\Scripts\activate      # Windows
+ # source venv/bin/activate # Linux/Mac
+ ```
+ 3. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+ 4. Place the SQLite database file in the project root
+
+ ### Running the Pipeline
+
+ **Using the pipeline runner (recommended):**
+
+ ```bash
+ # Run full pipeline
+ python run_pipeline.py
+
+ # Skip EDA step
+ python run_pipeline.py --skip-eda
+
+ # Run with hyperparameter tuning
+ python run_pipeline.py --tune
+
+ # Resume from a specific step (1-6)
+ python run_pipeline.py --from-step 5
+ ```
+
+ **Or execute scripts individually:**
+
+ ```bash
+ # 1. Extract data from SQLite
+ python scripts/01_extract_data.py
+
+ # 2. Exploratory data analysis (generates plots)
+ python scripts/02_eda.py
+
+ # 3. Preprocess data
+ python scripts/03_preprocess.py
+
+ # 4. Feature engineering
+ python scripts/04_feature_engineering.py
+
+ # 5. Train model (add --tune for hyperparameter tuning)
+ python scripts/05_train_model.py
+ # python scripts/05_train_model.py --tune  # With Optuna tuning
+
+ # 6. Evaluate model
+ python scripts/06_evaluate.py
+
+ # 7. Make predictions
+ python scripts/07_predict.py --lat 34.05 --lon -118.24 --state CA --cause "Lightning"
+ ```
+
+ ## Model Details
+
+ ### Features Used
+ - **Temporal**: Month, day of week, season, fire season indicator (cyclically encoded)
+ - **Geospatial**: Lat/lon coordinates, regional clusters (K-means), coordinate bins
+ - **Categorical**: State, fire cause, reporting agency, land owner
+ - **Year**: Fire year, years since 1992
+
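The cyclical encoding mentioned above maps a periodic feature onto a circle so that December sits next to January; a minimal sketch (the feature names `month_sin`/`month_cos` match those listed in `reports/figures/feature_importance.csv`, but the helper itself is illustrative):

```python
import numpy as np

def encode_cyclical(values: np.ndarray, period: int):
    """Encode a periodic feature as (sin, cos) so its endpoints are adjacent."""
    radians = 2 * np.pi * values / period
    return np.sin(radians), np.cos(radians)

months = np.array([1, 6, 12])
month_sin, month_cos = encode_cyclical(months, 12)
# Month 12 lands at the same point as month 0, next to month 1 on the circle.
```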
+ ### Algorithm
+ - **LightGBM** gradient boosting for multi-class classification
+ - Class weights to handle imbalanced data (~90% small fires)
+ - Linear weighted Cohen's Kappa for ordinal evaluation
+
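Balanced class weights give each class weight `n_samples / (n_classes * count)`, so rare large fires count more per sample; a sketch with scikit-learn's `compute_class_weight` (the exact call inside `05_train_model.py` is an assumption):

```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Toy label vector skewed like the real data (~90% small fires)
y = np.array([0] * 90 + [1] * 8 + [2] * 2)

weights = compute_class_weight(class_weight="balanced",
                               classes=np.array([0, 1, 2]), y=y)
weight_map = dict(zip([0, 1, 2], weights))  # rare Large class gets the largest weight
```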
+ ### Expected Performance
+ - Balanced Accuracy: ~65-75%
+ - Macro F1 Score: ~0.45-0.55
+ - Large fire detection is challenging due to class imbalance
+
+ ## Evaluation Metrics
+
+ For ordinal classification, we prioritize:
+ - **Macro F1**: Equal importance to all classes
+ - **Balanced Accuracy**: Accounts for class imbalance
+ - **Linear Weighted Kappa**: Penalizes predictions far from the true class
+
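Linear weighted kappa is available in scikit-learn; with linear weights, a two-class miss (Small predicted as Large) is penalized twice as heavily as an adjacent-class miss, which is what makes it suitable for ordinal targets. A small sketch (toy labels, not project data):

```python
from sklearn.metrics import cohen_kappa_score

y_true = [0, 0, 1, 1, 2, 2]
y_near = [0, 0, 1, 2, 2, 1]  # the two mistakes are one class off
y_far  = [2, 0, 1, 1, 2, 0]  # the two mistakes are two classes off

kappa_near = cohen_kappa_score(y_true, y_near, weights="linear")
kappa_far = cohen_kappa_score(y_true, y_far, weights="linear")
# Same number of errors, but kappa_near > kappa_far because the
# near-miss errors are closer to the true class.
```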
+ ## Output Files
+
+ After running the pipeline:
+ - `data/processed/`: Parquet files for train/test splits
+ - `models/wildfire_model.txt`: Trained LightGBM model
+ - `models/model_metadata.joblib`: Feature names and metrics
+ - `reports/figures/`: Visualizations (confusion matrix, SHAP plots, etc.)
+
+ ## Data Source
+
+ **Fire Program Analysis Fire-Occurrence Database (FPA FOD)**
+ - 1.88 million geo-referenced wildfire records
+ - Period: 1992-2015
+ - 140 million acres burned
+ - Source: US federal, state, and local fire organizations
+
+ ## License
+
+ This project uses publicly available government data.
config/__init__.py ADDED
@@ -0,0 +1 @@
+ from .config import *
config/config.py ADDED
@@ -0,0 +1,181 @@
+ """
+ Configuration settings for the Wildfire Size Classification project.
+
+ Target: Ordinal classification of fire size into 3 classes:
+ - 0 (Small):  Classes A + B (0 - 9.9 acres)
+ - 1 (Medium): Classes C + D (10 - 299 acres)
+ - 2 (Large):  Classes E + F + G (300+ acres)
+ """
+
+ from pathlib import Path
+
+ # =============================================================================
+ # PATHS
+ # =============================================================================
+
+ # Project root
+ PROJECT_ROOT = Path(__file__).parent.parent
+
+ # Data paths
+ DATA_DIR = PROJECT_ROOT / "data"
+ RAW_DATA_DIR = DATA_DIR / "raw"
+ PROCESSED_DATA_DIR = DATA_DIR / "processed"
+
+ # SQLite database path (adjust filename if different)
+ SQLITE_DB_PATH = PROJECT_ROOT / "FPA_FOD_20170508.sqlite"
+
+ # Output paths
+ MODELS_DIR = PROJECT_ROOT / "models"
+ REPORTS_DIR = PROJECT_ROOT / "reports"
+ FIGURES_DIR = REPORTS_DIR / "figures"
+
+ # Processed data files
+ RAW_PARQUET = PROCESSED_DATA_DIR / "fires_raw.parquet"
+ PROCESSED_PARQUET = PROCESSED_DATA_DIR / "fires_processed.parquet"
+ FEATURES_PARQUET = PROCESSED_DATA_DIR / "fires_features.parquet"
+ TRAIN_PARQUET = PROCESSED_DATA_DIR / "train.parquet"
+ TEST_PARQUET = PROCESSED_DATA_DIR / "test.parquet"
+
+ # =============================================================================
+ # TARGET VARIABLE CONFIGURATION
+ # =============================================================================
+
+ # Original fire size classes mapping to ordinal target
+ # A (0-0.25 acres), B (0.26-9.9 acres)                    -> 0 (Small)
+ # C (10-99.9 acres), D (100-299 acres)                    -> 1 (Medium)
+ # E (300-999 acres), F (1000-4999 acres), G (5000+ acres) -> 2 (Large)
+
+ FIRE_SIZE_CLASS_MAPPING = {
+     'A': 0, 'B': 0,          # Small
+     'C': 1, 'D': 1,          # Medium
+     'E': 2, 'F': 2, 'G': 2   # Large
+ }
+
+ TARGET_CLASS_NAMES = ['Small', 'Medium', 'Large']
+ TARGET_COLUMN = 'fire_size_ordinal'
+ ORIGINAL_TARGET_COLUMN = 'FIRE_SIZE_CLASS'
+
+ # =============================================================================
+ # FEATURE CONFIGURATION
+ # =============================================================================
+
+ # Columns to drop (IDs, redundant info, text fields)
+ COLUMNS_TO_DROP = [
+     'FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM',
+     'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME',
+     'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME',
+     'LOCAL_FIRE_REPORT_ID', 'LOCAL_INCIDENT_ID',
+     'FIRE_CODE', 'FIRE_NAME',
+     'ICS_209_INCIDENT_NUMBER', 'ICS_209_NAME',
+     'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME',
+     'DISCOVERY_DATE', 'DISCOVERY_TIME',
+     'CONT_DATE', 'CONT_DOY', 'CONT_TIME',
+     'FIPS_CODE', 'FIPS_NAME',
+     'FIRE_SIZE',        # Don't use actual size as a feature - it's what we're predicting
+     'FIRE_SIZE_CLASS',  # Original target
+     'Shape'             # Geometry column if present
+ ]
+
+ # Categorical features to encode
+ CATEGORICAL_FEATURES = [
+     'NWCG_REPORTING_AGENCY',
+     'STAT_CAUSE_DESCR',
+     'STATE',
+     'OWNER_DESCR'
+ ]
+
+ # Numerical features (after feature engineering)
+ NUMERICAL_FEATURES = [
+     'LATITUDE',
+     'LONGITUDE',
+     'DISCOVERY_DOY',
+     'FIRE_YEAR'
+ ]
+
+ # Temporal features to create
+ TEMPORAL_FEATURES = [
+     'month',
+     'season',
+     'day_of_week',
+     'is_weekend'
+ ]
+
+ # Geospatial features to create
+ GEOSPATIAL_FEATURES = [
+     'lat_bin',
+     'lon_bin',
+     'geo_cluster',
+     'lat_lon_interaction'
+ ]
+
+ # =============================================================================
+ # MODEL CONFIGURATION
+ # =============================================================================
+
+ # Random seed for reproducibility
+ RANDOM_STATE = 42
+
+ # Train/test split ratio
+ TEST_SIZE = 0.2
+
+ # Cross-validation folds
+ N_FOLDS = 5
+
+ # Class weights for imbalanced data (will be computed dynamically)
+ USE_CLASS_WEIGHTS = True
+
+ # LightGBM base parameters for ordinal classification
+ LIGHTGBM_PARAMS = {
+     'objective': 'multiclass',
+     'num_class': 3,
+     'metric': 'multi_logloss',
+     'boosting_type': 'gbdt',
+     'verbosity': -1,
+     'random_state': RANDOM_STATE,
+     'n_jobs': -1
+ }
+
+ # Optuna hyperparameter search space
+ OPTUNA_SEARCH_SPACE = {
+     'n_estimators': (100, 1000),
+     'max_depth': (3, 12),
+     'learning_rate': (0.01, 0.3),
+     'num_leaves': (20, 150),
+     'min_child_samples': (10, 100),
+     'subsample': (0.6, 1.0),
+     'colsample_bytree': (0.6, 1.0),
+     'reg_alpha': (0.0, 1.0),
+     'reg_lambda': (0.0, 1.0)
+ }
+
+ # Number of Optuna trials
+ N_OPTUNA_TRIALS = 50
+
+ # =============================================================================
+ # GEOSPATIAL CLUSTERING CONFIGURATION
+ # =============================================================================
+
+ # Number of clusters for geographic regions
+ N_GEO_CLUSTERS = 20
+
+ # Latitude/Longitude binning
+ LAT_BINS = 10
+ LON_BINS = 10
+
+ # =============================================================================
+ # EVALUATION METRICS
+ # =============================================================================
+
+ # Primary metric for model selection
+ PRIMARY_METRIC = 'macro_f1'
+
+ # All metrics to compute
+ EVALUATION_METRICS = [
+     'accuracy',
+     'balanced_accuracy',
+     'macro_f1',
+     'weighted_f1',
+     'cohen_kappa',
+     'macro_precision',
+     'macro_recall'
+ ]
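`LAT_BINS`/`LON_BINS` drive the coordinate binning behind the `lat_bin`/`lon_bin` features. A minimal sketch of how such bins could be derived; the 24-50°N range (matching the EDA plot limits) and the `np.digitize` call are assumptions, not code from the repo:

```python
import numpy as np

LAT_BINS = 10  # from config.config

# Assumed latitude range for the contiguous US, split into LAT_BINS equal bins
edges = np.linspace(24, 50, LAT_BINS + 1)

lat = np.array([25.0, 34.05, 48.9])
# digitize against the interior edges gives bin indices 0..LAT_BINS-1
lat_bin = np.digitize(lat, edges[1:-1])
```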
data/processed/fires_features.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6926ded384d61d2cd531853ce79785befc12cbb9d827ebf89e5f334aefef5c57
+ size 116607705
data/processed/fires_processed.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:245d6fccc065f5981420244d0aa7a6d0cd12cbe9f89753852b12eba03978c75f
+ size 26619137
data/processed/fires_raw.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:841e692f7f4044513d96c762f726a276e165a009a7f6ecb8d05547d8cee59cf0
+ size 127864657
data/processed/test.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2c42ac67caff94f6882dc3f7f00b3e5a30f37d19672d4fa5bd279795622c7df
+ size 23840600
data/processed/train.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb2678f4b14baf3863f32818b4030253ac731587c1f3451579dc38c1dad691d2
+ size 93657151
models/best_params.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "n_estimators": 516,
+   "max_depth": 9,
+   "learning_rate": 0.21074359142553023,
+   "num_leaves": 131,
+   "min_child_samples": 70,
+   "subsample": 0.7796691167092181,
+   "colsample_bytree": 0.7018466944576152,
+   "reg_alpha": 0.52062185793523,
+   "reg_lambda": 0.3425483694161869
+ }
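These tuned values would typically be layered over the base `LIGHTGBM_PARAMS` from `config/config.py` before training; how `05_train_model.py` actually combines them is an assumption, but the usual dict-merge pattern looks like this:

```python
# Base parameters, copied from config.config for self-containment
LIGHTGBM_PARAMS = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "random_state": 42,
    "n_jobs": -1,
}

# Subset of models/best_params.json (illustration)
best_params = {"n_estimators": 516, "max_depth": 9, "num_leaves": 131}

# Later keys win, so tuned values extend the base without losing fixed settings
params = {**LIGHTGBM_PARAMS, **best_params}
```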
models/model_metadata.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37b351a57637f9c738d07be623d8cfbfe9a4a996f1e958c9e63883e023527925
+ size 958
models/wildfire_model.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b1a077debe29e736c3382e934aa8b6e5c7cf235e8707d680bb8664ab5317d99
+ size 21722535
reports/figures/cause_by_size.png ADDED
reports/figures/class_distribution.png ADDED
reports/figures/classification_metrics.png ADDED
reports/figures/confusion_matrix.png ADDED
reports/figures/feature_importance.csv ADDED
@@ -0,0 +1,29 @@
+ feature,importance
+ day_of_week,0.03474360040175998
+ OWNER_DESCR_encoded,0.035002645531480546
+ STATE_encoded,0.04257236407411825
+ is_weekend,0.042745448253639684
+ lat_squared,0.0492982200647884
+ month_cos,0.04941415345320393
+ dist_from_center,0.04974556334097185
+ years_since_1992,0.05059611732831027
+ lat_bin,0.05266350693152481
+ month_sin,0.05535090135037826
+ NWCG_REPORTING_AGENCY_encoded,0.06568755534754865
+ dow_sin,0.06723634255462703
+ doy_sin,0.06948528640895639
+ year_normalized,0.07218163313544536
+ month,0.07541548211445999
+ lat_lon_interaction,0.08870144324496855
+ doy_cos,0.09805861682638295
+ lon_bin,0.0997225819136466
+ LONGITUDE,0.10201343368889876
+ season,0.10830839963123856
+ FIRE_YEAR,0.10860962139407908
+ is_fire_season,0.12894166850207692
+ lon_squared,0.13777922755612584
+ DISCOVERY_DOY,0.14014555480776913
+ geo_cluster,0.1491778952047764
+ STAT_CAUSE_DESCR_encoded,0.15889478851167435
+ LATITUDE,0.17045380361507326
+ dow_cos,0.23151901964397537
reports/figures/geographic_distribution.png ADDED

Git LFS Details

  • SHA256: 422fd6f65aec85c0498d0df22d98e82882816247ab1cbf0619e2999079fa07b9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
reports/figures/missing_values.png ADDED
reports/figures/prediction_distribution.png ADDED
reports/figures/shap_importance.png ADDED
reports/figures/shap_importance_summary.png ADDED

Git LFS Details

  • SHA256: 76732c25c546c08ae6a3084ccba961eab87b6bc730805b051e02d371dd4786c3
  • Pointer size: 131 Bytes
  • Size of remote file: 347 kB
reports/figures/temporal_patterns.png ADDED

Git LFS Details

  • SHA256: 6d0ecf15472ae0e5995a509d0343ee9a0da2e70f664002c07e0542409c661bd6
  • Pointer size: 131 Bytes
  • Size of remote file: 161 kB
requirements.txt ADDED
@@ -0,0 +1,29 @@
+ # Core data processing
+ pandas>=2.0.0
+ numpy>=1.24.0
+ pyarrow>=14.0.0
+
+ # Machine learning
+ scikit-learn>=1.3.0  # also provides KMeans for coordinate clustering
+ lightgbm>=4.0.0
+ xgboost>=2.0.0
+ imbalanced-learn>=0.11.0
+
+ # Hyperparameter tuning
+ optuna>=3.4.0
+
+ # Model interpretability
+ shap>=0.43.0
+
+ # Visualization
+ matplotlib>=3.7.0
+ seaborn>=0.12.0
+
+ # Progress bars
+ tqdm>=4.66.0
+
+ # Joblib for model persistence
+ joblib>=1.3.0
run_pipeline.py ADDED
@@ -0,0 +1,91 @@
+ """
+ Main Pipeline Runner
+
+ Runs the complete ML pipeline from data extraction to evaluation.
+
+ Usage:
+     python run_pipeline.py            # Run full pipeline
+     python run_pipeline.py --skip-eda # Skip EDA step
+     python run_pipeline.py --tune     # Include hyperparameter tuning
+ """
+
+ import argparse
+ import subprocess
+ import sys
+ from pathlib import Path
+ from typing import Optional
+
+
+ def run_script(script_name: str, extra_args: Optional[list] = None) -> bool:
+     """Run a Python script and return success status."""
+     script_path = Path(__file__).parent / "scripts" / script_name
+
+     if not script_path.exists():
+         print(f"ERROR: Script not found: {script_path}")
+         return False
+
+     cmd = [sys.executable, str(script_path)]
+     if extra_args:
+         cmd.extend(extra_args)
+
+     print(f"\n{'='*60}")
+     print(f"Running: {script_name}")
+     print(f"{'='*60}\n")
+
+     result = subprocess.run(cmd, cwd=str(Path(__file__).parent))
+
+     if result.returncode != 0:
+         print(f"\nERROR: {script_name} failed with return code {result.returncode}")
+         return False
+
+     return True
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run wildfire ML pipeline")
+     parser.add_argument("--skip-eda", action="store_true", help="Skip EDA step")
+     parser.add_argument("--tune", action="store_true", help="Run hyperparameter tuning")
+     parser.add_argument("--from-step", type=int, default=1, help="Start from step number (1-6)")
+     args = parser.parse_args()
+
+     print("\n" + "="*60)
+     print("WILDFIRE SIZE CLASSIFICATION PIPELINE")
+     print("="*60)
+
+     steps = [
+         ("01_extract_data.py", []),
+         ("02_eda.py", []),
+         ("03_preprocess.py", []),
+         ("04_feature_engineering.py", []),
+         ("05_train_model.py", ["--tune"] if args.tune else []),
+         ("06_evaluate.py", []),
+     ]
+
+     for i, (script, extra_args) in enumerate(steps, 1):
+         if i < args.from_step:
+             print(f"\nSkipping step {i}: {script}")
+             continue
+
+         if args.skip_eda and "eda" in script:
+             print(f"\nSkipping EDA step: {script}")
+             continue
+
+         success = run_script(script, extra_args)
+         if not success:
+             print(f"\nPipeline failed at step {i}: {script}")
+             sys.exit(1)
+
+     print("\n" + "="*60)
+     print("✓ PIPELINE COMPLETE!")
+     print("="*60)
+     print("\nOutputs:")
+     print("  - Model:   models/wildfire_model.txt")
+     print("  - Figures: reports/figures/")
+     print("  - Data:    data/processed/")
+     print("\nNext steps:")
+     print("  - Review figures in reports/figures/")
+     print("  - Make predictions: python scripts/07_predict.py --lat 34.05 --lon -118.24")
+     print("="*60 + "\n")
+
+
+ if __name__ == "__main__":
+     main()
scripts/01_extract_data.py ADDED
@@ -0,0 +1,163 @@
+ """
+ Script 01: Extract Data from SQLite Database
+
+ This script connects to the FPA FOD SQLite database, extracts the Fires table,
+ and saves it as a Parquet file for faster subsequent processing.
+
+ Usage:
+     python scripts/01_extract_data.py
+ """
+
+ import sqlite3
+ import sys
+ from pathlib import Path
+
+ import pandas as pd
+
+ # Add project root to path
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+
+ from config.config import (
+     SQLITE_DB_PATH,
+     PROCESSED_DATA_DIR,
+     RAW_PARQUET
+ )
+
+
+ def connect_to_database(db_path: Path) -> sqlite3.Connection:
+     """Connect to SQLite database."""
+     if not db_path.exists():
+         raise FileNotFoundError(
+             f"Database not found at {db_path}. "
+             "Please ensure the SQLite file is in the project root."
+         )
+
+     print(f"Connecting to database: {db_path}")
+     return sqlite3.connect(db_path)
+
+
+ def get_table_info(conn: sqlite3.Connection) -> None:
+     """Print information about tables in the database."""
+     cursor = conn.cursor()
+
+     # Get list of user tables (skip SpatiaLite system tables)
+     cursor.execute("""
+         SELECT name FROM sqlite_master
+         WHERE type='table' AND name NOT LIKE 'sqlite_%'
+         AND name NOT LIKE 'spatial%'
+         AND name NOT LIKE 'virt%'
+         AND name NOT LIKE 'view%'
+         AND name NOT LIKE 'geometry%'
+     """)
+     tables = cursor.fetchall()
+
+     print("\n" + "="*60)
+     print("DATABASE TABLES")
+     print("="*60)
+
+     for table in tables:
+         table_name = table[0]
+         try:
+             cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
+             count = cursor.fetchone()[0]
+             print(f"  {table_name}: {count:,} rows")
+         except Exception as e:
+             print(f"  {table_name}: (could not read - {str(e)[:30]})")
+
+     print("="*60 + "\n")
+
+
+ def extract_fires_table(conn: sqlite3.Connection) -> pd.DataFrame:
+     """Extract the Fires table from the database."""
+     print("Extracting Fires table...")
+
+     query = "SELECT * FROM Fires"
+     df = pd.read_sql_query(query, conn)
+
+     print(f"  Loaded {len(df):,} records")
+     print(f"  Columns: {len(df.columns)}")
+     print(f"  Memory usage: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
+
+     return df
+
+
+ def save_to_parquet(df: pd.DataFrame, output_path: Path) -> None:
+     """Save DataFrame to Parquet format."""
+     # Create directory if it doesn't exist
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     print(f"\nSaving to Parquet: {output_path}")
+     df.to_parquet(output_path, index=False, compression='snappy')
+
+     file_size_mb = output_path.stat().st_size / 1e6
+     print(f"  File size: {file_size_mb:.1f} MB")
+
+
+ def print_data_summary(df: pd.DataFrame) -> None:
+     """Print summary statistics of the extracted data."""
+     print("\n" + "="*60)
+     print("DATA SUMMARY")
+     print("="*60)
+
+     print(f"\nDate Range: {df['FIRE_YEAR'].min()} - {df['FIRE_YEAR'].max()}")
+
+     print("\nFire Size Class Distribution:")
+     size_dist = df['FIRE_SIZE_CLASS'].value_counts().sort_index()
+     for cls, count in size_dist.items():
+         pct = count / len(df) * 100
+         print(f"  Class {cls}: {count:>10,} ({pct:>5.1f}%)")
+
+     print("\nTop 10 States by Fire Count:")
+     state_dist = df['STATE'].value_counts().head(10)
+     for state, count in state_dist.items():
+         print(f"  {state}: {count:,}")
+
+     print("\nTop Causes:")
+     cause_dist = df['STAT_CAUSE_DESCR'].value_counts().head(5)
+     for cause, count in cause_dist.items():
+         pct = count / len(df) * 100
+         print(f"  {cause}: {count:,} ({pct:.1f}%)")
+
+     print("\nMissing Values (top 10 columns):")
+     missing = df.isnull().sum().sort_values(ascending=False).head(10)
+     for col, count in missing.items():
+         if count > 0:
+             pct = count / len(df) * 100
+             print(f"  {col}: {count:,} ({pct:.1f}%)")
+
+     print("="*60 + "\n")
+
+
+ def main():
+     """Main extraction pipeline."""
+     print("\n" + "="*60)
+     print("WILDFIRE DATA EXTRACTION")
+     print("="*60 + "\n")
+
+     # Connect to database
+     conn = connect_to_database(SQLITE_DB_PATH)
+
+     try:
+         # Show database info
+         get_table_info(conn)
+
+         # Extract Fires table
+         df = extract_fires_table(conn)
+
+         # Print summary
+         print_data_summary(df)
+
+         # Save to Parquet
+         save_to_parquet(df, RAW_PARQUET)
+
+         print("\n✓ Data extraction complete!")
+         print(f"  Output: {RAW_PARQUET}")
+
+     finally:
+         conn.close()
+         print("  Database connection closed.")
+
+
+ if __name__ == "__main__":
+     main()
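`extract_fires_table` pulls the whole 1.88-million-row table into memory with one `read_sql_query` call. If memory is tight, pandas can stream the same query in chunks via the `chunksize` parameter; a sketch using a tiny in-memory stand-in for the Fires table (the schema here is illustrative only):

```python
import sqlite3

import pandas as pd

# Tiny in-memory stand-in for the real Fires table (illustration only)
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE Fires (FOD_ID INTEGER, FIRE_YEAR INTEGER)")
conn.executemany("INSERT INTO Fires VALUES (?, ?)",
                 [(i, 1992 + i % 24) for i in range(10)])

# chunksize makes read_sql_query return an iterator of DataFrames
chunks = pd.read_sql_query("SELECT * FROM Fires", conn, chunksize=4)
total_rows = sum(len(chunk) for chunk in chunks)
conn.close()
```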
scripts/02_eda.py ADDED
@@ -0,0 +1,345 @@
+ """
+ Script 02: Exploratory Data Analysis (EDA)
+
+ This script performs comprehensive EDA on the wildfire dataset:
+ - Class distribution analysis (original 7 classes and grouped 3 classes)
+ - Geographic distribution of fires
+ - Temporal patterns (yearly, monthly, seasonal)
+ - Missing value analysis
+ - Feature correlations
+
+ Generates visualization plots saved to reports/figures/
+
+ Usage:
+     python scripts/02_eda.py
+ """
+
+ import sys
+ from pathlib import Path
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+
+ # Add project root to path
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+
+ from config.config import (
+     RAW_PARQUET,
+     FIGURES_DIR,
+     FIRE_SIZE_CLASS_MAPPING,
+     TARGET_CLASS_NAMES
+ )
+
+ # Set style
+ plt.style.use('seaborn-v0_8-whitegrid')
+ sns.set_palette("husl")
+
+
+ def load_data() -> pd.DataFrame:
+     """Load the raw parquet data."""
+     print("Loading data...")
+     df = pd.read_parquet(RAW_PARQUET)
+     print(f"  Loaded {len(df):,} records")
+     return df
+
+
+ def analyze_class_distribution(df: pd.DataFrame) -> None:
+     """Analyze and visualize fire size class distribution."""
+     print("\n" + "="*60)
+     print("CLASS DISTRIBUTION ANALYSIS")
+     print("="*60)
+
+     # Original 7 classes
+     print("\nOriginal Fire Size Classes:")
+     original_dist = df['FIRE_SIZE_CLASS'].value_counts().sort_index()
+     for cls, count in original_dist.items():
+         pct = count / len(df) * 100
+         print(f"  Class {cls}: {count:>10,} ({pct:>6.2f}%)")
+
+     # Grouped 3 classes
+     df['fire_size_grouped'] = df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING)
+
+     print("\nGrouped Classes (Target Variable):")
+     grouped_dist = df['fire_size_grouped'].value_counts().sort_index()
+     for cls_idx, count in grouped_dist.items():
+         pct = count / len(df) * 100
+         cls_name = TARGET_CLASS_NAMES[cls_idx]
+         print(f"  {cls_idx} ({cls_name:>6}): {count:>10,} ({pct:>6.2f}%)")
+
+     # Visualize
+     fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+     # Original distribution
+     colors_orig = sns.color_palette("YlOrRd", 7)
+     ax1 = axes[0]
+     original_dist.plot(kind='bar', ax=ax1, color=colors_orig, edgecolor='black')
+     ax1.set_title('Original Fire Size Class Distribution', fontsize=14, fontweight='bold')
+     ax1.set_xlabel('Fire Size Class')
+     ax1.set_ylabel('Count')
+     ax1.tick_params(axis='x', rotation=0)
+
+     # Add percentage labels
+     for i, (idx, val) in enumerate(original_dist.items()):
+         pct = val / len(df) * 100
+         ax1.annotate(f'{pct:.1f}%', (i, val), ha='center', va='bottom', fontsize=9)
+
+     # Grouped distribution
+     colors_grouped = ['#2ecc71', '#f39c12', '#e74c3c']  # Green, Orange, Red
+     ax2 = axes[1]
+     grouped_dist.plot(kind='bar', ax=ax2, color=colors_grouped, edgecolor='black')
+     ax2.set_title('Grouped Fire Size Distribution (Target)', fontsize=14, fontweight='bold')
+     ax2.set_xlabel('Fire Size Category')
+     ax2.set_ylabel('Count')
+     ax2.set_xticklabels(TARGET_CLASS_NAMES, rotation=0)
+
+     # Add percentage labels
+     for i, (idx, val) in enumerate(grouped_dist.items()):
+         pct = val / len(df) * 100
+         ax2.annotate(f'{pct:.1f}%', (i, val), ha='center', va='bottom', fontsize=10)
+
+     plt.tight_layout()
+     plt.savefig(FIGURES_DIR / 'class_distribution.png', dpi=150, bbox_inches='tight')
+     plt.close()
+     print("\n  Saved: class_distribution.png")
+
+
+ def analyze_geographic_distribution(df: pd.DataFrame) -> None:
+     """Analyze and visualize geographic distribution of fires."""
+     print("\n" + "="*60)
+     print("GEOGRAPHIC DISTRIBUTION")
+     print("="*60)
+
+     # Top states
+     print("\nTop 15 States by Fire Count:")
+     state_dist = df['STATE'].value_counts().head(15)
+     for state, count in state_dist.items():
+         pct = count / len(df) * 100
+         print(f"  {state}: {count:>10,} ({pct:>5.1f}%)")
+
+     # Fire locations scatter plot
+     fig, axes = plt.subplots(1, 2, figsize=(16, 6))
+
+     # All fires (sampled for performance)
+     sample_size = min(100000, len(df))
+     df_sample = df.sample(n=sample_size, random_state=42)
+
+     ax1 = axes[0]
+     scatter = ax1.scatter(
+         df_sample['LONGITUDE'],
+         df_sample['LATITUDE'],
+         c=df_sample['FIRE_SIZE_CLASS'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}),
+         cmap='YlOrRd',
+         alpha=0.3,
+         s=1
+     )
+     ax1.set_title(f'Fire Locations (n={sample_size:,} sample)', fontsize=14, fontweight='bold')
+     ax1.set_xlabel('Longitude')
+     ax1.set_ylabel('Latitude')
+     ax1.set_xlim(-130, -65)
+     ax1.set_ylim(24, 50)
+     plt.colorbar(scatter, ax=ax1, label='Fire Size Class (A=0 to G=6)')
+
+     # Large fires only (E, F, G)
+     df_large = df[df['FIRE_SIZE_CLASS'].isin(['E', 'F', 'G'])]
+
+     ax2 = axes[1]
+     scatter2 = ax2.scatter(
+         df_large['LONGITUDE'],
+         df_large['LATITUDE'],
+         c=df_large['FIRE_SIZE_CLASS'].map({'E': 0, 'F': 1, 'G': 2}),
+         cmap='Reds',
+         alpha=0.5,
+         s=5
+     )
+     ax2.set_title(f'Large Fires Only (E/F/G, n={len(df_large):,})', fontsize=14, fontweight='bold')
+     ax2.set_xlabel('Longitude')
+     ax2.set_ylabel('Latitude')
+     ax2.set_xlim(-130, -65)
+     ax2.set_ylim(24, 50)
+
+     plt.tight_layout()
+     plt.savefig(FIGURES_DIR / 'geographic_distribution.png', dpi=150, bbox_inches='tight')
+     plt.close()
+     print("\n  Saved: geographic_distribution.png")
+
+
+ def analyze_temporal_patterns(df: pd.DataFrame) -> None:
+     """Analyze temporal patterns in the data."""
+     print("\n" + "="*60)
+     print("TEMPORAL PATTERNS")
+     print("="*60)
+
+     # Convert discovery day of year to month (DOY is stored numerically,
+     # so cast to str before parsing with '%j')
+     df['month'] = pd.to_datetime(df['DISCOVERY_DOY'].astype(int).astype(str), format='%j').dt.month
+
+     fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+
+     # Yearly trend
+     ax1 = axes[0, 0]
+     yearly = df.groupby('FIRE_YEAR').size()
+     yearly.plot(kind='line', ax=ax1, marker='o', linewidth=2, markersize=4)
+     ax1.set_title('Fires per Year', fontsize=12, fontweight='bold')
+     ax1.set_xlabel('Year')
+ ax1.set_ylabel('Number of Fires')
187
+ ax1.grid(True, alpha=0.3)
188
+
189
+ # Monthly distribution
190
+ ax2 = axes[0, 1]
191
+ monthly = df.groupby('month').size()
192
+ monthly.plot(kind='bar', ax=ax2, color='coral', edgecolor='black')
193
+ ax2.set_title('Fires by Month', fontsize=12, fontweight='bold')
194
+ ax2.set_xlabel('Month')
195
+ ax2.set_ylabel('Number of Fires')
196
+ ax2.tick_params(axis='x', rotation=0)
197
+
198
+ # Large fires by month
199
+ ax3 = axes[1, 0]
200
+ df['fire_size_grouped'] = df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING)
201
+ monthly_by_class = df.groupby(['month', 'fire_size_grouped']).size().unstack(fill_value=0)
202
+ monthly_by_class.columns = TARGET_CLASS_NAMES
203
+ monthly_by_class.plot(kind='bar', ax=ax3, width=0.8,
204
+ color=['#2ecc71', '#f39c12', '#e74c3c'], edgecolor='black')
205
+ ax3.set_title('Fire Size Category by Month', fontsize=12, fontweight='bold')
206
+ ax3.set_xlabel('Month')
207
+ ax3.set_ylabel('Number of Fires')
208
+ ax3.tick_params(axis='x', rotation=0)
209
+ ax3.legend(title='Size Category')
210
+
211
+ # Fire causes
212
+ ax4 = axes[1, 1]
213
+ cause_dist = df['STAT_CAUSE_DESCR'].value_counts().head(10)
214
+ cause_dist.plot(kind='barh', ax=ax4, color='steelblue', edgecolor='black')
215
+ ax4.set_title('Top 10 Fire Causes', fontsize=12, fontweight='bold')
216
+ ax4.set_xlabel('Number of Fires')
217
+ ax4.invert_yaxis()
218
+
219
+ plt.tight_layout()
220
+ plt.savefig(FIGURES_DIR / 'temporal_patterns.png', dpi=150, bbox_inches='tight')
221
+ plt.close()
222
+ print(f"\n Saved: temporal_patterns.png")
223
+
224
+ # Print monthly stats
225
+ print("\nFires by Month:")
226
+ month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
227
+ 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
228
+ for month, count in monthly.items():
229
+ pct = count / len(df) * 100
230
+ print(f" {month_names[month-1]}: {count:>10,} ({pct:>5.1f}%)")
231
+
232
+
233
+ def analyze_missing_values(df: pd.DataFrame) -> None:
234
+ """Analyze missing values in the dataset."""
235
+ print("\n" + "="*60)
236
+ print("MISSING VALUE ANALYSIS")
237
+ print("="*60)
238
+
239
+ missing = df.isnull().sum()
240
+ missing_pct = (missing / len(df) * 100).round(2)
241
+
242
+ missing_df = pd.DataFrame({
243
+ 'Missing Count': missing,
244
+ 'Missing %': missing_pct
245
+ }).sort_values('Missing Count', ascending=False)
246
+
247
+ # Only show columns with missing values
248
+ missing_df = missing_df[missing_df['Missing Count'] > 0]
249
+
250
+ print(f"\nColumns with missing values: {len(missing_df)}")
251
+ print("\nTop 20 columns with missing values:")
252
+ for col, row in missing_df.head(20).iterrows():
253
+ print(f" {col}: {row['Missing Count']:,} ({row['Missing %']:.1f}%)")
254
+
255
+ # Visualize
256
+ if len(missing_df) > 0:
257
+ fig, ax = plt.subplots(figsize=(12, 8))
258
+ missing_df.head(20)['Missing %'].plot(
259
+ kind='barh', ax=ax, color='salmon', edgecolor='black'
260
+ )
261
+ ax.set_title('Missing Values by Column (Top 20)', fontsize=14, fontweight='bold')
262
+ ax.set_xlabel('Missing %')
263
+ ax.invert_yaxis()
264
+
265
+ plt.tight_layout()
266
+ plt.savefig(FIGURES_DIR / 'missing_values.png', dpi=150, bbox_inches='tight')
267
+ plt.close()
268
+ print(f"\n Saved: missing_values.png")
269
+
270
+
271
+ def analyze_cause_by_size(df: pd.DataFrame) -> None:
272
+ """Analyze fire causes by fire size category."""
273
+ print("\n" + "="*60)
274
+ print("FIRE CAUSE BY SIZE ANALYSIS")
275
+ print("="*60)
276
+
277
+ df['fire_size_grouped'] = df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING)
278
+
279
+ # Cross-tabulation
280
+ cause_size = pd.crosstab(
281
+ df['STAT_CAUSE_DESCR'],
282
+ df['fire_size_grouped'],
283
+ normalize='index'
284
+ ) * 100
285
+ cause_size.columns = TARGET_CLASS_NAMES
286
+
287
+ print("\nFire Cause Distribution by Size Category (% of each cause):")
288
+ print(cause_size.round(1).to_string())
289
+
290
+ # Visualize
291
+ fig, ax = plt.subplots(figsize=(12, 8))
292
+ cause_size.plot(kind='barh', ax=ax, stacked=True,
293
+ color=['#2ecc71', '#f39c12', '#e74c3c'], edgecolor='white')
294
+ ax.set_title('Fire Size Distribution by Cause', fontsize=14, fontweight='bold')
295
+ ax.set_xlabel('Percentage')
296
+ ax.legend(title='Size Category', loc='lower right')
297
+ ax.invert_yaxis()
298
+
299
+ plt.tight_layout()
300
+ plt.savefig(FIGURES_DIR / 'cause_by_size.png', dpi=150, bbox_inches='tight')
301
+ plt.close()
302
+ print(f"\n Saved: cause_by_size.png")
303
+
304
+
305
+ def analyze_owner_distribution(df: pd.DataFrame) -> None:
306
+ """Analyze land owner distribution."""
307
+ print("\n" + "="*60)
308
+ print("LAND OWNER ANALYSIS")
309
+ print("="*60)
310
+
311
+ owner_dist = df['OWNER_DESCR'].value_counts()
312
+ print("\nFires by Land Owner:")
313
+ for owner, count in owner_dist.head(10).items():
314
+ pct = count / len(df) * 100
315
+ print(f" {owner}: {count:,} ({pct:.1f}%)")
316
+
317
+
318
+ def main():
319
+ """Main EDA pipeline."""
320
+ print("\n" + "="*60)
321
+ print("EXPLORATORY DATA ANALYSIS")
322
+ print("="*60)
323
+
324
+ # Create figures directory
325
+ FIGURES_DIR.mkdir(parents=True, exist_ok=True)
326
+
327
+ # Load data
328
+ df = load_data()
329
+
330
+ # Run analyses
331
+ analyze_class_distribution(df)
332
+ analyze_geographic_distribution(df)
333
+ analyze_temporal_patterns(df)
334
+ analyze_missing_values(df)
335
+ analyze_cause_by_size(df)
336
+ analyze_owner_distribution(df)
337
+
338
+ print("\n" + "="*60)
339
+ print("✓ EDA Complete!")
340
+ print(f" Figures saved to: {FIGURES_DIR}")
341
+ print("="*60 + "\n")
342
+
343
+
344
+ if __name__ == "__main__":
345
+ main()
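The cause-by-size analysis above hinges on `pd.crosstab(..., normalize='index')`, which turns raw counts into row-wise percentages so each cause's row sums to 100%. A minimal sketch on toy data (the column names here are illustrative, not the real dataset schema):

```python
import pandas as pd

# Toy records mirroring the cause-by-size cross-tabulation
df = pd.DataFrame({
    "cause": ["Lightning", "Lightning", "Debris", "Debris", "Debris"],
    "size":  ["Small", "Large", "Small", "Small", "Medium"],
})

# normalize='index' divides each cell by its row total,
# so every cause row sums to 100 after scaling
pct = pd.crosstab(df["cause"], df["size"], normalize="index") * 100
print(pct.round(1))
```

With `normalize='columns'` instead, each size class would sum to 100%, answering a different question (which causes dominate each size class).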
scripts/03_preprocess.py ADDED
@@ -0,0 +1,280 @@
+ """
+ Script 03: Data Preprocessing
+
+ This script preprocesses the raw wildfire data:
+ - Creates ordinal target variable (3 classes: Small, Medium, Large)
+ - Drops irrelevant columns (IDs, text fields, redundant info)
+ - Handles missing values
+ - Encodes categorical variables
+ - Splits data into train/test sets (stratified)
+
+ Usage:
+     python scripts/03_preprocess.py
+ """
+
+ import sys
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+
+ # Add project root to path
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+
+ from config.config import (
+     RAW_PARQUET,
+     PROCESSED_PARQUET,
+     TRAIN_PARQUET,
+     TEST_PARQUET,
+     PROCESSED_DATA_DIR,
+     FIRE_SIZE_CLASS_MAPPING,
+     TARGET_CLASS_NAMES,
+     TARGET_COLUMN,
+     COLUMNS_TO_DROP,
+     CATEGORICAL_FEATURES,
+     RANDOM_STATE,
+     TEST_SIZE
+ )
+
+
+ def load_data() -> pd.DataFrame:
+     """Load the raw parquet data."""
+     print("Loading raw data...")
+     df = pd.read_parquet(RAW_PARQUET)
+     print(f"  Loaded {len(df):,} records with {len(df.columns)} columns")
+     return df
+
+
+ def create_target_variable(df: pd.DataFrame) -> pd.DataFrame:
+     """Create ordinal target variable from FIRE_SIZE_CLASS."""
+     print("\nCreating ordinal target variable...")
+
+     # Map original classes to ordinal (0, 1, 2)
+     df[TARGET_COLUMN] = df['FIRE_SIZE_CLASS'].map(FIRE_SIZE_CLASS_MAPPING)
+
+     # Check for unmapped values
+     unmapped = df[TARGET_COLUMN].isna().sum()
+     if unmapped > 0:
+         print(f"  Warning: {unmapped} records could not be mapped. Dropping...")
+         df = df.dropna(subset=[TARGET_COLUMN])
+
+     df[TARGET_COLUMN] = df[TARGET_COLUMN].astype(int)
+
+     # Print distribution
+     print("\n  Target Variable Distribution:")
+     for val in sorted(df[TARGET_COLUMN].unique()):
+         count = (df[TARGET_COLUMN] == val).sum()
+         pct = count / len(df) * 100
+         print(f"    {val} ({TARGET_CLASS_NAMES[val]}): {count:,} ({pct:.2f}%)")
+
+     return df
+
+
+ def drop_irrelevant_columns(df: pd.DataFrame) -> pd.DataFrame:
+     """Drop columns not useful for prediction."""
+     print("\nDropping irrelevant columns...")
+
+     # Get columns that exist in the dataframe
+     cols_to_drop = [col for col in COLUMNS_TO_DROP if col in df.columns]
+
+     print(f"  Dropping {len(cols_to_drop)} columns:")
+     for col in cols_to_drop[:10]:
+         print(f"    - {col}")
+     if len(cols_to_drop) > 10:
+         print(f"    ... and {len(cols_to_drop) - 10} more")
+
+     df = df.drop(columns=cols_to_drop, errors='ignore')
+     print(f"  Remaining columns: {len(df.columns)}")
+
+     return df
+
+
+ def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
+     """Handle missing values in the dataset."""
+     print("\nHandling missing values...")
+
+     initial_rows = len(df)
+
+     # Check missing in essential columns
+     essential_cols = ['LATITUDE', 'LONGITUDE', 'FIRE_YEAR', 'DISCOVERY_DOY', TARGET_COLUMN]
+     for col in essential_cols:
+         if col in df.columns:
+             missing = df[col].isna().sum()
+             if missing > 0:
+                 print(f"  {col}: {missing} missing values")
+
+     # Drop rows with missing essential values
+     df = df.dropna(subset=[c for c in essential_cols if c in df.columns])
+
+     # For categorical features, fill with 'Unknown'
+     for col in CATEGORICAL_FEATURES:
+         if col in df.columns:
+             missing = df[col].isna().sum()
+             if missing > 0:
+                 df[col] = df[col].fillna('Unknown')
+                 print(f"  {col}: Filled {missing} missing with 'Unknown'")
+
+     rows_dropped = initial_rows - len(df)
+     print(f"\n  Rows dropped due to missing essential values: {rows_dropped:,}")
+     print(f"  Remaining rows: {len(df):,}")
+
+     return df
+
+
+ def encode_categorical_features(df: pd.DataFrame) -> tuple[pd.DataFrame, dict]:
+     """Encode categorical features using Label Encoding."""
+     print("\nEncoding categorical features...")
+
+     encoders = {}
+
+     for col in CATEGORICAL_FEATURES:
+         if col in df.columns:
+             le = LabelEncoder()
+             df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
+             encoders[col] = le
+
+             n_categories = len(le.classes_)
+             print(f"  {col}: {n_categories} categories")
+
+     return df, encoders
+
+
+ def select_features(df: pd.DataFrame) -> pd.DataFrame:
+     """Select features for modeling."""
+     print("\nSelecting features for modeling...")
+
+     # Features to keep
+     feature_cols = [
+         # Numerical
+         'LATITUDE', 'LONGITUDE', 'FIRE_YEAR', 'DISCOVERY_DOY',
+         # Encoded categorical
+         'NWCG_REPORTING_AGENCY_encoded',
+         'STAT_CAUSE_DESCR_encoded',
+         'STATE_encoded',
+         'OWNER_DESCR_encoded',
+         # Target
+         TARGET_COLUMN
+     ]
+
+     # Keep only columns that exist
+     available_cols = [col for col in feature_cols if col in df.columns]
+
+     # Also keep original categorical columns for reference
+     original_cats = [col for col in CATEGORICAL_FEATURES if col in df.columns]
+
+     all_cols = available_cols + original_cats
+     all_cols = list(dict.fromkeys(all_cols))  # Remove duplicates, preserve order
+
+     df = df[all_cols]
+
+     print(f"  Selected {len(available_cols)} feature columns + target")
+     print(f"  Final columns: {list(df.columns)}")
+
+     return df
+
+
+ def split_data(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+     """Split data into train and test sets."""
+     print("\nSplitting data into train/test sets...")
+
+     train_df, test_df = train_test_split(
+         df,
+         test_size=TEST_SIZE,
+         random_state=RANDOM_STATE,
+         stratify=df[TARGET_COLUMN]
+     )
+
+     print(f"  Train set: {len(train_df):,} rows ({100*(1-TEST_SIZE):.0f}%)")
+     print(f"  Test set:  {len(test_df):,} rows ({100*TEST_SIZE:.0f}%)")
+
+     # Verify stratification
+     print("\n  Target distribution in splits:")
+     for name, data in [('Train', train_df), ('Test', test_df)]:
+         dist = data[TARGET_COLUMN].value_counts(normalize=True).sort_index() * 100
+         dist_str = ", ".join([f"{TARGET_CLASS_NAMES[i]}: {v:.1f}%" for i, v in dist.items()])
+         print(f"    {name}: {dist_str}")
+
+     return train_df, test_df
+
+
+ def save_data(df: pd.DataFrame, train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
+     """Save processed data to parquet files."""
+     print("\nSaving processed data...")
+
+     # Create directory if needed
+     PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+     # Save full processed data
+     df.to_parquet(PROCESSED_PARQUET, index=False)
+     print(f"  Full processed data: {PROCESSED_PARQUET}")
+
+     # Save train/test splits
+     train_df.to_parquet(TRAIN_PARQUET, index=False)
+     print(f"  Train data: {TRAIN_PARQUET}")
+
+     test_df.to_parquet(TEST_PARQUET, index=False)
+     print(f"  Test data: {TEST_PARQUET}")
+
+
+ def print_summary(df: pd.DataFrame) -> None:
+     """Print preprocessing summary."""
+     print("\n" + "="*60)
+     print("PREPROCESSING SUMMARY")
+     print("="*60)
+
+     print(f"\nDataset shape: {df.shape}")
+     print("\nColumn types:")
+     print(df.dtypes.value_counts().to_string())
+
+     print("\nFeature statistics:")
+     numerical_cols = df.select_dtypes(include=[np.number]).columns
+     for col in numerical_cols:
+         if col != TARGET_COLUMN:
+             print(f"  {col}:")
+             print(f"    Range: [{df[col].min():.2f}, {df[col].max():.2f}]")
+             print(f"    Mean: {df[col].mean():.2f}, Std: {df[col].std():.2f}")
+
+
+ def main():
+     """Main preprocessing pipeline."""
+     print("\n" + "="*60)
+     print("DATA PREPROCESSING")
+     print("="*60)
+
+     # Load data
+     df = load_data()
+
+     # Create target variable
+     df = create_target_variable(df)
+
+     # Drop irrelevant columns
+     df = drop_irrelevant_columns(df)
+
+     # Handle missing values
+     df = handle_missing_values(df)
+
+     # Encode categorical features
+     df, encoders = encode_categorical_features(df)
+
+     # Select features
+     df = select_features(df)
+
+     # Split data
+     train_df, test_df = split_data(df)
+
+     # Save data
+     save_data(df, train_df, test_df)
+
+     # Print summary
+     print_summary(df)
+
+     print("\n" + "="*60)
+     print("✓ Preprocessing Complete!")
+     print("="*60 + "\n")
+
+
+ if __name__ == "__main__":
+     main()
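The two load-bearing steps in this preprocessing script are label-encoding the categoricals and the stratified train/test split, which keeps the (heavily imbalanced) class proportions identical in both halves. A minimal sketch on a toy frame (the column names are stand-ins, not the real schema):

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Toy frame: 100 rows, target distribution 50% / 25% / 25%
df = pd.DataFrame({
    "cause": ["Arson", "Lightning", "Arson", "Campfire"] * 25,
    "target": [0, 0, 1, 2] * 25,
})

# Label-encode the categorical column, as in encode_categorical_features()
le = LabelEncoder()
df["cause_encoded"] = le.fit_transform(df["cause"].astype(str))

# stratify= preserves the target distribution in both splits
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["target"]
)
print(train_df["target"].value_counts(normalize=True).sort_index())
```

Without `stratify=`, a rare class can end up over- or under-represented in the test set purely by chance, which would distort every downstream metric.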
scripts/04_feature_engineering.py ADDED
@@ -0,0 +1,290 @@
+ """
+ Script 04: Feature Engineering
+
+ This script creates additional features for the model:
+ - Temporal features (month, season, day of week)
+ - Geospatial features (lat/lon bins, clustering, interactions)
+ - Coordinate transformations
+
+ Usage:
+     python scripts/04_feature_engineering.py
+ """
+
+ import sys
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.cluster import KMeans
+ from sklearn.preprocessing import StandardScaler
+
+ # Add project root to path
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+
+ from config.config import (
+     TRAIN_PARQUET,
+     TEST_PARQUET,
+     FEATURES_PARQUET,
+     PROCESSED_DATA_DIR,
+     TARGET_COLUMN,
+     N_GEO_CLUSTERS,
+     LAT_BINS,
+     LON_BINS,
+     RANDOM_STATE
+ )
+
+
+ def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
+     """Load train and test data."""
+     print("Loading data...")
+     train_df = pd.read_parquet(TRAIN_PARQUET)
+     test_df = pd.read_parquet(TEST_PARQUET)
+     print(f"  Train: {len(train_df):,} rows")
+     print(f"  Test:  {len(test_df):,} rows")
+     return train_df, test_df
+
+
+ def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
+     """Create temporal features from DISCOVERY_DOY."""
+     print("\nCreating temporal features...")
+
+     # Convert day of year to datetime for feature extraction
+     # Using a non-leap year as reference
+     reference_year = 2001
+     df['temp_date'] = pd.to_datetime(
+         df['DISCOVERY_DOY'].astype(int).astype(str) + f'-{reference_year}',
+         format='%j-%Y',
+         errors='coerce'
+     )
+
+     # Handle invalid dates
+     invalid_dates = df['temp_date'].isna().sum()
+     if invalid_dates > 0:
+         print(f"  Warning: {invalid_dates} invalid day of year values")
+         # Fill with median day
+         median_doy = df['DISCOVERY_DOY'].median()
+         df.loc[df['temp_date'].isna(), 'temp_date'] = pd.to_datetime(
+             f'{int(median_doy)}-{reference_year}', format='%j-%Y'
+         )
+
+     # Extract features
+     df['month'] = df['temp_date'].dt.month
+     df['day_of_week'] = df['temp_date'].dt.dayofweek  # 0=Monday, 6=Sunday
+     df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
+
+     # Season (1=Winter, 2=Spring, 3=Summer, 4=Fall)
+     df['season'] = df['month'].apply(lambda m:
+         1 if m in [12, 1, 2] else
+         2 if m in [3, 4, 5] else
+         3 if m in [6, 7, 8] else 4
+     )
+
+     # Fire season indicator (peak fire months: June-October)
+     df['is_fire_season'] = df['month'].isin([6, 7, 8, 9, 10]).astype(int)
+
+     # Drop temporary date column
+     df = df.drop(columns=['temp_date'])
+
+     print("  Created: month, day_of_week, is_weekend, season, is_fire_season")
+
+     return df
+
+
+ def create_geospatial_features(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, KMeans]:
+     """Create geospatial features from coordinates."""
+     print("\nCreating geospatial features...")
+
+     # 1. Latitude/Longitude bins
+     print("  Creating coordinate bins...")
+
+     # Define bin edges based on continental US bounds
+     lat_min, lat_max = 24.0, 50.0
+     lon_min, lon_max = -125.0, -66.0
+
+     lat_edges = np.linspace(lat_min, lat_max, LAT_BINS + 1)
+     lon_edges = np.linspace(lon_min, lon_max, LON_BINS + 1)
+
+     for df in [train_df, test_df]:
+         df['lat_bin'] = pd.cut(df['LATITUDE'], bins=lat_edges, labels=False, include_lowest=True)
+         df['lon_bin'] = pd.cut(df['LONGITUDE'], bins=lon_edges, labels=False, include_lowest=True)
+
+         # Fill NaN bins (locations outside continental US) with nearest bin
+         df['lat_bin'] = df['lat_bin'].fillna(df['lat_bin'].median()).astype(int)
+         df['lon_bin'] = df['lon_bin'].fillna(df['lon_bin'].median()).astype(int)
+
+     # 2. Geographic clustering using K-Means
+     print(f"  Fitting K-Means clustering (k={N_GEO_CLUSTERS})...")
+
+     # Prepare coordinates for clustering
+     train_coords = train_df[['LATITUDE', 'LONGITUDE']].values
+     test_coords = test_df[['LATITUDE', 'LONGITUDE']].values
+
+     # Scale coordinates
+     scaler = StandardScaler()
+     train_coords_scaled = scaler.fit_transform(train_coords)
+     test_coords_scaled = scaler.transform(test_coords)
+
+     # Fit K-Means on train data
+     kmeans = KMeans(n_clusters=N_GEO_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
+     train_df['geo_cluster'] = kmeans.fit_predict(train_coords_scaled)
+     test_df['geo_cluster'] = kmeans.predict(test_coords_scaled)
+
+     print("  Cluster distribution (train):")
+     cluster_dist = train_df['geo_cluster'].value_counts().sort_index()
+     for cluster, count in cluster_dist.items():
+         pct = count / len(train_df) * 100
+         if pct >= 3:  # Only show clusters with >= 3%
+             print(f"    Cluster {cluster}: {count:,} ({pct:.1f}%)")
+
+     # 3. Coordinate interactions
+     print("  Creating coordinate interactions...")
+
+     for df in [train_df, test_df]:
+         # Quadratic terms (captures non-linear patterns)
+         df['lat_squared'] = df['LATITUDE'] ** 2
+         df['lon_squared'] = df['LONGITUDE'] ** 2
+         df['lat_lon_interaction'] = df['LATITUDE'] * df['LONGITUDE']
+
+         # Distance from geographic center of continental US
+         # Approximate center: 39.8°N, 98.6°W
+         center_lat, center_lon = 39.8, -98.6
+         df['dist_from_center'] = np.sqrt(
+             (df['LATITUDE'] - center_lat) ** 2 +
+             (df['LONGITUDE'] - center_lon) ** 2
+         )
+
+     print("  Created: lat_bin, lon_bin, geo_cluster, lat_squared, lon_squared, lat_lon_interaction, dist_from_center")
+
+     return train_df, test_df, kmeans
+
+
+ def create_cyclical_features(df: pd.DataFrame) -> pd.DataFrame:
+     """Create cyclical encoding for periodic features."""
+     print("\nCreating cyclical features...")
+
+     # Cyclical encoding for month (captures January-December continuity)
+     df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
+     df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
+
+     # Cyclical encoding for day of year
+     df['doy_sin'] = np.sin(2 * np.pi * df['DISCOVERY_DOY'] / 365)
+     df['doy_cos'] = np.cos(2 * np.pi * df['DISCOVERY_DOY'] / 365)
+
+     # Cyclical encoding for day of week
+     df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
+     df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
+
+     print("  Created: month_sin/cos, doy_sin/cos, dow_sin/cos")
+
+     return df
+
+
+ def create_year_features(df: pd.DataFrame) -> pd.DataFrame:
+     """Create year-based features."""
+     print("\nCreating year features...")
+
+     # Normalized year (0-1 scale for 1992-2015)
+     min_year, max_year = 1992, 2015
+     df['year_normalized'] = (df['FIRE_YEAR'] - min_year) / (max_year - min_year)
+
+     # Years since start
+     df['years_since_1992'] = df['FIRE_YEAR'] - min_year
+
+     print("  Created: year_normalized, years_since_1992")
+
+     return df
+
+
+ def get_feature_columns(df: pd.DataFrame) -> list:
+     """Get list of feature columns for modeling."""
+     # Exclude target, original categorical text columns, and intermediate columns
+     exclude_cols = [
+         TARGET_COLUMN,
+         'NWCG_REPORTING_AGENCY', 'STAT_CAUSE_DESCR', 'STATE', 'OWNER_DESCR',
+         'COUNTY'  # If present
+     ]
+
+     feature_cols = [col for col in df.columns if col not in exclude_cols]
+     return feature_cols
+
+
+ def save_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
+     """Save feature-engineered data."""
+     print("\nSaving feature-engineered data...")
+
+     # Overwrite train/test files with new features
+     train_df.to_parquet(TRAIN_PARQUET, index=False)
+     test_df.to_parquet(TEST_PARQUET, index=False)
+
+     print(f"  Train data: {TRAIN_PARQUET}")
+     print(f"  Test data: {TEST_PARQUET}")
+
+     # Also save combined for reference
+     combined = pd.concat([train_df, test_df], ignore_index=True)
+     combined.to_parquet(FEATURES_PARQUET, index=False)
+     print(f"  Combined data: {FEATURES_PARQUET}")
+
+
+ def print_summary(train_df: pd.DataFrame) -> None:
+     """Print feature engineering summary."""
+     print("\n" + "="*60)
+     print("FEATURE ENGINEERING SUMMARY")
+     print("="*60)
+
+     feature_cols = get_feature_columns(train_df)
+
+     print(f"\nTotal features: {len(feature_cols)}")
+     print("\nFeature list:")
+
+     # Group features by type
+     temporal = [c for c in feature_cols if c in ['month', 'day_of_week', 'is_weekend', 'season', 'is_fire_season',
+                                                  'month_sin', 'month_cos', 'doy_sin', 'doy_cos', 'dow_sin', 'dow_cos']]
+     geospatial = [c for c in feature_cols if c in ['lat_bin', 'lon_bin', 'geo_cluster', 'lat_squared', 'lon_squared',
+                                                    'lat_lon_interaction', 'dist_from_center', 'LATITUDE', 'LONGITUDE']]
+     year_feats = [c for c in feature_cols if c in ['FIRE_YEAR', 'year_normalized', 'years_since_1992', 'DISCOVERY_DOY']]
+     encoded = [c for c in feature_cols if c.endswith('_encoded')]
+
+     print(f"\n  Temporal ({len(temporal)}): {temporal}")
+     print(f"\n  Geospatial ({len(geospatial)}): {geospatial}")
+     print(f"\n  Year-based ({len(year_feats)}): {year_feats}")
+     print(f"\n  Encoded categorical ({len(encoded)}): {encoded}")
+
+
+ def main():
+     """Main feature engineering pipeline."""
+     print("\n" + "="*60)
+     print("FEATURE ENGINEERING")
+     print("="*60)
+
+     # Load data
+     train_df, test_df = load_data()
+
+     # Create temporal features
+     train_df = create_temporal_features(train_df)
+     test_df = create_temporal_features(test_df)
+
+     # Create geospatial features
+     train_df, test_df, kmeans = create_geospatial_features(train_df, test_df)
+
+     # Create cyclical features
+     train_df = create_cyclical_features(train_df)
+     test_df = create_cyclical_features(test_df)
+
+     # Create year features
+     train_df = create_year_features(train_df)
+     test_df = create_year_features(test_df)
+
+     # Save data
+     save_data(train_df, test_df)
+
+     # Print summary
+     print_summary(train_df)
+
+     print("\n" + "="*60)
+     print("✓ Feature Engineering Complete!")
+     print("="*60 + "\n")
+
+
+ if __name__ == "__main__":
+     main()
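The sin/cos trick used in `create_cyclical_features` projects a periodic feature onto the unit circle, so the model sees December and January as neighbors rather than 11 units apart. A small sketch of the same encoding (the helper function name is ours, not part of the scripts):

```python
import numpy as np

def month_to_cyclical(month: int) -> tuple[float, float]:
    """Project a month (1-12) onto the unit circle."""
    angle = 2 * np.pi * month / 12
    return np.sin(angle), np.cos(angle)

# December (12) and January (1) are adjacent on the circle,
# while December and June sit on opposite sides of it.
dec = np.array(month_to_cyclical(12))
jan = np.array(month_to_cyclical(1))
jun = np.array(month_to_cyclical(6))
print(np.linalg.norm(dec - jan))  # small: adjacent months
print(np.linalg.norm(dec - jun))  # large: opposite months
```

The same reasoning motivates the `doy_sin`/`doy_cos` and `dow_sin`/`dow_cos` pairs: day 365 borders day 1, and Sunday borders Monday.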
scripts/05_train_model.py ADDED
@@ -0,0 +1,370 @@
+ """
+ Script 05: Model Training
+
+ This script trains the ordinal classification model:
+ - Uses LightGBM for multi-class ordinal classification
+ - Implements class weighting for imbalanced data
+ - Performs cross-validation
+ - Includes hyperparameter tuning with Optuna
+ - Saves the trained model
+
+ Ordinal Classification Approach:
+     Since fire size classes have a natural order (Small < Medium < Large),
+     we use ordinal-aware training with cumulative link model concepts.
+
+ Usage:
+     python scripts/05_train_model.py [--tune]
+ """
+
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ import joblib
+ import lightgbm as lgb
+ import numpy as np
+ import optuna
+ import pandas as pd
+ from sklearn.metrics import (
+     accuracy_score,
+     balanced_accuracy_score,
+     classification_report,
+     cohen_kappa_score,
+     f1_score,
+ )
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.utils.class_weight import compute_class_weight
+
+ # Add project root to path
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+
+ from config.config import (
+     TRAIN_PARQUET,
+     TEST_PARQUET,
+     MODELS_DIR,
+     TARGET_COLUMN,
+     TARGET_CLASS_NAMES,
+     LIGHTGBM_PARAMS,
+     OPTUNA_SEARCH_SPACE,
+     N_OPTUNA_TRIALS,
+     N_FOLDS,
+     RANDOM_STATE,
+     USE_CLASS_WEIGHTS,
+     PRIMARY_METRIC
+ )
+
+
+ def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
+     """Load train and test data."""
+     print("Loading data...")
+     train_df = pd.read_parquet(TRAIN_PARQUET)
+     test_df = pd.read_parquet(TEST_PARQUET)
+     print(f"  Train: {len(train_df):,} rows")
+     print(f"  Test:  {len(test_df):,} rows")
+     return train_df, test_df
+
+
+ def get_feature_columns(df: pd.DataFrame) -> list:
+     """Get list of feature columns for modeling."""
+     exclude_cols = [
+         TARGET_COLUMN,
+         'NWCG_REPORTING_AGENCY', 'STAT_CAUSE_DESCR', 'STATE', 'OWNER_DESCR',
+         'COUNTY'
+     ]
+     feature_cols = [col for col in df.columns if col not in exclude_cols]
+     return feature_cols
+
+
+ def prepare_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple:
+     """Prepare features and targets for training."""
+     print("\nPreparing data...")
+
+     feature_cols = get_feature_columns(train_df)
+
+     X_train = train_df[feature_cols].values
+     y_train = train_df[TARGET_COLUMN].values
+     X_test = test_df[feature_cols].values
+     y_test = test_df[TARGET_COLUMN].values
+
+     print(f"  Features: {len(feature_cols)}")
+     print(f"  Feature columns: {feature_cols}")
+
+     return X_train, y_train, X_test, y_test, feature_cols
+
+
+ def compute_weights(y_train: np.ndarray) -> np.ndarray:
+     """Compute sample weights for class imbalance."""
+     print("\nComputing class weights...")
+
+     classes = np.unique(y_train)
+     class_weights = compute_class_weight(
+         class_weight='balanced',
+         classes=classes,
+         y=y_train
+     )
+
+     weight_dict = dict(zip(classes, class_weights))
+     print(f"  Class weights: {weight_dict}")
+
+     # Create sample weights array
+     sample_weights = np.array([weight_dict[y] for y in y_train])
+
+     return sample_weights
+
+
+ def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, prefix: str = "") -> dict:
+     """Evaluate model predictions."""
+     metrics = {
+         'accuracy': accuracy_score(y_true, y_pred),
+         'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
+         'macro_f1': f1_score(y_true, y_pred, average='macro'),
+         'weighted_f1': f1_score(y_true, y_pred, average='weighted'),
+         'cohen_kappa': cohen_kappa_score(y_true, y_pred, weights='linear')  # Linear weights for ordinal
+     }
+
+     if prefix:
+         print(f"\n{prefix} Metrics:")
+     else:
+         print("\nMetrics:")
+
+     for name, value in metrics.items():
+         print(f"  {name}: {value:.4f}")
+
+     return metrics
+
+
+ def cross_validate(X: np.ndarray, y: np.ndarray, params: dict,
+                    sample_weights: np.ndarray = None) -> tuple[float, float]:
+     """Perform cross-validation and return mean and std of primary metric."""
+
+     skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
+     scores = []
+
+     for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
+         X_fold_train, X_fold_val = X[train_idx], X[val_idx]
+         y_fold_train, y_fold_val = y[train_idx], y[val_idx]
+
+         if sample_weights is not None:
+             weights_fold = sample_weights[train_idx]
+         else:
+             weights_fold = None
+
+         # Create LightGBM datasets
+         train_data = lgb.Dataset(X_fold_train, label=y_fold_train, weight=weights_fold)
+         val_data = lgb.Dataset(X_fold_val, label=y_fold_val, reference=train_data)
157
+
158
+ # Train model
159
+ model = lgb.train(
160
+ params,
161
+ train_data,
162
+ num_boost_round=params.get('n_estimators', 500),
163
+ valid_sets=[val_data],
164
+ callbacks=[lgb.early_stopping(50, verbose=False)]
165
+ )
166
+
167
+ # Predict
168
+ y_pred = model.predict(X_fold_val)
169
+ y_pred_class = np.argmax(y_pred, axis=1)
170
+
171
+ # Score
172
+ score = f1_score(y_fold_val, y_pred_class, average='macro')
173
+ scores.append(score)
174
+
175
+ return np.mean(scores), np.std(scores)
176
+
177
+
178
+ def objective(trial: optuna.Trial, X: np.ndarray, y: np.ndarray,
179
+ sample_weights: np.ndarray) -> float:
180
+ """Optuna objective function for hyperparameter tuning."""
181
+
182
+ params = LIGHTGBM_PARAMS.copy()
183
+
184
+ # Sample hyperparameters
185
+ params['n_estimators'] = trial.suggest_int('n_estimators', *OPTUNA_SEARCH_SPACE['n_estimators'])
186
+ params['max_depth'] = trial.suggest_int('max_depth', *OPTUNA_SEARCH_SPACE['max_depth'])
187
+ params['learning_rate'] = trial.suggest_float('learning_rate', *OPTUNA_SEARCH_SPACE['learning_rate'], log=True)
188
+ params['num_leaves'] = trial.suggest_int('num_leaves', *OPTUNA_SEARCH_SPACE['num_leaves'])
189
+ params['min_child_samples'] = trial.suggest_int('min_child_samples', *OPTUNA_SEARCH_SPACE['min_child_samples'])
190
+ params['subsample'] = trial.suggest_float('subsample', *OPTUNA_SEARCH_SPACE['subsample'])
191
+ params['colsample_bytree'] = trial.suggest_float('colsample_bytree', *OPTUNA_SEARCH_SPACE['colsample_bytree'])
192
+ params['reg_alpha'] = trial.suggest_float('reg_alpha', *OPTUNA_SEARCH_SPACE['reg_alpha'])
193
+ params['reg_lambda'] = trial.suggest_float('reg_lambda', *OPTUNA_SEARCH_SPACE['reg_lambda'])
194
+
195
+ # Cross-validate
196
+ mean_score, _ = cross_validate(X, y, params, sample_weights)
197
+
198
+ return mean_score
199
+
200
+
201
+ def tune_hyperparameters(X: np.ndarray, y: np.ndarray,
202
+ sample_weights: np.ndarray) -> dict:
203
+ """Tune hyperparameters using Optuna."""
204
+ print("\n" + "="*60)
205
+ print("HYPERPARAMETER TUNING")
206
+ print("="*60)
207
+
208
+ print(f"\nRunning {N_OPTUNA_TRIALS} Optuna trials...")
209
+
210
+ # Create study
211
+ study = optuna.create_study(
212
+ direction='maximize',
213
+ sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE)
214
+ )
215
+
216
+ # Optimize
217
+ study.optimize(
218
+ lambda trial: objective(trial, X, y, sample_weights),
219
+ n_trials=N_OPTUNA_TRIALS,
220
+ show_progress_bar=True
221
+ )
222
+
223
+ print("\nBest trial:")
224
+ print(f" Value (macro F1): {study.best_trial.value:.4f}")
225
+ print(f" Params: {study.best_trial.params}")
226
+
227
+ # Merge best params with base params
228
+ best_params = LIGHTGBM_PARAMS.copy()
229
+ best_params.update(study.best_trial.params)
230
+
231
+ best_params_path = MODELS_DIR / 'best_params.json'
232
+ with open(best_params_path, 'w') as f:
233
+ json.dump(study.best_trial.params, f)
234
+ print(f" Best params saved: {best_params_path}")
235
+
236
+ return best_params
237
+
238
+
239
+ def train_final_model(X_train: np.ndarray, y_train: np.ndarray,
240
+ X_test: np.ndarray, y_test: np.ndarray,
241
+ params: dict, sample_weights: np.ndarray,
242
+ feature_names: list) -> tuple[lgb.Booster, dict]:
243
+ """Train final model on full training data."""
244
+ print("\n" + "="*60)
245
+ print("TRAINING FINAL MODEL")
246
+ print("="*60)
247
+
248
+ # Create datasets
249
+ train_data = lgb.Dataset(X_train, label=y_train, weight=sample_weights,
250
+ feature_name=feature_names)
251
+ val_data = lgb.Dataset(X_test, label=y_test, reference=train_data,
252
+ feature_name=feature_names)
253
+
254
+ # Train
255
+ print("\nTraining...")
256
+ model = lgb.train(
257
+ params,
258
+ train_data,
259
+ num_boost_round=params.get('n_estimators', 2000),
260
+ valid_sets=[train_data, val_data],
261
+ valid_names=['train', 'test'],
262
+ callbacks=[
263
+ lgb.early_stopping(50, verbose=True),
264
+ lgb.log_evaluation(period=50)
265
+ ]
266
+ )
267
+
268
+ # Evaluate
269
+ print("\n" + "-"*40)
270
+
271
+ # Train predictions
272
+ y_train_pred = np.argmax(model.predict(X_train), axis=1)
273
+ evaluate_model(y_train, y_train_pred, "Train")
274
+
275
+ # Test predictions
276
+ y_test_pred = np.argmax(model.predict(X_test), axis=1)
277
+ test_metrics = evaluate_model(y_test, y_test_pred, "Test")
278
+
279
+ # Classification report
280
+ print("\nClassification Report (Test):")
281
+ print(classification_report(y_test, y_test_pred, target_names=TARGET_CLASS_NAMES))
282
+
283
+ return model, test_metrics
284
+
285
+
286
+ def save_model(model: lgb.Booster, params: dict, feature_names: list, metrics: dict) -> None:
287
+ """Save trained model and metadata."""
288
+ print("\nSaving model...")
289
+
290
+ MODELS_DIR.mkdir(parents=True, exist_ok=True)
291
+
292
+ # Save LightGBM model
293
+ model_path = MODELS_DIR / 'wildfire_model.txt'
294
+ model.save_model(str(model_path))
295
+ print(f" Model: {model_path}")
296
+
297
+ # Save metadata
298
+ metadata = {
299
+ 'params': params,
300
+ 'feature_names': feature_names,
301
+ 'metrics': metrics,
302
+ 'target_classes': TARGET_CLASS_NAMES
303
+ }
304
+ metadata_path = MODELS_DIR / 'model_metadata.joblib'
305
+ joblib.dump(metadata, metadata_path)
306
+ print(f" Metadata: {metadata_path}")
307
+
308
+
309
+ def main():
310
+ """Main training pipeline."""
311
+ # Parse arguments
312
+ parser = argparse.ArgumentParser(description='Train wildfire classification model')
313
+ parser.add_argument('--tune', action='store_true', help='Run hyperparameter tuning')
314
+ args = parser.parse_args()
315
+
316
+ print("\n" + "="*60)
317
+ print("MODEL TRAINING")
318
+ print("="*60)
319
+
320
+ # Load data
321
+ train_df, test_df = load_data()
322
+
323
+ # Prepare data
324
+ X_train, y_train, X_test, y_test, feature_cols = prepare_data(train_df, test_df)
325
+
326
+ # Compute class weights
327
+ sample_weights = None
328
+ if USE_CLASS_WEIGHTS:
329
+ sample_weights = compute_weights(y_train)
330
+
331
+ # Get parameters
332
+ if args.tune:
333
+ params = tune_hyperparameters(X_train, y_train, sample_weights)
334
+ else:
335
+ best_params_path = MODELS_DIR / 'best_params.json'
336
+ if best_params_path.exists():
337
+ # Load saved best params
338
+ with open(best_params_path, 'r') as f:
339
+ tuned_params = json.load(f)
340
+ params = LIGHTGBM_PARAMS.copy()
341
+ params.update(tuned_params)
342
+ print(f"Loaded best params from {best_params_path}")
343
+ else:
344
+ # Fallback to defaults
345
+ params = LIGHTGBM_PARAMS.copy()
346
+ params['n_estimators'] = 500
347
+ params['max_depth'] = 8
348
+ params['learning_rate'] = 0.05
349
+ params['num_leaves'] = 64
350
+ params['min_child_samples'] = 50
351
+ params['subsample'] = 0.8
352
+ params['colsample_bytree'] = 0.8
353
+ print("No saved params found; using defaults")
354
+
355
+ # Train final model
356
+ model, metrics = train_final_model(
357
+ X_train, y_train, X_test, y_test,
358
+ params, sample_weights, feature_cols
359
+ )
360
+
361
+ # Save model
362
+ save_model(model, params, feature_cols, metrics)
363
+
364
+ print("\n" + "="*60)
365
+ print("✓ Training Complete!")
366
+ print("="*60 + "\n")
367
+
368
+
369
+ if __name__ == "__main__":
370
+ main()
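The sample weights produced in `compute_weights` come from scikit-learn's `compute_class_weight(class_weight='balanced', ...)`, which assigns each class the weight `n_samples / (n_classes * count_c)`, so rarer fire-size classes are up-weighted. A minimal pure-Python sketch of that heuristic, using made-up labels rather than the actual dataset:

```python
from collections import Counter

def balanced_weights(labels):
    """Replicate sklearn's 'balanced' heuristic: n_samples / (n_classes * count_c)."""
    counts = Counter(labels)
    n, k = len(labels), len(counts)
    return {c: n / (k * counts[c]) for c in sorted(counts)}

# Hypothetical imbalanced ordinal labels: 0=Small, 1=Medium, 2=Large
y = [0] * 6 + [1] * 3 + [2] * 1
print(balanced_weights(y))  # the rare Large class gets the largest weight
```

Per-sample weights then follow by looking each label up in this dict, exactly as the `sample_weights` array is built above.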
scripts/06_evaluate.py ADDED
@@ -0,0 +1,438 @@
1
+ """
2
+ Script 06: Model Evaluation
3
+
4
+ This script performs comprehensive evaluation of the trained model:
5
+ - Confusion matrix visualization
6
+ - Per-class metrics analysis
7
+ - Ordinal-specific metrics (linear weighted kappa)
8
+ - SHAP feature importance analysis
9
+ - Error analysis
10
+
11
+ Usage:
12
+ python scripts/06_evaluate.py
13
+ """
14
+
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ import joblib
19
+ import lightgbm as lgb
20
+ import matplotlib.pyplot as plt
21
+ import numpy as np
22
+ import pandas as pd
23
+ import seaborn as sns
24
+ import shap
25
+ from sklearn.metrics import (
26
+ accuracy_score,
27
+ balanced_accuracy_score,
28
+ classification_report,
29
+ cohen_kappa_score,
30
+ confusion_matrix,
31
+ f1_score,
32
+ precision_score,
33
+ recall_score,
34
+ )
35
+
36
+ # Add project root to path
37
+ project_root = Path(__file__).parent.parent
38
+ sys.path.insert(0, str(project_root))
39
+
40
+ from config.config import (
41
+ TEST_PARQUET,
42
+ TRAIN_PARQUET,
43
+ MODELS_DIR,
44
+ FIGURES_DIR,
45
+ TARGET_COLUMN,
46
+ TARGET_CLASS_NAMES
47
+ )
48
+
49
+ # Set style
50
+ plt.style.use('seaborn-v0_8-whitegrid')
51
+
52
+
53
+ def load_model_and_data() -> tuple:
54
+ """Load trained model, metadata, and test data."""
55
+ print("Loading model and data...")
56
+
57
+ # Load model
58
+ model_path = MODELS_DIR / 'wildfire_model.txt'
59
+ model = lgb.Booster(model_file=str(model_path))
60
+ print(f" Model: {model_path}")
61
+
62
+ # Load metadata
63
+ metadata_path = MODELS_DIR / 'model_metadata.joblib'
64
+ metadata = joblib.load(metadata_path)
65
+ print(f" Metadata: {metadata_path}")
66
+
67
+ # Load test data
68
+ test_df = pd.read_parquet(TEST_PARQUET)
69
+ train_df = pd.read_parquet(TRAIN_PARQUET)
70
+ print(f" Test data: {len(test_df):,} rows")
71
+
72
+ return model, metadata, train_df, test_df
73
+
74
+
75
+ def prepare_data(df: pd.DataFrame, feature_names: list) -> tuple:
76
+ """Prepare features and target from dataframe."""
77
+ X = df[feature_names].values
78
+ y = df[TARGET_COLUMN].values
79
+ return X, y
80
+
81
+
82
+ def compute_all_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_proba: np.ndarray) -> dict:
83
+ """Compute comprehensive metrics."""
84
+
85
+ metrics = {
86
+ # Standard metrics
87
+ 'accuracy': accuracy_score(y_true, y_pred),
88
+ 'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
89
+ 'macro_f1': f1_score(y_true, y_pred, average='macro'),
90
+ 'weighted_f1': f1_score(y_true, y_pred, average='weighted'),
91
+ 'macro_precision': precision_score(y_true, y_pred, average='macro'),
92
+ 'macro_recall': recall_score(y_true, y_pred, average='macro'),
93
+
94
+ # Ordinal-specific: Linear weighted Cohen's Kappa
95
+ # Penalizes predictions farther from true class
96
+ 'cohen_kappa_linear': cohen_kappa_score(y_true, y_pred, weights='linear'),
97
+ 'cohen_kappa_quadratic': cohen_kappa_score(y_true, y_pred, weights='quadratic'),
98
+
99
+ # Per-class metrics
100
+ 'per_class_precision': precision_score(y_true, y_pred, average=None),
101
+ 'per_class_recall': recall_score(y_true, y_pred, average=None),
102
+ 'per_class_f1': f1_score(y_true, y_pred, average=None)
103
+ }
104
+
105
+ return metrics
106
+
107
+
108
+ def print_metrics(metrics: dict) -> None:
109
+ """Print metrics in a formatted way."""
110
+ print("\n" + "="*60)
111
+ print("EVALUATION METRICS")
112
+ print("="*60)
113
+
114
+ print("\nOverall Metrics:")
115
+ print(f" Accuracy: {metrics['accuracy']:.4f}")
116
+ print(f" Balanced Accuracy: {metrics['balanced_accuracy']:.4f}")
117
+ print(f" Macro F1: {metrics['macro_f1']:.4f}")
118
+ print(f" Weighted F1: {metrics['weighted_f1']:.4f}")
119
+ print(f" Macro Precision: {metrics['macro_precision']:.4f}")
120
+ print(f" Macro Recall: {metrics['macro_recall']:.4f}")
121
+
122
+ print("\nOrdinal Metrics (penalize distance from true class):")
123
+ print(f" Cohen's Kappa (Linear): {metrics['cohen_kappa_linear']:.4f}")
124
+ print(f" Cohen's Kappa (Quadratic): {metrics['cohen_kappa_quadratic']:.4f}")
125
+
126
+ print("\nPer-Class Metrics:")
127
+ print(f" {'Class':<10} {'Precision':>10} {'Recall':>10} {'F1':>10}")
128
+ print(f" {'-'*40}")
129
+ for i, name in enumerate(TARGET_CLASS_NAMES):
130
+ print(f" {name:<10} {metrics['per_class_precision'][i]:>10.4f} "
131
+ f"{metrics['per_class_recall'][i]:>10.4f} {metrics['per_class_f1'][i]:>10.4f}")
132
+
133
+
134
+ def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, save_path: Path) -> None:
135
+ """Plot and save confusion matrix."""
136
+ print("\nGenerating confusion matrix...")
137
+
138
+ cm = confusion_matrix(y_true, y_pred)
139
+ cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
140
+
141
+ fig, axes = plt.subplots(1, 2, figsize=(14, 5))
142
+
143
+ # Raw counts
144
+ ax1 = axes[0]
145
+ sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
146
+ xticklabels=TARGET_CLASS_NAMES, yticklabels=TARGET_CLASS_NAMES)
147
+ ax1.set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold')
148
+ ax1.set_xlabel('Predicted')
149
+ ax1.set_ylabel('Actual')
150
+
151
+ # Normalized (percentages)
152
+ ax2 = axes[1]
153
+ sns.heatmap(cm_normalized, annot=True, fmt='.1%', cmap='Blues', ax=ax2,
154
+ xticklabels=TARGET_CLASS_NAMES, yticklabels=TARGET_CLASS_NAMES)
155
+ ax2.set_title('Confusion Matrix (Normalized)', fontsize=14, fontweight='bold')
156
+ ax2.set_xlabel('Predicted')
157
+ ax2.set_ylabel('Actual')
158
+
159
+ plt.tight_layout()
160
+ plt.savefig(save_path, dpi=150, bbox_inches='tight')
161
+ plt.close()
162
+ print(f" Saved: {save_path}")
163
+
164
+
165
+ def plot_classification_report(y_true: np.ndarray, y_pred: np.ndarray, save_path: Path) -> None:
166
+ """Plot classification metrics as bar chart."""
167
+ print("\nGenerating classification report plot...")
168
+
169
+ report = classification_report(y_true, y_pred, target_names=TARGET_CLASS_NAMES, output_dict=True)
170
+
171
+ # Convert to DataFrame
172
+ df_report = pd.DataFrame(report).T
173
+ df_report = df_report.drop(['accuracy', 'macro avg', 'weighted avg'], errors='ignore')
174
+ df_report = df_report[['precision', 'recall', 'f1-score']]
175
+
176
+ fig, ax = plt.subplots(figsize=(10, 6))
177
+
178
+ x = np.arange(len(TARGET_CLASS_NAMES))
179
+ width = 0.25
180
+
181
+ bars1 = ax.bar(x - width, df_report['precision'], width, label='Precision', color='#3498db')
182
+ bars2 = ax.bar(x, df_report['recall'], width, label='Recall', color='#2ecc71')
183
+ bars3 = ax.bar(x + width, df_report['f1-score'], width, label='F1-Score', color='#e74c3c')
184
+
185
+ ax.set_xlabel('Fire Size Class')
186
+ ax.set_ylabel('Score')
187
+ ax.set_title('Per-Class Classification Metrics', fontsize=14, fontweight='bold')
188
+ ax.set_xticks(x)
189
+ ax.set_xticklabels(TARGET_CLASS_NAMES)
190
+ ax.legend()
191
+ ax.set_ylim(0, 1.1)
192
+
193
+ # Add value labels
194
+ for bars in [bars1, bars2, bars3]:
195
+ for bar in bars:
196
+ height = bar.get_height()
197
+ ax.annotate(f'{height:.2f}',
198
+ xy=(bar.get_x() + bar.get_width() / 2, height),
199
+ xytext=(0, 3), textcoords="offset points",
200
+ ha='center', va='bottom', fontsize=8)
201
+
202
+ plt.tight_layout()
203
+ plt.savefig(save_path, dpi=150, bbox_inches='tight')
204
+ plt.close()
205
+ print(f" Saved: {save_path}")
206
+
207
+
208
+ def plot_shap_importance(model: lgb.Booster, X: np.ndarray,
209
+ feature_names: list, save_path: Path,
210
+ max_display: int = 20) -> None:
211
+ """Generate SHAP feature importance plots."""
212
+ print("\nGenerating SHAP analysis...")
213
+ print(f" X shape: {X.shape}")
214
+ print(f" Number of feature names: {len(feature_names)}")
215
+
216
+ # Use a sample for SHAP (faster computation)
217
+ sample_size = min(5000, len(X))
218
+ np.random.seed(42)
219
+ sample_idx = np.random.choice(len(X), sample_size, replace=False)
220
+ X_sample = X[sample_idx]
221
+
222
+ # Create explainer
223
+ explainer = shap.TreeExplainer(model)
224
+ shap_values = explainer.shap_values(X_sample)
225
+
226
+ # SHAP values is a list of arrays (one per class for multiclass)
227
+ # Average absolute SHAP values across all classes
228
+ if isinstance(shap_values, list):
229
+ # If it's a list of arrays, each array is (samples, features)
230
+ # We want the mean absolute value for each feature across all samples and all classes
231
+ mean_shap = np.mean([np.abs(sv).mean(axis=0) for sv in shap_values], axis=0)
232
+ else:
233
+ # Handle case where shap_values is a single array (samples, features * classes)
234
+ # or (samples, features)
235
+ mean_shap = np.abs(shap_values).mean(axis=0)
236
+
237
+ # If we have a multiple of features, it's likely multiclass flattened
238
+ num_feats = len(feature_names)
239
+ if mean_shap.size > num_feats and mean_shap.size % num_feats == 0:
240
+ n_classes = mean_shap.size // num_feats
241
+ print(f" Aggregating SHAP values for {n_classes} classes...")
242
+ mean_shap = mean_shap.reshape(n_classes, num_feats).mean(axis=0)
243
+
244
+ # Ensure mean_shap is 1D
245
+ if mean_shap.ndim > 1:
246
+ mean_shap = mean_shap.flatten()
247
+
248
+ print(f" Mean SHAP shape: {mean_shap.shape}")
249
+
250
+ # Handle mismatch between feature_names and mean_shap length
251
+ if len(feature_names) != mean_shap.size:
252
+ print(f" WARNING: Feature names ({len(feature_names)}) != SHAP values ({mean_shap.size})")
253
+ # Trim to match
254
+ n = min(len(feature_names), mean_shap.size)
255
+ feature_names = feature_names[:n]
256
+ mean_shap = mean_shap[:n]
257
+ print(f" Trimmed to {n} features")
258
+
259
+ # Create feature importance DataFrame
260
+ importance_df = pd.DataFrame({
261
+ 'feature': feature_names,
262
+ 'importance': mean_shap
263
+ }).sort_values('importance', ascending=True)
264
+
265
+ # Plot 1: Feature Importance Bar Chart
266
+ plt.figure(figsize=(10, 8))
267
+ top_features = importance_df.tail(max_display)
268
+ plt.barh(top_features['feature'], top_features['importance'], color='steelblue')
269
+ plt.xlabel('Mean |SHAP Value|')
270
+ plt.title(f'Top {max_display} Feature Importance (SHAP)', fontsize=14, fontweight='bold')
271
+ plt.grid(axis='x', alpha=0.3)
272
+ plt.tight_layout()
273
+ plt.savefig(save_path, dpi=150, bbox_inches='tight')
274
+ plt.close()
275
+ print(f" Saved importance plot: {save_path}")
276
+
277
+ # Plot 2: SHAP Summary Plot (Large Fires)
278
+ # Extract SHAP values for Large fire class (class index 2)
279
+ shap_values_large = None
280
+ num_feats = len(feature_names)
281
+
282
+ if isinstance(shap_values, list) and len(shap_values) > 2:
283
+ # Already a list of arrays per class
284
+ shap_values_large = shap_values[2]
285
+ elif isinstance(shap_values, np.ndarray):
286
+ # Single array - need to reshape if it's (samples, features * classes)
287
+ if shap_values.shape[1] == num_feats * 3:
288
+ # Reshape from (samples, features*classes) to (samples, classes, features)
289
+ # Then extract class 2 (Large fires)
290
+ reshaped = shap_values.reshape(shap_values.shape[0], 3, num_feats)
291
+ shap_values_large = reshaped[:, 2, :] # Class 2 = Large
292
+ print(f" Extracted Large fire SHAP values: {shap_values_large.shape}")
293
+ elif shap_values.shape[1] == num_feats:
294
+ # Binary or single output - use as-is
295
+ shap_values_large = shap_values
296
+
297
+ if shap_values_large is not None:
298
+ summary_path = save_path.parent / f"{save_path.stem}_summary{save_path.suffix}"
299
+ plt.figure(figsize=(10, 8))
300
+ try:
301
+ print(" Generating SHAP summary plot...")
302
+ shap.summary_plot(shap_values_large, X_sample, feature_names=feature_names,
303
+ max_display=max_display, show=False)
304
+ plt.title('SHAP Summary: Large Fire Class', fontsize=14, fontweight='bold')
305
+ plt.tight_layout()
306
+ plt.savefig(summary_path, dpi=150, bbox_inches='tight')
307
+ print(f" Saved summary plot: {summary_path}")
308
+ except Exception as e:
309
+ print(f" Could not generate summary plot: {e}")
310
+ plt.close()
311
+ else:
312
+ print(" Skipping summary plot (could not extract Large class SHAP values)")
313
+
314
+ # Print top features
315
+ print("\n Top 10 Most Important Features:")
316
+ for _, row in importance_df.tail(10).iloc[::-1].iterrows():
317
+ print(f" {row['feature']}: {row['importance']:.4f}")
318
+
319
+ return importance_df
320
+
321
+
322
+ def analyze_errors(test_df: pd.DataFrame, y_true: np.ndarray,
323
+ y_pred: np.ndarray, save_path: Path) -> None:
324
+ """Analyze misclassifications."""
325
+ print("\nAnalyzing misclassifications...")
326
+
327
+ # Add predictions to dataframe
328
+ test_df = test_df.copy()
329
+ test_df['predicted'] = y_pred
330
+ test_df['correct'] = y_true == y_pred
331
+
332
+ errors = test_df[~test_df['correct']]
333
+
334
+ print(f"\n Total errors: {len(errors):,} ({len(errors)/len(test_df)*100:.1f}%)")
335
+
336
+ # Error types
337
+ print("\n Error Distribution:")
338
+ for true_class in range(3):
339
+ for pred_class in range(3):
340
+ if true_class != pred_class:
341
+ count = ((y_true == true_class) & (y_pred == pred_class)).sum()
342
+ if count > 0:
343
+ pct = count / len(errors) * 100
344
+ true_name = TARGET_CLASS_NAMES[true_class]
345
+ pred_name = TARGET_CLASS_NAMES[pred_class]
346
+ print(f" {true_name} → {pred_name}: {count:,} ({pct:.1f}%)")
347
+
348
+ # Adjacent vs non-adjacent errors (important for ordinal)
349
+ adjacent_errors = 0
350
+ non_adjacent_errors = 0
351
+
352
+ for true_class, pred_class in zip(y_true[y_true != y_pred], y_pred[y_true != y_pred]):
353
+ if abs(true_class - pred_class) == 1:
354
+ adjacent_errors += 1
355
+ else:
356
+ non_adjacent_errors += 1
357
+
358
+ print(f"\n Ordinal Error Analysis:")
359
+ print(f" Adjacent errors (off by 1): {adjacent_errors:,} ({adjacent_errors/len(errors)*100:.1f}%)")
360
+ print(f" Non-adjacent errors (off by 2): {non_adjacent_errors:,} ({non_adjacent_errors/len(errors)*100:.1f}%)")
361
+
362
+
363
+ def plot_prediction_distribution(y_true: np.ndarray, y_pred: np.ndarray,
364
+ y_proba: np.ndarray, save_path: Path) -> None:
365
+ """Plot prediction probability distributions."""
366
+ print("\nGenerating prediction distribution plots...")
367
+
368
+ fig, axes = plt.subplots(1, 3, figsize=(15, 4))
369
+
370
+ for i, (ax, class_name) in enumerate(zip(axes, TARGET_CLASS_NAMES)):
371
+ # Get probabilities for this class
372
+ proba = y_proba[:, i]
373
+
374
+ # Split by actual class
375
+ for true_class in range(3):
376
+ mask = y_true == true_class
377
+ ax.hist(proba[mask], bins=50, alpha=0.5,
378
+ label=f'Actual: {TARGET_CLASS_NAMES[true_class]}', density=True)
379
+
380
+ ax.set_xlabel(f'P({class_name})')
381
+ ax.set_ylabel('Density')
382
+ ax.set_title(f'Predicted Probability: {class_name}', fontweight='bold')
383
+ ax.legend(fontsize=8)
384
+
385
+ plt.tight_layout()
386
+ plt.savefig(save_path, dpi=150, bbox_inches='tight')
387
+ plt.close()
388
+ print(f" Saved: {save_path}")
389
+
390
+
391
+ def main():
392
+ """Main evaluation pipeline."""
393
+ print("\n" + "="*60)
394
+ print("MODEL EVALUATION")
395
+ print("="*60)
396
+
397
+ # Create figures directory
398
+ FIGURES_DIR.mkdir(parents=True, exist_ok=True)
399
+
400
+ # Load model and data
401
+ model, metadata, train_df, test_df = load_model_and_data()
402
+ feature_names = metadata['feature_names']
403
+
404
+ # Prepare data
405
+ X_test, y_test = prepare_data(test_df, feature_names)
406
+ X_train, y_train = prepare_data(train_df, feature_names)
407
+
408
+ # Make predictions
409
+ y_proba = model.predict(X_test)
410
+ y_pred = np.argmax(y_proba, axis=1)
411
+
412
+ # Compute metrics
413
+ metrics = compute_all_metrics(y_test, y_pred, y_proba)
414
+ print_metrics(metrics)
415
+
416
+ # Generate plots
417
+ plot_confusion_matrix(y_test, y_pred, FIGURES_DIR / 'confusion_matrix.png')
418
+ plot_classification_report(y_test, y_pred, FIGURES_DIR / 'classification_metrics.png')
419
+ plot_prediction_distribution(y_test, y_pred, y_proba, FIGURES_DIR / 'prediction_distribution.png')
420
+
421
+ # SHAP analysis
422
+ importance_df = plot_shap_importance(model, X_test, feature_names,
423
+ FIGURES_DIR / 'shap_importance.png')
424
+
425
+ # Error analysis
426
+ analyze_errors(test_df, y_test, y_pred, FIGURES_DIR / 'error_analysis.png')
427
+
428
+ # Save importance rankings
429
+ importance_df.to_csv(FIGURES_DIR / 'feature_importance.csv', index=False)
430
+
431
+ print("\n" + "="*60)
432
+ print("✓ Evaluation Complete!")
433
+ print(f" Figures saved to: {FIGURES_DIR}")
434
+ print("="*60 + "\n")
435
+
436
+
437
+ if __name__ == "__main__":
438
+ main()
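The ordinal error split in `analyze_errors` reduces to counting how far each misclassification lands from the true class: off-by-one errors (e.g. Medium predicted as Large) are much less harmful than Small↔Large swaps, which is also why linear weighted kappa is reported. A small self-contained sketch with made-up 3-class labels:

```python
def ordinal_error_split(y_true, y_pred):
    """Count misclassifications landing in an adjacent class vs. further away."""
    adjacent = sum(1 for t, p in zip(y_true, y_pred) if t != p and abs(t - p) == 1)
    non_adjacent = sum(1 for t, p in zip(y_true, y_pred) if t != p and abs(t - p) > 1)
    return adjacent, non_adjacent

# Hypothetical labels: 0=Small, 1=Medium, 2=Large
y_true = [0, 0, 1, 2, 2, 1]
y_pred = [0, 1, 1, 0, 1, 2]
print(ordinal_error_split(y_true, y_pred))  # → (3, 1): one Large→Small swap
```

A model where most errors are adjacent is behaving sensibly for an ordinal target even when plain accuracy looks mediocre.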
scripts/07_predict.py ADDED
@@ -0,0 +1,312 @@
1
+ """
2
+ Script 07: Prediction Pipeline
3
+
4
+ This script provides inference capabilities:
5
+ - Load trained model
6
+ - Preprocess new data
7
+ - Generate predictions with probabilities
8
+ - Can be used as a module or standalone script
9
+
10
+ Usage:
11
+ # Single prediction
12
+ python scripts/07_predict.py --lat 34.05 --lon -118.24 --state CA --cause "Debris Burning" --month 7
13
+
14
+ # Batch prediction from CSV
15
+ python scripts/07_predict.py --input new_fires.csv --output predictions.csv
16
+ """
17
+
18
+ import argparse
19
+ import sys
20
+ from pathlib import Path
21
+ from typing import Optional
22
+
23
+ import joblib
24
+ import lightgbm as lgb
25
+ import numpy as np
26
+ import pandas as pd
27
+
28
+ # Add project root to path
29
+ project_root = Path(__file__).parent.parent
30
+ sys.path.insert(0, str(project_root))
31
+
32
+ from config.config import (
33
+ MODELS_DIR,
34
+ TARGET_CLASS_NAMES,
35
+ FIRE_SIZE_CLASS_MAPPING,
36
+ CATEGORICAL_FEATURES,
37
+ N_GEO_CLUSTERS,
38
+ LAT_BINS,
39
+ LON_BINS
40
+ )
41
+
42
+
43
+ class WildfirePredictor:
44
+ """Wildfire size class predictor."""
45
+
46
+ def __init__(self, model_dir: Path = MODELS_DIR):
47
+ """Initialize predictor with trained model."""
48
+ self.model_dir = model_dir
49
+ self.model = None
50
+ self.metadata = None
51
+ self.feature_names = None
52
+ self.encoders = {}
53
+
54
+ self._load_model()
55
+
56
+ def _load_model(self) -> None:
57
+ """Load trained model and metadata."""
58
+ model_path = self.model_dir / 'wildfire_model.txt'
59
+ metadata_path = self.model_dir / 'model_metadata.joblib'
60
+
61
+ if not model_path.exists():
62
+ raise FileNotFoundError(f"Model not found at {model_path}. Run training first.")
63
+
64
+ self.model = lgb.Booster(model_file=str(model_path))
65
+ self.metadata = joblib.load(metadata_path)
66
+ self.feature_names = self.metadata['feature_names']
67
+
68
+ print(f"Loaded model with {len(self.feature_names)} features")
69
+
70
+ def _create_features(self, df: pd.DataFrame) -> pd.DataFrame:
71
+ """Create features for prediction."""
72
+ df = df.copy()
73
+
74
+ # Ensure required columns exist
75
+ required = ['LATITUDE', 'LONGITUDE', 'FIRE_YEAR', 'DISCOVERY_DOY']
76
+ for col in required:
77
+ if col not in df.columns:
78
+ raise ValueError(f"Missing required column: {col}")
79
+
80
+ # Temporal features
81
+ reference_year = 2001
82
+ df['temp_date'] = pd.to_datetime(
83
+ df['DISCOVERY_DOY'].astype(int).astype(str) + f'-{reference_year}',
84
+ format='%j-%Y',
85
+ errors='coerce'
86
+ )
87
+
88
+ df['month'] = df['temp_date'].dt.month
89
+         df['day_of_week'] = df['temp_date'].dt.dayofweek
+         df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
+         df['season'] = df['month'].apply(lambda m:
+             1 if m in [12, 1, 2] else
+             2 if m in [3, 4, 5] else
+             3 if m in [6, 7, 8] else 4
+         )
+         df['is_fire_season'] = df['month'].isin([6, 7, 8, 9, 10]).astype(int)
+
+         # Cyclical features (preserve the wrap-around of calendar variables)
+         df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
+         df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
+         df['doy_sin'] = np.sin(2 * np.pi * df['DISCOVERY_DOY'] / 365)
+         df['doy_cos'] = np.cos(2 * np.pi * df['DISCOVERY_DOY'] / 365)
+         df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
+         df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
+
+         # Year features
+         min_year, max_year = 1992, 2015
+         df['year_normalized'] = (df['FIRE_YEAR'] - min_year) / (max_year - min_year)
+         df['years_since_1992'] = df['FIRE_YEAR'] - min_year
+
+         # Geospatial features: bin coordinates over the continental US extent
+         lat_min, lat_max = 24.0, 50.0
+         lon_min, lon_max = -125.0, -66.0
+         lat_edges = np.linspace(lat_min, lat_max, LAT_BINS + 1)
+         lon_edges = np.linspace(lon_min, lon_max, LON_BINS + 1)
+
+         df['lat_bin'] = pd.cut(df['LATITUDE'], bins=lat_edges, labels=False, include_lowest=True)
+         df['lon_bin'] = pd.cut(df['LONGITUDE'], bins=lon_edges, labels=False, include_lowest=True)
+         # Coordinates outside the bin range (e.g. Alaska, Hawaii) fall back to bin 5
+         df['lat_bin'] = df['lat_bin'].fillna(5).astype(int)
+         df['lon_bin'] = df['lon_bin'].fillna(5).astype(int)
+
+         # Coordinate features
+         df['lat_squared'] = df['LATITUDE'] ** 2
+         df['lon_squared'] = df['LONGITUDE'] ** 2
+         df['lat_lon_interaction'] = df['LATITUDE'] * df['LONGITUDE']
+
+         # Euclidean distance (in degrees) from the approximate US centroid
+         center_lat, center_lon = 39.8, -98.6
+         df['dist_from_center'] = np.sqrt(
+             (df['LATITUDE'] - center_lat) ** 2 +
+             (df['LONGITUDE'] - center_lon) ** 2
+         )
+
+         # Placeholder for geo_cluster (would need the fitted KMeans model)
+         df['geo_cluster'] = 0
+
+         # Drop temporary columns
+         df = df.drop(columns=['temp_date'], errors='ignore')
+
+         return df
+
+     def _encode_categoricals(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Encode categorical variables."""
+         df = df.copy()
+
+         # Fallback label encoding for standalone inference.
+         # Note: hash() is salted per process (PYTHONHASHSEED), so this encoding
+         # is not stable across runs; in production, reuse the encoders fit at
+         # training time.
+         for col in CATEGORICAL_FEATURES:
+             encoded_col = f'{col}_encoded'
+             if col in df.columns:
+                 df[encoded_col] = df[col].astype(str).apply(lambda x: hash(x) % 100)
+             else:
+                 df[encoded_col] = 0
+
+         return df
+
+     def preprocess(self, df: pd.DataFrame) -> np.ndarray:
+         """Preprocess data for prediction."""
+         df = self._create_features(df)
+         df = self._encode_categoricals(df)
+
+         # Select and order features to match training
+         missing_features = [f for f in self.feature_names if f not in df.columns]
+         if missing_features:
+             print(f"Warning: Missing features (filled with 0): {missing_features}")
+             for f in missing_features:
+                 df[f] = 0
+
+         X = df[self.feature_names].values
+         return X
+
+     def predict(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Generate predictions for input data."""
+         X = self.preprocess(df)
+
+         # Get class probabilities, shape (n_samples, n_classes)
+         proba = self.model.predict(X)
+         pred_class = np.argmax(proba, axis=1)
+
+         # Create results dataframe
+         results = df.copy()
+         results['predicted_class'] = pred_class
+         results['predicted_label'] = [TARGET_CLASS_NAMES[c] for c in pred_class]
+         results['prob_small'] = proba[:, 0]
+         results['prob_medium'] = proba[:, 1]
+         results['prob_large'] = proba[:, 2]
+         results['confidence'] = np.max(proba, axis=1)
+
+         return results
+
+     def predict_single(self, latitude: float, longitude: float,
+                        fire_year: int, discovery_doy: int,
+                        state: str = 'Unknown',
+                        cause: str = 'Unknown',
+                        agency: str = 'Unknown',
+                        owner: str = 'Unknown') -> dict:
+         """Predict the size class for a single fire event."""
+
+         df = pd.DataFrame([{
+             'LATITUDE': latitude,
+             'LONGITUDE': longitude,
+             'FIRE_YEAR': fire_year,
+             'DISCOVERY_DOY': discovery_doy,
+             'STATE': state,
+             'STAT_CAUSE_DESCR': cause,
+             'NWCG_REPORTING_AGENCY': agency,
+             'OWNER_DESCR': owner
+         }])
+
+         result = self.predict(df).iloc[0]
+
+         return {
+             'predicted_class': int(result['predicted_class']),
+             'predicted_label': result['predicted_label'],
+             'probabilities': {
+                 'Small': float(result['prob_small']),
+                 'Medium': float(result['prob_medium']),
+                 'Large': float(result['prob_large'])
+             },
+             'confidence': float(result['confidence'])
+         }
+
+
+ def main():
+     """Main prediction script."""
+     parser = argparse.ArgumentParser(description='Wildfire size prediction')
+
+     # Single-prediction arguments
+     parser.add_argument('--lat', type=float, help='Latitude')
+     parser.add_argument('--lon', type=float, help='Longitude')
+     parser.add_argument('--year', type=int, default=2015, help='Fire year')
+     parser.add_argument('--doy', type=int, default=200, help='Day of year')
+     parser.add_argument('--state', type=str, default='Unknown', help='State code')
+     parser.add_argument('--cause', type=str, default='Unknown', help='Fire cause')
+
+     # Batch-prediction arguments
+     parser.add_argument('--input', type=str, help='Input CSV file for batch prediction')
+     parser.add_argument('--output', type=str, help='Output CSV file for predictions')
+
+     args = parser.parse_args()
+
+     # Initialize predictor
+     predictor = WildfirePredictor()
+
+     if args.input:
+         # Batch prediction
+         print(f"\nProcessing batch predictions from: {args.input}")
+         df = pd.read_csv(args.input)
+         results = predictor.predict(df)
+
+         output_path = args.output or 'predictions.csv'
+         results.to_csv(output_path, index=False)
+         print(f"Predictions saved to: {output_path}")
+
+     elif args.lat is not None and args.lon is not None:
+         # Single prediction
+         print("\n" + "=" * 60)
+         print("SINGLE FIRE PREDICTION")
+         print("=" * 60)
+
+         result = predictor.predict_single(
+             latitude=args.lat,
+             longitude=args.lon,
+             fire_year=args.year,
+             discovery_doy=args.doy,
+             state=args.state,
+             cause=args.cause
+         )
+
+         print("\nInput:")
+         print(f"  Location: ({args.lat}, {args.lon})")
+         print(f"  Year: {args.year}, Day of Year: {args.doy}")
+         print(f"  State: {args.state}, Cause: {args.cause}")
+
+         print("\nPrediction:")
+         print(f"  Class: {result['predicted_class']} ({result['predicted_label']})")
+         print(f"  Confidence: {result['confidence']:.1%}")
+
+         print("\nProbabilities:")
+         for label, prob in result['probabilities'].items():
+             bar = '█' * int(prob * 20)
+             print(f"  {label:>6}: {prob:>6.1%} {bar}")
+
+     else:
+         # Demo prediction
+         print("\n" + "=" * 60)
+         print("DEMO PREDICTION")
+         print("=" * 60)
+
+         # Example: summer fire in the Los Angeles area
+         result = predictor.predict_single(
+             latitude=34.05,
+             longitude=-118.24,
+             fire_year=2015,
+             discovery_doy=200,  # mid-July
+             state='CA',
+             cause='Debris Burning'
+         )
+
+         print("\nExample: Summer fire in Los Angeles area")
+         print(f"  Predicted: {result['predicted_label']} (confidence: {result['confidence']:.1%})")
+         print(f"  Probabilities: Small={result['probabilities']['Small']:.1%}, "
+               f"Medium={result['probabilities']['Medium']:.1%}, "
+               f"Large={result['probabilities']['Large']:.1%}")
+
+         print("\nUsage:")
+         print("  Single: python 07_predict.py --lat 34.05 --lon -118.24 --state CA --cause 'Lightning'")
+         print("  Batch:  python 07_predict.py --input fires.csv --output predictions.csv")
+
+
+ if __name__ == "__main__":
+     main()