|
|
"""
|
|
|
Script 05: Model Training
|
|
|
|
|
|
This script trains the ordinal classification model:
|
|
|
- Uses LightGBM for multi-class ordinal classification
|
|
|
- Implements class weighting for imbalanced data
|
|
|
- Performs cross-validation
|
|
|
- Includes hyperparameter tuning with Optuna
|
|
|
- Saves the trained model
|
|
|
|
|
|
Ordinal Classification Approach:
|
|
|
Since fire size classes have a natural order (Small < Medium < Large),
|
|
|
we use ordinal-aware training with cumulative link model concepts.
|
|
|
|
|
|
Usage:
|
|
|
python scripts/05_train_model.py [--tune]
|
|
|
"""
|
|
|
|
|
|
import argparse
|
|
|
import json
|
|
|
import sys
|
|
|
from pathlib import Path
|
|
|
|
|
|
import joblib
|
|
|
import lightgbm as lgb
|
|
|
import numpy as np
|
|
|
import optuna
|
|
|
import pandas as pd
|
|
|
from sklearn.metrics import (
|
|
|
accuracy_score,
|
|
|
balanced_accuracy_score,
|
|
|
classification_report,
|
|
|
cohen_kappa_score,
|
|
|
f1_score,
|
|
|
)
|
|
|
from sklearn.model_selection import StratifiedKFold
|
|
|
from sklearn.utils.class_weight import compute_class_weight
|
|
|
|
|
|
|
|
|
project_root = Path(__file__).parent.parent
|
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
|
|
from config.config import (
|
|
|
TRAIN_PARQUET,
|
|
|
TEST_PARQUET,
|
|
|
MODELS_DIR,
|
|
|
TARGET_COLUMN,
|
|
|
TARGET_CLASS_NAMES,
|
|
|
LIGHTGBM_PARAMS,
|
|
|
OPTUNA_SEARCH_SPACE,
|
|
|
N_OPTUNA_TRIALS,
|
|
|
N_FOLDS,
|
|
|
RANDOM_STATE,
|
|
|
USE_CLASS_WEIGHTS,
|
|
|
PRIMARY_METRIC
|
|
|
)
|
|
|
|
|
|
|
|
|
def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test splits from their configured parquet files.

    Returns:
        (train_df, test_df) as pandas DataFrames.
    """
    print("Loading data...")
    splits = [pd.read_parquet(p) for p in (TRAIN_PARQUET, TEST_PARQUET)]
    train_df, test_df = splits
    print(f" Train: {len(train_df):,} rows")
    print(f" Test: {len(test_df):,} rows")
    return train_df, test_df
|
|
|
|
|
|
|
|
|
def get_feature_columns(df: pd.DataFrame) -> list:
    """Return the columns of *df* that should be fed to the model.

    Excludes the target column plus raw categorical identifier columns
    that are not model inputs.
    """
    excluded = {
        TARGET_COLUMN,
        'NWCG_REPORTING_AGENCY', 'STAT_CAUSE_DESCR', 'STATE', 'OWNER_DESCR',
        'COUNTY',
    }
    return [column for column in df.columns if column not in excluded]
|
|
|
|
|
|
|
|
|
def prepare_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple:
    """Split the train/test frames into feature matrices and target vectors.

    Returns:
        (X_train, y_train, X_test, y_test, feature_cols).
    """
    print("\nPreparing data...")

    feature_cols = get_feature_columns(train_df)

    # Pull numpy arrays out of each frame: features on one side, target on the other.
    X_train, y_train = train_df[feature_cols].values, train_df[TARGET_COLUMN].values
    X_test, y_test = test_df[feature_cols].values, test_df[TARGET_COLUMN].values

    print(f" Features: {len(feature_cols)}")
    print(f" Feature columns: {feature_cols}")

    return X_train, y_train, X_test, y_test, feature_cols
|
|
|
|
|
|
|
|
|
def compute_weights(y_train: np.ndarray) -> np.ndarray:
    """Build a per-sample weight vector that balances class frequencies.

    Uses sklearn's 'balanced' scheme (n_samples / (n_classes * class_count))
    and broadcasts the per-class weight onto every sample.
    """
    print("\nComputing class weights...")

    labels = np.unique(y_train)
    per_class = compute_class_weight(
        class_weight='balanced',
        classes=labels,
        y=y_train
    )

    weight_dict = dict(zip(labels, per_class))
    print(f" Class weights: {weight_dict}")

    # np.unique returns sorted labels, so searchsorted maps each sample's
    # label to its index in `labels` — a vectorized form of the dict lookup.
    return per_class[np.searchsorted(labels, y_train)]
|
|
|
|
|
|
|
|
|
def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, prefix: str = "") -> dict:
    """Compute, print, and return classification metrics for predictions.

    Args:
        y_true: ground-truth class labels.
        y_pred: predicted class labels.
        prefix: optional label printed before the metric listing.

    Returns:
        Dict of metric name -> value (accuracy, balanced accuracy, macro/weighted
        F1, and linearly-weighted Cohen's kappa for the ordinal classes).
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'weighted_f1': f1_score(y_true, y_pred, average='weighted'),
        'cohen_kappa': cohen_kappa_score(y_true, y_pred, weights='linear')
    }

    print(f"\n{prefix} Metrics:" if prefix else "\nMetrics:")
    for metric_name, metric_value in metrics.items():
        print(f" {metric_name}: {metric_value:.4f}")

    return metrics
|
|
|
|
|
|
|
|
|
def cross_validate(X: np.ndarray, y: np.ndarray, params: dict,
                   sample_weights: np.ndarray = None) -> tuple[float, float]:
    """Run stratified K-fold CV and return (mean, std) of macro F1.

    Args:
        X: feature matrix.
        y: class labels.
        params: LightGBM parameter dict; 'n_estimators' (if present) is used
            as the boosting-round cap.
        sample_weights: optional per-sample weights applied to each fold's
            training portion.

    Returns:
        (mean, std) of the fold macro-F1 scores.

    NOTE(review): the score is hardcoded to macro F1 even though the module
    imports PRIMARY_METRIC — confirm whether it should drive this choice.
    """
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    # Fix: 'n_estimators' is a LightGBM alias for num_iterations. Leaving it in
    # the params dict while also passing num_boost_round gives LightGBM two
    # conflicting settings (and a warning), so strip it out and use it only as
    # the round cap.
    lgb_params = {k: v for k, v in params.items() if k != 'n_estimators'}
    num_rounds = params.get('n_estimators', 500)

    scores = []
    for train_idx, val_idx in skf.split(X, y):
        X_fold_train, X_fold_val = X[train_idx], X[val_idx]
        y_fold_train, y_fold_val = y[train_idx], y[val_idx]

        # Only the training portion of the fold carries sample weights.
        weights_fold = sample_weights[train_idx] if sample_weights is not None else None

        train_data = lgb.Dataset(X_fold_train, label=y_fold_train, weight=weights_fold)
        val_data = lgb.Dataset(X_fold_val, label=y_fold_val, reference=train_data)

        model = lgb.train(
            lgb_params,
            train_data,
            num_boost_round=num_rounds,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(50, verbose=False)]
        )

        # Multiclass predict returns per-class probabilities; take the argmax.
        y_pred = model.predict(X_fold_val)
        y_pred_class = np.argmax(y_pred, axis=1)

        scores.append(f1_score(y_fold_val, y_pred_class, average='macro'))

    return np.mean(scores), np.std(scores)
|
|
|
|
|
|
|
|
|
def objective(trial: optuna.Trial, X: np.ndarray, y: np.ndarray,
              sample_weights: np.ndarray) -> float:
    """Optuna objective: sample hyperparameters and return the CV macro F1."""
    params = LIGHTGBM_PARAMS.copy()

    # (name, kind, log-scale) — order matters for sampler reproducibility, so
    # this mirrors the original sequence of suggest calls exactly.
    search_spec = [
        ('n_estimators', 'int', False),
        ('max_depth', 'int', False),
        ('learning_rate', 'float', True),
        ('num_leaves', 'int', False),
        ('min_child_samples', 'int', False),
        ('subsample', 'float', False),
        ('colsample_bytree', 'float', False),
        ('reg_alpha', 'float', False),
        ('reg_lambda', 'float', False),
    ]
    for name, kind, use_log in search_spec:
        bounds = OPTUNA_SEARCH_SPACE[name]
        if kind == 'int':
            params[name] = trial.suggest_int(name, *bounds)
        else:
            params[name] = trial.suggest_float(name, *bounds, log=use_log)

    mean_score, _ = cross_validate(X, y, params, sample_weights)
    return mean_score
|
|
|
|
|
|
|
|
|
def tune_hyperparameters(X: np.ndarray, y: np.ndarray,
                         sample_weights: np.ndarray) -> dict:
    """Tune hyperparameters using Optuna.

    Runs N_OPTUNA_TRIALS trials of seeded TPE search maximizing the CV macro
    F1, persists the best raw trial params to MODELS_DIR/best_params.json,
    and returns LIGHTGBM_PARAMS overlaid with the best trial's params.
    """
    print("\n" + "="*60)
    print("HYPERPARAMETER TUNING")
    print("="*60)

    print(f"\nRunning {N_OPTUNA_TRIALS} Optuna trials...")

    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE)
    )

    study.optimize(
        lambda trial: objective(trial, X, y, sample_weights),
        n_trials=N_OPTUNA_TRIALS,
        show_progress_bar=True
    )

    print(f"\nBest trial:")
    print(f" Value (macro F1): {study.best_trial.value:.4f}")
    print(f" Params: {study.best_trial.params}")

    best_params = LIGHTGBM_PARAMS.copy()
    best_params.update(study.best_trial.params)

    # Fix: ensure the output directory exists before writing — previously this
    # raised FileNotFoundError when --tune ran before any model had been saved
    # (save_model is the only other place that creates MODELS_DIR).
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    best_params_path = MODELS_DIR / 'best_params.json'
    with open(best_params_path, 'w') as f:
        json.dump(study.best_trial.params, f)
    print(f" Best params saved: {best_params_path}")

    return best_params
|
|
|
|
|
|
|
|
|
def train_final_model(X_train: np.ndarray, y_train: np.ndarray,
                      X_test: np.ndarray, y_test: np.ndarray,
                      params: dict, sample_weights: np.ndarray,
                      feature_names: list) -> tuple[lgb.Booster, dict]:
    """Train the final model on the full training set and evaluate it.

    Args:
        X_train / y_train: full training features and labels.
        X_test / y_test: held-out test features and labels (also used for
            early stopping — NOTE(review): this leaks test data into the
            stopping decision; confirm whether a separate validation split
            is wanted).
        params: LightGBM parameter dict; 'n_estimators' caps boosting rounds.
        sample_weights: optional per-sample training weights.
        feature_names: feature column names for the LightGBM datasets.

    Returns:
        (trained booster, dict of test-set metrics).
        Fix: the annotation previously claimed `-> lgb.Booster` although the
        function has always returned this 2-tuple.
    """
    print("\n" + "="*60)
    print("TRAINING FINAL MODEL")
    print("="*60)

    train_data = lgb.Dataset(X_train, label=y_train, weight=sample_weights,
                             feature_name=feature_names)
    val_data = lgb.Dataset(X_test, label=y_test, reference=train_data,
                           feature_name=feature_names)

    # Fix: 'n_estimators' is a LightGBM alias for num_iterations; passing it
    # both inside params and as num_boost_round is a conflicting setting, so
    # use it only as the round cap.
    lgb_params = {k: v for k, v in params.items() if k != 'n_estimators'}
    num_rounds = params.get('n_estimators', 2000)

    print("\nTraining...")
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=num_rounds,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'test'],
        callbacks=[
            lgb.early_stopping(50, verbose=True),
            lgb.log_evaluation(period=50)
        ]
    )

    print("\n" + "-"*40)

    # Report train metrics (to gauge overfitting) then test metrics.
    y_train_pred = np.argmax(model.predict(X_train), axis=1)
    evaluate_model(y_train, y_train_pred, "Train")

    y_test_pred = np.argmax(model.predict(X_test), axis=1)
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")

    print("\nClassification Report (Test):")
    print(classification_report(y_test, y_test_pred, target_names=TARGET_CLASS_NAMES))

    return model, test_metrics
|
|
|
|
|
|
|
|
|
def save_model(model: lgb.Booster, params: dict, feature_names: list, metrics: dict) -> None:
    """Persist the trained booster and its metadata under MODELS_DIR.

    Writes the booster in LightGBM's native text format and a joblib pickle
    holding params, feature names, metrics, and the target class names.
    """
    print("\nSaving model...")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    # Native-format model dump.
    model_path = MODELS_DIR / 'wildfire_model.txt'
    model.save_model(str(model_path))
    print(f" Model: {model_path}")

    # Companion metadata needed to reload and interpret the model later.
    metadata_path = MODELS_DIR / 'model_metadata.joblib'
    joblib.dump(
        {
            'params': params,
            'feature_names': feature_names,
            'metrics': metrics,
            'target_classes': TARGET_CLASS_NAMES
        },
        metadata_path,
    )
    print(f" Metadata: {metadata_path}")
|
|
|
|
|
|
|
|
|
def _default_params() -> dict:
    """Return baseline LightGBM params used when no tuned params exist on disk."""
    params = LIGHTGBM_PARAMS.copy()
    params.update({
        'n_estimators': 500,
        'max_depth': 8,
        'learning_rate': 0.05,
        'num_leaves': 64,
        'min_child_samples': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
    })
    return params


def _load_saved_params():
    """Load previously tuned params from MODELS_DIR, or return None if absent."""
    best_params_path = MODELS_DIR / 'best_params.json'
    if not best_params_path.exists():
        return None
    with open(best_params_path, 'r') as f:
        tuned_params = json.load(f)
    params = LIGHTGBM_PARAMS.copy()
    params.update(tuned_params)
    print(f"Loaded best params from {best_params_path}")
    return params


def main():
    """Main training pipeline.

    Loads data, prepares features, optionally tunes hyperparameters
    (--tune), trains the final model, and saves it with its metadata.
    """
    parser = argparse.ArgumentParser(description='Train wildfire classification model')
    parser.add_argument('--tune', action='store_true', help='Run hyperparameter tuning')
    args = parser.parse_args()

    print("\n" + "="*60)
    print("MODEL TRAINING")
    print("="*60)

    train_df, test_df = load_data()

    X_train, y_train, X_test, y_test, feature_cols = prepare_data(train_df, test_df)

    # Class weighting is optional (config flag) to counter label imbalance.
    sample_weights = None
    if USE_CLASS_WEIGHTS:
        sample_weights = compute_weights(y_train)

    # Parameter resolution order: fresh tuning run > saved tuned params > defaults.
    if args.tune:
        params = tune_hyperparameters(X_train, y_train, sample_weights)
    else:
        params = _load_saved_params()
        if params is None:
            params = _default_params()
            print("No saved params found; using defaults")

    model, metrics = train_final_model(
        X_train, y_train, X_test, y_test,
        params, sample_weights, feature_cols
    )

    save_model(model, params, feature_cols, metrics)

    print("\n" + "="*60)
    print("✓ Training Complete!")
    print("="*60 + "\n")
|
|
|
# Standard script entry point: run the full training pipeline.
if __name__ == "__main__":
    main()
|
|
|
|