"""
Script 05: Model Training
This script trains the ordinal classification model:
- Uses LightGBM for multi-class ordinal classification
- Implements class weighting for imbalanced data
- Performs cross-validation
- Includes hyperparameter tuning with Optuna
- Saves the trained model
Ordinal Classification Approach:
Since fire size classes have a natural order (Small < Medium < Large),
the model is trained as a standard multi-class classifier, and evaluation
uses linearly weighted Cohen's kappa so that errors are penalized in
proportion to their distance from the true class.
Usage:
python scripts/05_train_model.py [--tune]
"""
import argparse
import json
import sys
from pathlib import Path
import joblib
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import (
accuracy_score,
balanced_accuracy_score,
classification_report,
cohen_kappa_score,
f1_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from config.config import (
TRAIN_PARQUET,
TEST_PARQUET,
MODELS_DIR,
TARGET_COLUMN,
TARGET_CLASS_NAMES,
LIGHTGBM_PARAMS,
OPTUNA_SEARCH_SPACE,
N_OPTUNA_TRIALS,
N_FOLDS,
RANDOM_STATE,
USE_CLASS_WEIGHTS,
PRIMARY_METRIC
)
def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
"""Load train and test data."""
print("Loading data...")
train_df = pd.read_parquet(TRAIN_PARQUET)
test_df = pd.read_parquet(TEST_PARQUET)
print(f" Train: {len(train_df):,} rows")
print(f" Test: {len(test_df):,} rows")
return train_df, test_df
def get_feature_columns(df: pd.DataFrame) -> list[str]:
"""Get list of feature columns for modeling."""
exclude_cols = [
TARGET_COLUMN,
'NWCG_REPORTING_AGENCY', 'STAT_CAUSE_DESCR', 'STATE', 'OWNER_DESCR',
'COUNTY'
]
feature_cols = [col for col in df.columns if col not in exclude_cols]
return feature_cols
def prepare_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple:
"""Prepare features and targets for training."""
print("\nPreparing data...")
feature_cols = get_feature_columns(train_df)
X_train = train_df[feature_cols].values
y_train = train_df[TARGET_COLUMN].values
X_test = test_df[feature_cols].values
y_test = test_df[TARGET_COLUMN].values
print(f" Features: {len(feature_cols)}")
print(f" Feature columns: {feature_cols}")
return X_train, y_train, X_test, y_test, feature_cols
def compute_weights(y_train: np.ndarray) -> np.ndarray:
"""Compute sample weights for class imbalance."""
print("\nComputing class weights...")
classes = np.unique(y_train)
class_weights = compute_class_weight(
class_weight='balanced',
classes=classes,
y=y_train
)
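    # sklearn's 'balanced' heuristic assigns each class the weight
    # n_samples / (n_classes * class_count), so rarer classes count more.
    # For example, y = [0, 0, 0, 1] gives {0: 4/(2*3) ~ 0.667, 1: 4/(2*1) = 2.0}.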
weight_dict = dict(zip(classes, class_weights))
print(f" Class weights: {weight_dict}")
# Create sample weights array
sample_weights = np.array([weight_dict[y] for y in y_train])
return sample_weights
def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, prefix: str = "") -> dict:
"""Evaluate model predictions."""
metrics = {
'accuracy': accuracy_score(y_true, y_pred),
'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
'macro_f1': f1_score(y_true, y_pred, average='macro'),
'weighted_f1': f1_score(y_true, y_pred, average='weighted'),
'cohen_kappa': cohen_kappa_score(y_true, y_pred, weights='linear') # Linear weights for ordinal
}
if prefix:
print(f"\n{prefix} Metrics:")
else:
print("\nMetrics:")
for name, value in metrics.items():
print(f" {name}: {value:.4f}")
return metrics
def cross_validate(X: np.ndarray, y: np.ndarray, params: dict,
                   sample_weights: np.ndarray | None = None) -> tuple[float, float]:
    """Perform cross-validation and return mean and std of macro F1."""
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    # Pop the round count so 'n_estimators' is not also passed to lgb.train
    # inside params (LightGBM treats it as an alias for num_iterations).
    params = params.copy()
    num_boost_round = params.pop('n_estimators', 500)
    scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
X_fold_train, X_fold_val = X[train_idx], X[val_idx]
y_fold_train, y_fold_val = y[train_idx], y[val_idx]
if sample_weights is not None:
weights_fold = sample_weights[train_idx]
else:
weights_fold = None
# Create LightGBM datasets
train_data = lgb.Dataset(X_fold_train, label=y_fold_train, weight=weights_fold)
val_data = lgb.Dataset(X_fold_val, label=y_fold_val, reference=train_data)
# Train model
model = lgb.train(
params,
train_data,
            num_boost_round=num_boost_round,
valid_sets=[val_data],
callbacks=[lgb.early_stopping(50, verbose=False)]
)
        # Predict: probabilities of shape (n_rows, n_classes); with early
        # stopping, predict() uses the best iteration by default
        y_pred = model.predict(X_fold_val)
        y_pred_class = np.argmax(y_pred, axis=1)
# Score
score = f1_score(y_fold_val, y_pred_class, average='macro')
scores.append(score)
return np.mean(scores), np.std(scores)
def objective(trial: optuna.Trial, X: np.ndarray, y: np.ndarray,
sample_weights: np.ndarray) -> float:
"""Optuna objective function for hyperparameter tuning."""
params = LIGHTGBM_PARAMS.copy()
    # Sample hyperparameters; each OPTUNA_SEARCH_SPACE entry is a
    # (low, high) pair unpacked into the suggest_* bounds
params['n_estimators'] = trial.suggest_int('n_estimators', *OPTUNA_SEARCH_SPACE['n_estimators'])
params['max_depth'] = trial.suggest_int('max_depth', *OPTUNA_SEARCH_SPACE['max_depth'])
params['learning_rate'] = trial.suggest_float('learning_rate', *OPTUNA_SEARCH_SPACE['learning_rate'], log=True)
params['num_leaves'] = trial.suggest_int('num_leaves', *OPTUNA_SEARCH_SPACE['num_leaves'])
params['min_child_samples'] = trial.suggest_int('min_child_samples', *OPTUNA_SEARCH_SPACE['min_child_samples'])
params['subsample'] = trial.suggest_float('subsample', *OPTUNA_SEARCH_SPACE['subsample'])
params['colsample_bytree'] = trial.suggest_float('colsample_bytree', *OPTUNA_SEARCH_SPACE['colsample_bytree'])
params['reg_alpha'] = trial.suggest_float('reg_alpha', *OPTUNA_SEARCH_SPACE['reg_alpha'])
params['reg_lambda'] = trial.suggest_float('reg_lambda', *OPTUNA_SEARCH_SPACE['reg_lambda'])
# Cross-validate
mean_score, _ = cross_validate(X, y, params, sample_weights)
return mean_score
def tune_hyperparameters(X: np.ndarray, y: np.ndarray,
sample_weights: np.ndarray) -> dict:
"""Tune hyperparameters using Optuna."""
print("\n" + "="*60)
print("HYPERPARAMETER TUNING")
print("="*60)
print(f"\nRunning {N_OPTUNA_TRIALS} Optuna trials...")
# Create study
study = optuna.create_study(
direction='maximize',
sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE)
)
# Optimize
study.optimize(
lambda trial: objective(trial, X, y, sample_weights),
n_trials=N_OPTUNA_TRIALS,
show_progress_bar=True
)
print(f"\nBest trial:")
print(f" Value (macro F1): {study.best_trial.value:.4f}")
print(f" Params: {study.best_trial.params}")
# Merge best params with base params
best_params = LIGHTGBM_PARAMS.copy()
best_params.update(study.best_trial.params)
    best_params_path = MODELS_DIR / 'best_params.json'
    MODELS_DIR.mkdir(parents=True, exist_ok=True)  # dir may not exist yet
    with open(best_params_path, 'w') as f:
        json.dump(study.best_trial.params, f, indent=2)
    print(f" Best params saved: {best_params_path}")
return best_params
def train_final_model(X_train: np.ndarray, y_train: np.ndarray,
                      X_test: np.ndarray, y_test: np.ndarray,
                      params: dict, sample_weights: np.ndarray | None,
                      feature_names: list) -> tuple[lgb.Booster, dict]:
    """Train the final model on the full training data; return it with test metrics."""
    print("\n" + "="*60)
    print("TRAINING FINAL MODEL")
    print("="*60)
    # Pop the round count for the same aliasing reason as in cross_validate
    params = params.copy()
    num_boost_round = params.pop('n_estimators', 2000)
    # Create datasets
train_data = lgb.Dataset(X_train, label=y_train, weight=sample_weights,
feature_name=feature_names)
val_data = lgb.Dataset(X_test, label=y_test, reference=train_data,
feature_name=feature_names)
# Train
print("\nTraining...")
model = lgb.train(
params,
train_data,
        num_boost_round=num_boost_round,
valid_sets=[train_data, val_data],
valid_names=['train', 'test'],
callbacks=[
lgb.early_stopping(50, verbose=True),
lgb.log_evaluation(period=50)
]
)
# Evaluate
print("\n" + "-"*40)
# Train predictions
y_train_pred = np.argmax(model.predict(X_train), axis=1)
evaluate_model(y_train, y_train_pred, "Train")
# Test predictions
y_test_pred = np.argmax(model.predict(X_test), axis=1)
test_metrics = evaluate_model(y_test, y_test_pred, "Test")
# Classification report
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred, target_names=TARGET_CLASS_NAMES))
return model, test_metrics
def save_model(model: lgb.Booster, params: dict, feature_names: list, metrics: dict) -> None:
"""Save trained model and metadata."""
print("\nSaving model...")
MODELS_DIR.mkdir(parents=True, exist_ok=True)
# Save LightGBM model
model_path = MODELS_DIR / 'wildfire_model.txt'
model.save_model(str(model_path))
print(f" Model: {model_path}")
# Save metadata
metadata = {
'params': params,
'feature_names': feature_names,
'metrics': metrics,
'target_classes': TARGET_CLASS_NAMES
}
metadata_path = MODELS_DIR / 'model_metadata.joblib'
joblib.dump(metadata, metadata_path)
print(f" Metadata: {metadata_path}")
def main():
"""Main training pipeline."""
# Parse arguments
parser = argparse.ArgumentParser(description='Train wildfire classification model')
parser.add_argument('--tune', action='store_true', help='Run hyperparameter tuning')
args = parser.parse_args()
print("\n" + "="*60)
print("MODEL TRAINING")
print("="*60)
# Load data
train_df, test_df = load_data()
# Prepare data
X_train, y_train, X_test, y_test, feature_cols = prepare_data(train_df, test_df)
# Compute class weights
sample_weights = None
if USE_CLASS_WEIGHTS:
sample_weights = compute_weights(y_train)
# Get parameters
if args.tune:
params = tune_hyperparameters(X_train, y_train, sample_weights)
else:
best_params_path = MODELS_DIR / 'best_params.json'
if best_params_path.exists():
# Load saved best params
with open(best_params_path, 'r') as f:
tuned_params = json.load(f)
params = LIGHTGBM_PARAMS.copy()
params.update(tuned_params)
print(f"Loaded best params from {best_params_path}")
else:
# Fallback to defaults
params = LIGHTGBM_PARAMS.copy()
params['n_estimators'] = 500
params['max_depth'] = 8
params['learning_rate'] = 0.05
params['num_leaves'] = 64
params['min_child_samples'] = 50
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8
print("No saved params found; using defaults")
# Train final model
model, metrics = train_final_model(
X_train, y_train, X_test, y_test,
params, sample_weights, feature_cols
)
# Save model
save_model(model, params, feature_cols, metrics)
print("\n" + "="*60)
print("✓ Training Complete!")
print("="*60 + "\n")
if __name__ == "__main__":
main()