File size: 12,301 Bytes
64e892b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
"""
Script 05: Model Training

This script trains the ordinal classification model:
- Uses LightGBM for multi-class ordinal classification
- Implements class weighting for imbalanced data
- Performs cross-validation
- Includes hyperparameter tuning with Optuna
- Saves the trained model

Ordinal Classification Approach:
Since fire size classes have a natural order (Small < Medium < Large),
we use ordinal-aware training with cumulative link model concepts.

Usage:
    python scripts/05_train_model.py [--tune]
"""

import argparse
import json
import sys
from pathlib import Path

import joblib
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    cohen_kappa_score,
    f1_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from config.config import (
    TRAIN_PARQUET,
    TEST_PARQUET,
    MODELS_DIR,
    TARGET_COLUMN,
    TARGET_CLASS_NAMES,
    LIGHTGBM_PARAMS,
    OPTUNA_SEARCH_SPACE,
    N_OPTUNA_TRIALS,
    N_FOLDS,
    RANDOM_STATE,
    USE_CLASS_WEIGHTS,
    PRIMARY_METRIC
)


def load_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the configured train/test parquet files into DataFrames.

    Returns:
        (train_df, test_df) loaded from TRAIN_PARQUET / TEST_PARQUET.
    """
    print("Loading data...")
    # Load both splits in one pass; order matches the return contract.
    train_df, test_df = (pd.read_parquet(p) for p in (TRAIN_PARQUET, TEST_PARQUET))
    print(f"  Train: {len(train_df):,} rows")
    print(f"  Test: {len(test_df):,} rows")
    return train_df, test_df


def get_feature_columns(df: pd.DataFrame) -> list:
    """Return the columns usable as model features.

    Drops the target plus raw categorical/identifier columns that were
    already encoded elsewhere; everything else in *df* is kept, in
    original column order.
    """
    # Set membership for O(1) exclusion checks; column order comes from df.
    excluded = {
        TARGET_COLUMN,
        'NWCG_REPORTING_AGENCY', 'STAT_CAUSE_DESCR', 'STATE', 'OWNER_DESCR',
        'COUNTY',
    }
    return [column for column in df.columns if column not in excluded]


def prepare_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple:
    """Split the train/test frames into feature matrices and label vectors.

    Returns:
        (X_train, y_train, X_test, y_test, feature_cols) where X/y are
        numpy arrays extracted from the DataFrames.
    """
    print("\nPreparing data...")

    feature_cols = get_feature_columns(train_df)

    def as_xy(df: pd.DataFrame) -> tuple:
        # One place defines how a frame becomes (features, labels).
        return df[feature_cols].values, df[TARGET_COLUMN].values

    X_train, y_train = as_xy(train_df)
    X_test, y_test = as_xy(test_df)

    print(f"  Features: {len(feature_cols)}")
    print(f"  Feature columns: {feature_cols}")

    return X_train, y_train, X_test, y_test, feature_cols


def compute_weights(y_train: np.ndarray) -> np.ndarray:
    """Return one 'balanced' class weight per training sample.

    Uses sklearn's balanced scheme (n_samples / (n_classes * class_count))
    and expands the per-class weights to a per-row weight array.
    """
    print("\nComputing class weights...")

    classes = np.unique(y_train)
    per_class = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=y_train
    )

    weight_dict = dict(zip(classes, per_class))
    print(f"  Class weights: {weight_dict}")

    # np.unique returns sorted classes, so searchsorted maps each label to
    # its index in `classes`; fancy-indexing then yields the sample weights.
    return per_class[np.searchsorted(classes, y_train)]


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, prefix: str = "") -> dict:
    """Compute, print, and return classification metrics for predictions.

    Includes linearly-weighted Cohen's kappa, which penalizes errors by
    ordinal distance between classes.
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'weighted_f1': f1_score(y_true, y_pred, average='weighted'),
        # Linear weights respect the class ordering (Small < Medium < Large)
        'cohen_kappa': cohen_kappa_score(y_true, y_pred, weights='linear'),
    }

    print(f"\n{prefix} Metrics:" if prefix else "\nMetrics:")

    for name, value in metrics.items():
        print(f"  {name}: {value:.4f}")

    return metrics


def cross_validate(X: np.ndarray, y: np.ndarray, params: dict,
                   sample_weights: np.ndarray = None) -> tuple[float, float]:
    """Run stratified K-fold CV and return (mean, std) of macro-F1.

    Each fold trains a LightGBM booster with early stopping (patience 50)
    on its validation split; the primary metric is macro F1 on that split.
    """
    splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    fold_scores = []

    for train_idx, val_idx in splitter.split(X, y):
        # Slice per-fold weights only when weighting is enabled.
        fold_weights = sample_weights[train_idx] if sample_weights is not None else None

        dtrain = lgb.Dataset(X[train_idx], label=y[train_idx], weight=fold_weights)
        dval = lgb.Dataset(X[val_idx], label=y[val_idx], reference=dtrain)

        booster = lgb.train(
            params,
            dtrain,
            num_boost_round=params.get('n_estimators', 500),
            valid_sets=[dval],
            callbacks=[lgb.early_stopping(50, verbose=False)]
        )

        # predict() yields class probabilities; argmax picks the class label.
        predicted = np.argmax(booster.predict(X[val_idx]), axis=1)
        fold_scores.append(f1_score(y[val_idx], predicted, average='macro'))

    return np.mean(fold_scores), np.std(fold_scores)


def objective(trial: optuna.Trial, X: np.ndarray, y: np.ndarray,
              sample_weights: np.ndarray) -> float:
    """Optuna objective: sample LightGBM params, return mean CV macro-F1.

    The suggest-call order is significant for a seeded sampler's RNG
    stream, so the spec list below preserves the original ordering.
    """
    params = LIGHTGBM_PARAMS.copy()

    # (name, suggest method, extra kwargs) — ranges come from OPTUNA_SEARCH_SPACE.
    spec = [
        ('n_estimators', trial.suggest_int, {}),
        ('max_depth', trial.suggest_int, {}),
        ('learning_rate', trial.suggest_float, {'log': True}),
        ('num_leaves', trial.suggest_int, {}),
        ('min_child_samples', trial.suggest_int, {}),
        ('subsample', trial.suggest_float, {}),
        ('colsample_bytree', trial.suggest_float, {}),
        ('reg_alpha', trial.suggest_float, {}),
        ('reg_lambda', trial.suggest_float, {}),
    ]
    for name, suggest, kwargs in spec:
        params[name] = suggest(name, *OPTUNA_SEARCH_SPACE[name], **kwargs)

    mean_score, _ = cross_validate(X, y, params, sample_weights)
    return mean_score


def tune_hyperparameters(X: np.ndarray, y: np.ndarray,
                         sample_weights: np.ndarray) -> dict:
    """Search LightGBM hyperparameters with Optuna.

    Persists the tuned subset to MODELS_DIR/best_params.json (so non-tune
    runs can reload it) and returns the base params overlaid with the
    best trial's values.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("HYPERPARAMETER TUNING")
    print(banner)

    print(f"\nRunning {N_OPTUNA_TRIALS} Optuna trials...")

    # Seeded TPE sampler keeps the search reproducible across runs.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE)
    )
    study.optimize(
        lambda trial: objective(trial, X, y, sample_weights),
        n_trials=N_OPTUNA_TRIALS,
        show_progress_bar=True
    )

    best = study.best_trial
    print(f"\nBest trial:")
    print(f"  Value (macro F1): {best.value:.4f}")
    print(f"  Params: {best.params}")

    # Only the tuned subset is saved; loaders merge it over LIGHTGBM_PARAMS.
    best_params_path = MODELS_DIR / 'best_params.json'
    with open(best_params_path, 'w') as f:
        json.dump(best.params, f)
    print(f"  Best params saved: {best_params_path}")

    merged = LIGHTGBM_PARAMS.copy()
    merged.update(best.params)
    return merged


def train_final_model(X_train: np.ndarray, y_train: np.ndarray,
                      X_test: np.ndarray, y_test: np.ndarray,
                      params: dict, sample_weights: np.ndarray,
                      feature_names: list) -> tuple[lgb.Booster, dict]:
    """Train the final LightGBM model on the full training set.

    Trains with early stopping (patience 50), prints train/test metrics
    and a per-class classification report, and returns the fitted booster
    together with the test-set metrics.

    Args:
        X_train, y_train: full training features and labels.
        X_test, y_test: held-out features and labels.
        params: LightGBM params; 'n_estimators' caps boosting rounds.
        sample_weights: per-row training weights, or None for unweighted.
        feature_names: column names matching the feature order of X_train.

    Returns:
        (model, test_metrics): the trained booster and the test metric dict.
    """
    print("\n" + "="*60)
    print("TRAINING FINAL MODEL")
    print("="*60)
    
    # Create datasets.
    # NOTE(review): the test split is used as the early-stopping validation
    # set, which leaks test information into round selection — consider a
    # dedicated validation split if strictly unbiased test metrics matter.
    train_data = lgb.Dataset(X_train, label=y_train, weight=sample_weights,
                             feature_name=feature_names)
    val_data = lgb.Dataset(X_test, label=y_test, reference=train_data,
                           feature_name=feature_names)
    
    # Train
    print("\nTraining...")
    model = lgb.train(
        params,
        train_data,
        num_boost_round=params.get('n_estimators', 2000),
        valid_sets=[train_data, val_data],
        valid_names=['train', 'test'],
        callbacks=[
            lgb.early_stopping(50, verbose=True),
            lgb.log_evaluation(period=50)
        ]
    )
    
    # Evaluate
    print("\n" + "-"*40)
    
    # Train predictions (argmax over class probabilities -> class labels)
    y_train_pred = np.argmax(model.predict(X_train), axis=1)
    evaluate_model(y_train, y_train_pred, "Train")
    
    # Test predictions
    y_test_pred = np.argmax(model.predict(X_test), axis=1)
    test_metrics = evaluate_model(y_test, y_test_pred, "Test")
    
    # Classification report
    print("\nClassification Report (Test):")
    print(classification_report(y_test, y_test_pred, target_names=TARGET_CLASS_NAMES))
    
    return model, test_metrics


def save_model(model: lgb.Booster, params: dict, feature_names: list, metrics: dict) -> None:
    """Persist the trained booster plus its metadata under MODELS_DIR.

    Writes the LightGBM model as text and a joblib bundle holding params,
    feature names, metrics, and the target class names.
    """
    print("\nSaving model...")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    # LightGBM's native text format round-trips via lgb.Booster(model_file=...)
    model_path = MODELS_DIR / 'wildfire_model.txt'
    model.save_model(str(model_path))
    print(f"  Model: {model_path}")

    metadata_path = MODELS_DIR / 'model_metadata.joblib'
    joblib.dump(
        {
            'params': params,
            'feature_names': feature_names,
            'metrics': metrics,
            'target_classes': TARGET_CLASS_NAMES,
        },
        metadata_path,
    )
    print(f"  Metadata: {metadata_path}")


def main():
    """Main training pipeline: load, weight, (optionally) tune, train, save."""
    parser = argparse.ArgumentParser(description='Train wildfire classification model')
    parser.add_argument('--tune', action='store_true', help='Run hyperparameter tuning')
    args = parser.parse_args()

    banner = "=" * 60
    print("\n" + banner)
    print("MODEL TRAINING")
    print(banner)

    # Load and prepare data
    train_df, test_df = load_data()
    X_train, y_train, X_test, y_test, feature_cols = prepare_data(train_df, test_df)

    # Per-sample weights only when class weighting is enabled in config
    sample_weights = compute_weights(y_train) if USE_CLASS_WEIGHTS else None

    # Resolve parameters: tune fresh, reload a previous tuning run, or defaults
    if args.tune:
        params = tune_hyperparameters(X_train, y_train, sample_weights)
    else:
        params = LIGHTGBM_PARAMS.copy()
        best_params_path = MODELS_DIR / 'best_params.json'
        if best_params_path.exists():
            with open(best_params_path, 'r') as f:
                params.update(json.load(f))
            print(f"Loaded best params from {best_params_path}")
        else:
            params.update({
                'n_estimators': 500,
                'max_depth': 8,
                'learning_rate': 0.05,
                'num_leaves': 64,
                'min_child_samples': 50,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
            })
            print("No saved params found; using defaults")

    # Train and persist
    model, metrics = train_final_model(
        X_train, y_train, X_test, y_test,
        params, sample_weights, feature_cols
    )
    save_model(model, params, feature_cols, metrics)

    print("\n" + banner)
    print("✓ Training Complete!")
    print(banner + "\n")


# Run the full training pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()