import os
import json
import logging
from datetime import datetime

import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (precision_recall_fscore_support, roc_auc_score,
                             average_precision_score, confusion_matrix,
                             precision_recall_curve)

logger = logging.getLogger('nids')


class BinaryLabelEncoder:
    """Simple encoder mapping: BENIGN -> 0, anything else -> 1.

    Provides transform/inverse_transform and a `classes_`-compatible attribute.
    """

    def __init__(self):
        self.classes_ = np.array([0, 1])

    def transform(self, y_series):
        y_str = np.array(y_series).astype(str)
        return (y_str != 'BENIGN').astype(int)

    def inverse_transform(self, y_arr):
        y = np.array(y_arr).astype(int)
        return np.where(y == 0, 'BENIGN', 'ATTACK')
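

# Illustrative usage of the encoder (a sketch; `df['Label']` here is a
# placeholder for a pandas Series of raw label strings):
#
#     enc = BinaryLabelEncoder()
#     y = enc.transform(df['Label'])        # e.g. ['BENIGN', 'DDoS'] -> [0, 1]
#     names = enc.inverse_transform(y)      # -> ['BENIGN', 'ATTACK']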


def validate_and_select_features(df, features):
    missing = [c for c in features if c not in df.columns]
    if missing:
        raise ValueError(f"Missing feature columns: {missing}")
    X = df[features].copy()

    nunique = X.nunique()
    const_cols = nunique[nunique <= 1].index.tolist()
    if const_cols:
        logger.info('Dropping constant columns: %s', const_cols)
        X.drop(columns=const_cols, inplace=True)
    return X
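

# Illustrative call (a sketch; `flows_df` and the feature names are placeholders):
#
#     X = validate_and_select_features(flows_df, ['Flow Duration', 'Total Fwd Packets'])
#
# Note that the returned frame may contain fewer columns than requested,
# because constant columns are dropped.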


def train_model_cv(df, features, target='Label', n_splits=5, n_estimators=100, max_depth=None, seed=42):
    """Train RandomForest with StratifiedKFold and return best model plus metrics.

    - Uses class_weight='balanced' to handle class imbalance (no SMOTE).
    - Computes precision, recall, F1, PR-AUC, ROC-AUC and confusion matrices per fold.
    """
    encoder = BinaryLabelEncoder()
    y_raw = df[target].astype(str)
    y = encoder.transform(y_raw)
    X = validate_and_select_features(df, features)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_metrics = []
    models = []

    all_val_probas = []
    all_val_labels = []

    X_arr = X.values
    y_arr = np.asarray(y)
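
    # Cross-validation loop: fit one forest per fold, evaluate on the held-out
    # split, and pool the validation probabilities for a PR curve computed later.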
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_arr, y_arr), start=1):
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=(None if max_depth == 0 else max_depth),
            class_weight='balanced',
            random_state=seed,
            n_jobs=-1,
        )
        clf.fit(X_arr[train_idx], y_arr[train_idx])

        proba = clf.predict_proba(X_arr[val_idx])[:, 1]
        preds = (proba >= 0.5).astype(int)

        all_val_probas.extend(proba.tolist())
        all_val_labels.extend(y_arr[val_idx].tolist())

        prec, rec, f1, _ = precision_recall_fscore_support(y_arr[val_idx], preds, average='binary', zero_division=0)
        pr_auc = average_precision_score(y_arr[val_idx], proba)
        try:
            roc = roc_auc_score(y_arr[val_idx], proba)
        except Exception:
            roc = float('nan')

        cm = confusion_matrix(y_arr[val_idx], preds).tolist()
        fold_metrics.append({
            'fold': fold,
            'precision': float(prec),
            'recall': float(rec),
            'f1': float(f1),
            'pr_auc': float(pr_auc),
            'roc_auc': float(roc),
            'confusion_matrix': cm
        })
        models.append(clf)
        logger.info('Fold %d metrics: prec=%.3f rec=%.3f f1=%.3f pr_auc=%.3f', fold, prec, rec, f1, pr_auc)
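
    # The fold model with the highest validation F1 is kept for persistence.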
    best_idx = int(np.argmax([m['f1'] for m in fold_metrics]))
    best_model = models[best_idx]

    agg = {}
    for k in ['precision', 'recall', 'f1', 'pr_auc', 'roc_auc']:
        vals = [fm[k] for fm in fold_metrics if not np.isnan(fm[k])]
        agg[f'{k}_mean'] = float(np.mean(vals)) if vals else float('nan')
        agg[f'{k}_std'] = float(np.std(vals)) if vals else float('nan')

    results = {'folds': fold_metrics, 'aggregate': agg}
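
    # Precision-recall curve over the pooled out-of-fold predictions.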
    all_val_probas = np.array(all_val_probas)
    all_val_labels = np.array(all_val_labels)
    precision, recall, pr_thresholds = precision_recall_curve(all_val_labels, all_val_probas)

    results['pr_curve'] = {
        'precision': precision.tolist(),
        'recall': recall.tolist(),
        'thresholds': pr_thresholds.tolist()
    }

    ts = datetime.utcnow().isoformat() + 'Z'
    results['timestamp'] = ts
    results['seed'] = int(seed)
    results['features'] = list(X.columns)
    results['cv_validation_counts'] = int(len(all_val_labels))
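
    # Persist the best fold model and the CV metrics under relative paths.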
    models_dir = 'models'
    metrics_dir = 'metrics'
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(metrics_dir, exist_ok=True)

    model_path = os.path.join(models_dir, 'rf_model.joblib')
    metrics_path = os.path.join(metrics_dir, 'training_metrics.json')

    joblib.dump(best_model, model_path)
    with open(metrics_path, 'w') as fh:
        json.dump(results, fh, indent=2)
    logger.info('Training complete. Metrics saved to %s, model saved to %s', metrics_path, model_path)

    return best_model, results, X, y, all_val_probas, all_val_labels, encoder
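

# Illustrative entry point (a sketch only; the CSV path and feature names below
# are placeholders and must be adapted to the actual dataset):
if __name__ == '__main__':
    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    flows = pd.read_csv('data/flows.csv')                       # placeholder path
    feature_cols = ['Flow Duration', 'Total Fwd Packets']       # placeholder features
    train_model_cv(flows, feature_cols, target='Label', n_splits=5)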