"""
|
|
|
stacking_ensemble_safe.py (FINAL EXTENDED + FULL METRICS)
|
|
|
Stacking Ensemble: XGBoost, CatBoost, LightGBM, AdaBoost + RandomForest Meta Model
|
|
|
|
|
|
Features:
|
|
|
- Safe GPU fallback
|
|
|
- Full metrics logging (accuracy, precision, recall, f1, percentage, etc.)
|
|
|
- JSON-compatible for R Spider Chart
|
|
|
- Auto robustness_score & fold_variance
|
|
|
- Handles NaN, inf, weird column names, and file I/O issues
|
|
|
"""

import argparse
import gc
import json
import os
import time
import warnings
from pathlib import Path

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

# Canonical base-model order. Meta-features must be stacked in this order
# everywhere so the meta-model sees a consistent column layout between
# training (oof_stacking) and evaluation (evaluate).
BASE_KEYS = ["xgboost", "catboost", "lightgbm", "adaboost"]


def load_dataset(path, max_rows=500000):
    """Load a CSV or Parquet dataset, falling back to the first `max_rows`
    rows if a full CSV read exhausts memory."""
    ext = Path(path).suffix.lower()
    print(f"[load_dataset] Loading: {path}")
    try:
        if ext == ".csv":
            try:
                df = pd.read_csv(path)
            except MemoryError:
                print(f"[load_dataset] MemoryError — loading first {max_rows} rows.")
                df = pd.read_csv(path, nrows=max_rows)
        elif ext in [".parquet", ".pq", ".parq"]:
            df = pd.read_parquet(path)
        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        raise RuntimeError(f"[load_dataset] Failed to load dataset: {e}")
    print(f"[load_dataset] Loaded {len(df)} rows × {len(df.columns)} columns.")
    return df


def sanitize_feature_names(df):
    """Replace characters LightGBM cannot handle in column names with underscores."""
    original = df.columns.tolist()
    df.columns = (
        df.columns.astype(str)
        .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
        .str.strip('_')
    )
    renamed = {o: n for o, n in zip(original, df.columns) if o != n}
    if renamed:
        print(f"[sanitize_feature_names] Renamed {len(renamed)} columns for LightGBM safety.")
    return df
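
# For example, a column named "Flow Pkts/s" (hypothetical) becomes "Flow_Pkts_s":
# the regex collapses any run of non-alphanumeric characters into a single "_".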


def detect_target_column(df):
    """Heuristically pick the target column: a well-known name first, then the
    first low-cardinality column, then the last column as a fallback."""
    candidates = ["label", "target", "class", "category", "attack", "output", "y"]
    for c in df.columns:
        if c.lower() in candidates:
            return c
    for c in df.columns:
        if df[c].nunique() <= 50:
            return c
    return df.columns[-1]


def prep_data(df, target=None):
    """Encode the target and object/bool features, clean inf/NaN values, and
    sanitize feature names. Returns X, y, the target name, and the label encoder."""
    if target is None:
        target = detect_target_column(df)
    y = df[target]
    X = df.drop(columns=[target])

    le = LabelEncoder()
    y = le.fit_transform(y.astype(str))

    # Integer-encode categorical/boolean feature columns one at a time.
    for col in X.select_dtypes(include=["object", "bool"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    X = X.replace([np.inf, -np.inf], np.nan)
    X = pd.DataFrame(SimpleImputer(strategy="mean").fit_transform(X), columns=X.columns)
    X = sanitize_feature_names(X)
    return X, y, target, le


def train_base_models(X_train, y_train, X_val):
    """Train the four base models, using the GPU when one is visible, and
    return the fitted models, their validation predictions, and per-model
    training times. A model that fails to train is skipped, not fatal."""
    # The cupy check only confirms a CUDA device is visible; each library may
    # still fall back to CPU internally if its GPU build is unavailable.
    try:
        import cupy
        gpu_ok = cupy.cuda.runtime.getDeviceCount() > 0
    except Exception:
        gpu_ok = False

    device = "gpu" if gpu_ok else "cpu"
    print(f"[train_base_models] Using {device.upper()} mode")

    models, preds, times = {}, {}, {}
    num_cls = len(np.unique(y_train))

    def safe_train(name, fn):
        try:
            start = time.perf_counter()
            print(f"[train_base_models] Training {name} ...")
            m = fn()
            dur = round(time.perf_counter() - start, 2)
            times[name.lower()] = dur
            print(f"[train_base_models] {name} done in {dur:.2f}s")
            return m
        except Exception as e:
            print(f"[train_base_models] {name} failed: {e}")
            times[name.lower()] = 0.0
            return None

    # XGBoost: pass num_class only for multiclass; never pass num_class=None.
    xgb_kwargs = dict(
        n_estimators=50, learning_rate=0.3, max_depth=4,
        tree_method="gpu_hist" if gpu_ok else "hist",
        objective="binary:logistic" if num_cls == 2 else "multi:softmax",
        eval_metric="logloss" if num_cls == 2 else "mlogloss",
        random_state=42, verbosity=0,
    )
    if num_cls > 2:
        xgb_kwargs["num_class"] = num_cls
    xgb = safe_train("XGBoost", lambda: XGBClassifier(**xgb_kwargs).fit(X_train, y_train))
    if xgb:
        preds["xgboost"] = xgb.predict(X_val)
        models["xgboost"] = xgb

    cat_fn = lambda: CatBoostClassifier(
        iterations=100, learning_rate=0.1, depth=6,
        loss_function="Logloss" if num_cls == 2 else "MultiClass",
        task_type="GPU" if gpu_ok else "CPU", verbose=False, random_seed=42
    ).fit(X_train, y_train)
    cat = safe_train("CatBoost", cat_fn)
    if cat:
        preds["catboost"] = cat.predict(X_val)
        models["catboost"] = cat

    # LightGBM: same num_class handling as XGBoost.
    lgb_kwargs = dict(
        n_estimators=50, learning_rate=0.3, max_depth=4,
        device="gpu" if gpu_ok else "cpu",
        objective="binary" if num_cls == 2 else "multiclass",
        random_state=42,
    )
    if num_cls > 2:
        lgb_kwargs["num_class"] = num_cls
    lgbm = safe_train("LightGBM", lambda: lgb.LGBMClassifier(**lgb_kwargs).fit(X_train, y_train))
    if lgbm:
        preds["lightgbm"] = lgbm.predict(X_val)
        models["lightgbm"] = lgbm

    # AdaBoost over shallow decision trees. Note: sklearn >= 1.2 names the
    # parameter `estimator`; older versions call it `base_estimator`.
    ada_fn = lambda: AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=3),
        n_estimators=50, random_state=42
    ).fit(X_train, y_train)
    ada = safe_train("AdaBoost", ada_fn)
    if ada:
        preds["adaboost"] = ada.predict(X_val)
        models["adaboost"] = ada

    gc.collect()
    return models, preds, times
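
# Minimal smoke-test sketch for train_base_models (hypothetical synthetic data;
# this helper is never called on import and exists purely to illustrate the
# function's inputs and outputs).
def _example_smoke_test():
    from sklearn.datasets import make_classification
    Xs, ys = make_classification(n_samples=200, n_features=8, random_state=0)
    Xs = pd.DataFrame(Xs, columns=[f"f{i}" for i in range(8)])
    # Using the training frame as the validation frame here, just for a quick check.
    models, preds, times = train_base_models(Xs, ys, Xs)
    return models, preds, times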


def oof_stacking(X, y, n_folds=5):
    """Generate out-of-fold predictions for each base model via stratified
    K-fold, recording per-fold metrics along the way."""
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    oof, folds = {}, []
    for k in BASE_KEYS:
        oof[k] = np.zeros(len(y), dtype=np.int32)

    for i, (tr, val) in enumerate(skf.split(X, y), start=1):
        print(f"\n[oof_stacking] ==== Fold {i}/{n_folds} ====")
        X_tr, X_val, y_tr, y_val = X.iloc[tr], X.iloc[val], y[tr], y[val]
        try:
            models, preds, times = train_base_models(X_tr, y_tr, X_val)
        except Exception as e:
            print(f"[Fold {i}] Fold skipped: {e}")
            continue

        fold_metrics = {}
        for name, y_pred in preds.items():
            y_pred = np.ravel(y_pred)
            oof[name][val] = y_pred
            acc = accuracy_score(y_val, y_pred)
            pre = precision_score(y_val, y_pred, average='weighted', zero_division=0)
            rec = recall_score(y_val, y_pred, average='weighted', zero_division=0)
            f1v = f1_score(y_val, y_pred, average='weighted', zero_division=0)
            total_v = int((y_pred != 0).sum())
            pct = round(total_v / len(y_pred) * 100, 4)
            is_vul = bool(total_v > 0)

            fold_metrics[name] = {
                "accuracy": float(acc),
                "precision": float(pre),
                "recall": float(rec),
                "f1": float(f1v),
                "total_vulnerable": total_v,
                "percentage": pct,
                "is_vulnerable": is_vul,
                "train_time_sec": float(times.get(name.lower(), 0.0)),
            }
            print(f"[Fold {i}] {name}: acc={acc:.4f}, f1={f1v:.4f}, vuln={pct}%")

        folds.append({"fold": i, "metrics": fold_metrics})
    print("[oof_stacking] Completed all folds.")
    return oof, folds
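
# Design note: the out-of-fold scheme above means every training row receives a
# prediction from a model that never saw it, so the meta-model below is fit on
# leakage-free features rather than on in-sample base-model outputs.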


def train_meta_model(oof_preds, y):
    """Fit the RandomForest meta-model on the stacked out-of-fold predictions."""
    # Stack columns in the canonical BASE_KEYS order so evaluate() can rebuild
    # the same meta-feature layout at test time.
    meta_X = np.column_stack([oof_preds[k] for k in BASE_KEYS if k in oof_preds])
    meta = RandomForestClassifier(n_estimators=50, random_state=42, max_features="sqrt")
    meta.fit(meta_X, y)
    return meta


def evaluate(models, meta, X_test, y_test, times):
    """Score each base model and the meta-model on the held-out test set."""
    results = {}
    for name, m in models.items():
        y_pred = np.ravel(m.predict(X_test))
        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1v = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        total_v = int((y_pred != 0).sum())
        pct = round(total_v / len(y_pred) * 100, 4)
        is_vul = bool(total_v > 0)
        results[name] = {
            "accuracy": acc, "precision": pre, "recall": rec, "f1": f1v,
            "total_vulnerable": total_v, "percentage": pct, "is_vulnerable": is_vul,
            "train_time_sec": float(times.get(name.lower(), 0.0)),
        }
        print(f"[evaluate] {name}: acc={acc:.4f}, f1={f1v:.4f}, vuln={pct}%")

    # Rebuild meta-features in the same BASE_KEYS order used during training.
    meta_X = np.column_stack([np.ravel(models[k].predict(X_test)) for k in BASE_KEYS if k in models])
    y_meta = meta.predict(meta_X)
    results["meta_model"] = {
        "accuracy": accuracy_score(y_test, y_meta),
        "precision": precision_score(y_test, y_meta, average='weighted', zero_division=0),
        "recall": recall_score(y_test, y_meta, average='weighted', zero_division=0),
        "f1": f1_score(y_test, y_meta, average='weighted', zero_division=0),
    }
    return results


def save_summary_json(outdir, target, nrows, class_labels, folds, results):
    """Write summary.json with per-fold metrics, final results, and the
    derived fold_variance / robustness_score."""
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # Mean accuracy per fold (averaged across base models). Low variance across
    # folds implies a more stable model, hence robustness_score = 1 - variance.
    fold_acc = [np.mean([m["accuracy"] for m in f["metrics"].values()]) for f in folds]
    fold_variance = float(np.var(fold_acc))
    robustness_score = float(1 - fold_variance)

    summary = {
        "target_column": target,
        "rows": int(nrows),
        "folds": folds,
        "final_results": results,
        "class_labels": list(class_labels),
        "fold_variance": round(fold_variance, 6),
        "robustness_score": round(robustness_score, 6),
    }
    path = outdir / "summary.json"
    with open(path, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[save_summary_json] Saved to {path}")


def save_models(models, meta_model, outdir):
    """Persist every base model and the meta-model as .pkl files under outdir/models."""
    model_dir = os.path.join(outdir, "models")
    os.makedirs(model_dir, exist_ok=True)

    for name, model in models.items():
        joblib.dump(model, os.path.join(model_dir, f"{name}_model.pkl"))
    joblib.dump(meta_model, os.path.join(model_dir, "meta_model.pkl"))

    print(f"[save_models] All base and meta models saved to {model_dir}")
    return model_dir
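
# Illustrative sketch (assumes the file layout produced by save_models above):
# reloading a saved ensemble for inference with joblib.load. Not called here.
def _example_load_models(model_dir):
    models = {
        name: joblib.load(os.path.join(model_dir, f"{name}_model.pkl"))
        for name in BASE_KEYS
        if os.path.exists(os.path.join(model_dir, f"{name}_model.pkl"))
    }
    meta = joblib.load(os.path.join(model_dir, "meta_model.pkl"))
    return models, meta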


def main(args):
    start = time.perf_counter()
    df = load_dataset(args.dataset)
    X, y, target, le = prep_data(df, args.target_label)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=42,
        stratify=y if len(np.unique(y)) > 1 else None
    )
    # Out-of-fold predictions train the meta-model without leakage; the base
    # models are then refit on the full training split for final evaluation.
    oof_preds, folds = oof_stacking(X_train, y_train, n_folds=args.n_folds)
    meta = train_meta_model(oof_preds, y_train)
    models, _, times = train_base_models(X_train, y_train, X_test)
    results = evaluate(models, meta, X_test, y_test, times)

    save_models(models, meta, args.outdir)
    save_summary_json(args.outdir, target, len(df), le.classes_, folds, results)

    total_time = round(time.perf_counter() - start, 2)
    print(f"\nCompleted in {total_time} sec")

    # Append the wall-clock time to the summary written above.
    summary_path = Path(args.outdir) / "summary.json"
    if summary_path.exists():
        with open(summary_path, "r+") as f:
            data = json.load(f)
            data["total_train_time_sec"] = total_time
            f.seek(0)
            json.dump(data, f, indent=2)
            f.truncate()
        print(f"[main] total_train_time_sec={total_time} saved.")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--dataset", required=True)
    p.add_argument("--outdir", required=True)
    p.add_argument("--target-label", default=None)
    p.add_argument("--test-size", type=float, default=0.2)
    p.add_argument("--n-folds", type=int, default=5)
    args = p.parse_args()
    main(args)