# NOTE: The following lines are Hugging Face web-page residue captured when
# this file was downloaded; commented out so the module remains valid Python.
# mrsindhunugroho's picture
# Upload folder using huggingface_hub
# 1162e72 verified
# raw / history blame / 14.1 kB
#!/usr/bin/env python3
"""
stacking_ensemble_safe.py (FINAL EXTENDED + FULL METRICS)
Stacking Ensemble: XGBoost, CatBoost, LightGBM, AdaBoost + RandomForest Meta Model
Features:
- Safe GPU fallback
- Full metrics logging (accuracy, precision, recall, f1, percentage, etc.)
- JSON-compatible for R Spider Chart
- Auto robustness_score & fold_variance
- Handles NaN, inf, weird column names, and file I/O issues
"""
import os, json, time, warnings, argparse, gc
from huggingface_hub import HfApi, upload_file, create_repo
import shutil
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import joblib
warnings.filterwarnings("ignore")
# ==============================================================
# SAFE LOADING
# ==============================================================
def load_dataset(path, max_rows=500000):
    """Load a CSV or Parquet dataset into a DataFrame.

    Falls back to reading only the first ``max_rows`` rows of a CSV when a
    MemoryError occurs on the full read.

    Parameters
    ----------
    path : str or Path
        Location of a ``.csv``, ``.parquet``, ``.pq``, or ``.parq`` file.
    max_rows : int, optional
        Row cap used only for the CSV MemoryError fallback.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    RuntimeError
        If the file cannot be read or has an unsupported extension.
    """
    ext = Path(path).suffix.lower()
    print(f"[load_dataset] Loading: {path}")
    try:
        if ext == ".csv":
            try:
                df = pd.read_csv(path)
            except MemoryError:
                # Large-file fallback: keep only a bounded prefix.
                print(f"[load_dataset] MemoryError — loading first {max_rows} rows.")
                df = pd.read_csv(path, nrows=max_rows)
        elif ext in (".parquet", ".pq", ".parq"):
            df = pd.read_parquet(path)
        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        # Chain the original exception so the root cause is preserved.
        raise RuntimeError(f"[load_dataset] Failed to load dataset: {e}") from e
    print(f"[load_dataset] Loaded {len(df)} rows × {len(df.columns)} columns.")
    return df
# ==============================================================
# SANITIZE FEATURE NAMES
# ==============================================================
def sanitize_feature_names(df):
    """Make column names safe for LightGBM.

    Collapses every run of non-alphanumeric characters to a single
    underscore and trims leading/trailing underscores. Mutates the
    columns of *df* in place and returns the same DataFrame.
    """
    before = list(df.columns)
    cleaned = (
        df.columns.astype(str)
        .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
        .str.strip('_')
    )
    df.columns = cleaned
    renamed_map = {old: new for old, new in zip(before, cleaned) if old != new}
    if renamed_map:
        print(f"[sanitize_feature_names] Renamed {len(renamed_map)} columns for LightGBM safety.")
    return df
# ==============================================================
# TARGET DETECTION
# ==============================================================
def detect_target_column(df):
    """Heuristically pick the label column of *df*.

    Preference order: a column whose lowercased name matches a known
    label alias, then the first low-cardinality column (<= 50 unique
    values), then the last column as a fallback.
    """
    aliases = ("label", "target", "class", "category", "attack", "output", "y")
    by_name = next((c for c in df.columns if c.lower() in aliases), None)
    if by_name is not None:
        return by_name
    low_card = next((c for c in df.columns if df[c].nunique() <= 50), None)
    if low_card is not None:
        return low_card
    return df.columns[-1]
# ==============================================================
# DATA PREP
# ==============================================================
def prep_data(df, target=None):
    """Split *df* into a feature matrix X and an encoded label vector y.

    - Auto-detects the target column when *target* is None.
    - Label-encodes the target and every object/bool feature column.
    - Replaces +/-inf with NaN, then mean-imputes missing values.
    - Sanitizes column names for LightGBM.

    Returns (X, y, target_name, fitted_label_encoder).
    """
    if target is None:
        target = detect_target_column(df)
    y_raw = df[target]
    X = df.drop(columns=[target])
    le = LabelEncoder()
    y = le.fit_transform(y_raw.astype(str))
    for col in X.select_dtypes(include=["object", "bool"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))
    X = X.replace([np.inf, -np.inf], np.nan)
    # BUG FIX: SimpleImputer silently drops all-NaN columns, which would make
    # the DataFrame reconstruction below fail on a column-count mismatch.
    # Zero-fill fully-missing columns before imputation so the shape is stable.
    all_nan_cols = X.columns[X.isna().all()]
    if len(all_nan_cols):
        X[all_nan_cols] = 0.0
    X = pd.DataFrame(SimpleImputer(strategy="mean").fit_transform(X), columns=X.columns)
    X = sanitize_feature_names(X)
    return X, y, target, le
# ==============================================================
# TRAIN BASE MODELS
# ==============================================================
def train_base_models(X_train, y_train, X_val):
    """Train the four base classifiers and predict on the validation split.

    GPU mode is used when cupy reports at least one CUDA device; otherwise
    every library falls back to CPU. A model that fails to train is logged
    and skipped (recorded with a 0.0 training time) instead of aborting.

    Returns (models, preds, times) — fitted models, validation-set
    predictions, and training durations in seconds, all keyed by the
    lowercase model name ("xgboost", "catboost", "lightgbm", "adaboost").
    """
    try:
        import cupy
        gpu_ok = cupy.cuda.runtime.getDeviceCount() > 0
    except Exception:
        gpu_ok = False
    device = "gpu" if gpu_ok else "cpu"
    print(f"[train_base_models] Using {device.upper()} mode")
    models, preds, times = {}, {}, {}
    num_cls = len(np.unique(y_train))

    def safe_train(name, fn):
        # Train one model while timing it; on failure log and record 0.0 so
        # downstream times.get(name.lower(), ...) lookups stay consistent.
        try:
            start = time.perf_counter()
            print(f"[train_base_models] Training {name} ...")
            m = fn()
            dur = round(time.perf_counter() - start, 2)
            times[name.lower()] = dur
            print(f"[train_base_models] {name} done in {dur:.2f}s")
            return m
        except Exception as e:
            print(f"[train_base_models] {name} failed: {e}")
            # BUG FIX: was `times[name] = 0.0` (display name, e.g. "XGBoost"),
            # which the lowercase lookups elsewhere could never find.
            times[name.lower()] = 0.0
            return None

    # XGBoost
    xgb_fn = lambda: XGBClassifier(
        n_estimators=50, learning_rate=0.3, max_depth=4,
        tree_method="gpu_hist" if gpu_ok else "hist",
        objective="binary:logistic" if num_cls == 2 else "multi:softmax",
        num_class=num_cls if num_cls > 2 else None,
        use_label_encoder=False, eval_metric="logloss", random_state=42, verbosity=0
    ).fit(X_train, y_train)
    xgb = safe_train("XGBoost", xgb_fn)
    if xgb:
        preds["xgboost"] = xgb.predict(X_val)
        models["xgboost"] = xgb
    # CatBoost
    cat_fn = lambda: CatBoostClassifier(
        iterations=100, learning_rate=0.1, depth=6,
        loss_function="Logloss" if num_cls == 2 else "MultiClass",
        task_type="GPU" if gpu_ok else "CPU", verbose=False, random_seed=42
    ).fit(X_train, y_train)
    cat = safe_train("CatBoost", cat_fn)
    if cat:
        preds["catboost"] = cat.predict(X_val)
        models["catboost"] = cat
    # LightGBM
    lgb_fn = lambda: lgb.LGBMClassifier(
        n_estimators=50, learning_rate=0.3, max_depth=4,
        device="gpu" if gpu_ok else "cpu",
        objective="binary" if num_cls == 2 else "multiclass",
        num_class=num_cls if num_cls > 2 else None, random_state=42
    ).fit(X_train, y_train)
    lgbm = safe_train("LightGBM", lgb_fn)
    if lgbm:
        preds["lightgbm"] = lgbm.predict(X_val)
        models["lightgbm"] = lgbm
    # AdaBoost
    ada_fn = lambda: AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=3),
        n_estimators=50, random_state=42
    ).fit(X_train, y_train)
    ada = safe_train("AdaBoost", ada_fn)
    if ada:
        preds["adaboost"] = ada.predict(X_val)
        models["adaboost"] = ada
    gc.collect()
    return models, preds, times
# ==============================================================
# OOF STACKING (WITH FULL METRICS)
# ==============================================================
def oof_stacking(X, y, n_folds=5):
    """Generate out-of-fold (OOF) predictions for the stacking meta-learner.

    Runs a stratified K-fold loop: each fold trains all base models on the
    train split and predicts the held-out split, collecting per-model
    metrics along the way.

    Returns (oof, folds): ``oof`` maps each base-model name to an int32
    vector of OOF predicted labels aligned with *y* (zeros where a model
    failed), and ``folds`` is a list of {"fold": i, "metrics": {...}} dicts.
    """
    base_names = ("xgboost", "catboost", "lightgbm", "adaboost")
    oof = {name: np.zeros(len(y), dtype=np.int32) for name in base_names}
    fold_reports = []
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X, y), start=1):
        print(f"\n[oof_stacking] ==== Fold {fold_no}/{n_folds} ====")
        X_tr, y_tr = X.iloc[train_idx], y[train_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]
        try:
            _, fold_preds, fold_times = train_base_models(X_tr, y_tr, X_val)
        except Exception as e:
            # A broken fold is logged and skipped rather than aborting the run.
            print(f"[Fold {fold_no}] Fold skipped: {e}")
            continue
        metrics_by_model = {}
        for model_name, raw_pred in fold_preds.items():
            flat_pred = np.ravel(raw_pred)
            oof[model_name][val_idx] = flat_pred
            acc = accuracy_score(y_val, flat_pred)
            pre = precision_score(y_val, flat_pred, average='weighted', zero_division=0)
            rec = recall_score(y_val, flat_pred, average='weighted', zero_division=0)
            f1v = f1_score(y_val, flat_pred, average='weighted', zero_division=0)
            # "Vulnerable" = any prediction of a non-zero class label.
            vulnerable = int((flat_pred != 0).sum())
            pct = round(vulnerable / len(flat_pred) * 100, 4)
            metrics_by_model[model_name] = {
                "accuracy": float(acc),
                "precision": float(pre),
                "recall": float(rec),
                "f1": float(f1v),
                "total_vulnerable": vulnerable,
                "percentage": pct,
                "is_vulnerable": bool(vulnerable > 0),
                "train_time_sec": float(fold_times.get(model_name.lower(), 0.0))
            }
            print(f"[Fold {fold_no}] {model_name}: acc={acc:.4f}, f1={f1v:.4f}, vuln={pct}%")
        fold_reports.append({"fold": fold_no, "metrics": metrics_by_model})
    print("[oof_stacking] Completed all folds.")
    return oof, fold_reports
# ==============================================================
# META MODEL & EVALUATION
# ==============================================================
def train_meta_model(oof_preds, y):
    """Fit the RandomForest meta-learner on column-stacked OOF predictions.

    Feature column order follows the insertion order of *oof_preds*.
    """
    stacked = np.column_stack(list(oof_preds.values()))
    clf = RandomForestClassifier(n_estimators=50, random_state=42, max_features="sqrt")
    clf.fit(stacked, y)
    return clf
def evaluate(models, meta, X_test, y_test, times):
    """Score each base model and the stacked meta model on the test set.

    Returns a dict mapping model name -> metrics (accuracy/precision/
    recall/f1, vulnerability counts, train time), plus a "meta_model" entry.

    The meta model was trained on 4 OOF columns in a fixed order (zeros
    where a base model failed), so the meta feature matrix here is built
    in that same order, substituting a zero column for any base model that
    is missing from *models* — previously this crashed on a column-count
    mismatch when any base model had failed to train.
    """
    results, cached_preds = {}, {}
    for name, m in models.items():
        # Ravel handles CatBoost's (n, 1)-shaped predictions.
        y_pred = np.ravel(m.predict(X_test))
        cached_preds[name] = y_pred
        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1v = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        total_v = int((y_pred != 0).sum())
        pct = round(total_v / len(y_pred) * 100, 4)
        is_vul = bool(total_v > 0)
        results[name] = {
            "accuracy": acc, "precision": pre, "recall": rec, "f1": f1v,
            "total_vulnerable": total_v, "percentage": pct, "is_vulnerable": is_vul,
            "train_time_sec": float(times.get(name.lower(), 0.0))
        }
        print(f"[evaluate] {name}: acc={acc:.4f}, f1={f1v:.4f}, vuln={pct}%")
    # Reuse the cached predictions (no second predict pass) and keep the
    # column order aligned with the OOF training matrix.
    base_order = ("xgboost", "catboost", "lightgbm", "adaboost")
    zero_col = np.zeros(len(X_test), dtype=np.int32)
    meta_X = np.column_stack([cached_preds.get(k, zero_col) for k in base_order])
    y_meta = meta.predict(meta_X)
    results["meta_model"] = {
        "accuracy": accuracy_score(y_test, y_meta),
        "precision": precision_score(y_test, y_meta, average='weighted', zero_division=0),
        "recall": recall_score(y_test, y_meta, average='weighted', zero_division=0),
        "f1": f1_score(y_test, y_meta, average='weighted', zero_division=0)
    }
    return results
# ==============================================================
# SAVE SUMMARY
# ==============================================================
def save_summary_json(outdir, target, nrows, class_labels, folds, results):
    """Write summary.json (per-fold metrics, final results, robustness
    statistics) into *outdir*, creating the directory if needed.

    fold_variance is the variance of the per-fold mean accuracies and
    robustness_score is 1 - fold_variance. Previously an empty *folds*
    list (or a fold with no metrics) produced NaN via np.mean/np.var,
    which json.dump serializes as a non-standard ``NaN`` token; both
    cases now fall back to 0.0 variance.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    # Mean accuracy per fold -> variance across folds -> robustness score.
    fold_acc = [
        np.mean([m["accuracy"] for m in f["metrics"].values()])
        for f in folds if f["metrics"]
    ]
    fold_variance = float(np.var(fold_acc)) if fold_acc else 0.0
    robustness_score = float(1 - fold_variance)
    summary = {
        "target_column": target,
        "rows": int(nrows),
        "folds": folds,
        "final_results": results,
        "class_labels": list(class_labels),
        "fold_variance": round(fold_variance, 6),
        "robustness_score": round(robustness_score, 6)
    }
    path = outdir / "summary.json"
    with open(path, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[save_summary_json] Saved to {path}")
# ==============================================================
# HUGGINGFACE UPLOAD (placeholder: no upload code is implemented here,
# so the huggingface_hub imports at the top are currently unused)
# ==============================================================
# ==============================================================
# SAVE MODELS LOCALLY
# ==============================================================
def save_models(models, meta_model, outdir):
    """Persist every base model plus the meta model as ``.pkl`` files
    under ``<outdir>/models`` and return that directory path (as str)."""
    model_dir = str(Path(outdir) / "models")
    Path(model_dir).mkdir(parents=True, exist_ok=True)
    for model_name, fitted in models.items():
        joblib.dump(fitted, str(Path(model_dir) / f"{model_name}_model.pkl"))
    joblib.dump(meta_model, str(Path(model_dir) / "meta_model.pkl"))
    print(f"[save_models] All base and meta models saved to {model_dir}")
    return model_dir
# ==============================================================
# MAIN
# ==============================================================
def main(args):
    """End-to-end pipeline: load -> prep -> OOF stacking -> meta model ->
    final evaluation -> persist models and summary.json (with total runtime).

    Expects *args* with: dataset, outdir, target_label, test_size, n_folds.
    """
    start = time.perf_counter()
    df = load_dataset(args.dataset)
    X, y, target, le = prep_data(df, args.target_label)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=42,
        stratify=y if len(np.unique(y)) > 1 else None
    )
    oof_preds, folds = oof_stacking(X_train, y_train, n_folds=args.n_folds)
    meta = train_meta_model(oof_preds, y_train)
    # Retrain the base models on the full training split for final evaluation.
    models, _, times = train_base_models(X_train, y_train, X_test)
    results = evaluate(models, meta, X_test, y_test, times)
    # Persist models and the analysis summary.
    save_models(models, meta, args.outdir)
    # BUG FIX: the summary was previously written twice (identical calls);
    # write it once, then patch in the total runtime below.
    save_summary_json(args.outdir, target, len(df), le.classes_, folds, results)
    total_time = round(time.perf_counter() - start, 2)
    print(f"\n Completed in {total_time} sec")
    # Append the total runtime to the already-saved JSON in place.
    summary_path = Path(args.outdir) / "summary.json"
    if summary_path.exists():
        with open(summary_path, "r+") as f:
            data = json.load(f)
            data["total_train_time_sec"] = total_time
            f.seek(0)
            json.dump(data, f, indent=2)
            f.truncate()
        print(f"[save_summary_json] total_train_time_sec={total_time} saved.")
if __name__ == "__main__":
    # CLI entry point: required dataset/output paths plus tuning knobs.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True)
    parser.add_argument("--outdir", required=True)
    parser.add_argument("--target-label", default=None)
    parser.add_argument("--test-size", type=float, default=0.2)
    parser.add_argument("--n-folds", type=int, default=5)
    main(parser.parse_args())