#!/usr/bin/env python3
"""
stacking_ensemble_safe.py (FINAL EXTENDED + FULL METRICS)

Stacking Ensemble: XGBoost, CatBoost, LightGBM, AdaBoost + RandomForest Meta Model

Features:
- Safe GPU fallback
- Full metrics logging (accuracy, precision, recall, f1, percentage, etc.)
- JSON-compatible for R Spider Chart
- Auto robustness_score & fold_variance
- Handles NaN, inf, weird column names, and file I/O issues
"""

import os
import json
import time
import warnings
import argparse
import gc
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import joblib

warnings.filterwarnings("ignore")

# ==============================================================
# SAFE LOADING
# ==============================================================
def load_dataset(path, max_rows=500000):
    """Load a CSV or Parquet file, falling back to a row-limited read on MemoryError."""
    ext = Path(path).suffix.lower()
    print(f"[load_dataset] Loading: {path}")
    try:
        if ext == ".csv":
            try:
                df = pd.read_csv(path)
            except MemoryError:
                print(f"[load_dataset] MemoryError — loading first {max_rows} rows.")
                df = pd.read_csv(path, nrows=max_rows)
        elif ext in (".parquet", ".pq", ".parq"):
            df = pd.read_parquet(path)
        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        raise RuntimeError(f"[load_dataset] Failed to load dataset: {e}")
    print(f"[load_dataset] Loaded {len(df)} rows × {len(df.columns)} columns.")
    return df

# ==============================================================
# SANITIZE FEATURE NAMES
# ==============================================================
def sanitize_feature_names(df):
    """Replace non-alphanumeric characters in column names; LightGBM rejects
    JSON-special characters in feature names."""
    original = df.columns.tolist()
    df.columns = (
        df.columns.astype(str)
        .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
        .str.strip('_')
    )
    renamed = {o: n for o, n in zip(original, df.columns) if o != n}
    if renamed:
        print(f"[sanitize_feature_names] Renamed {len(renamed)} columns for LightGBM safety.")
    return df

# ==============================================================
# TARGET DETECTION
# ==============================================================
def detect_target_column(df):
    """Heuristic target detection: known label names first, then the first
    low-cardinality column, falling back to the last column."""
    candidates = ["label", "target", "class", "category", "attack", "output", "y"]
    for c in df.columns:
        if c.lower() in candidates:
            return c
    for c in df.columns:
        if df[c].nunique() <= 50:
            return c
    return df.columns[-1]

# ==============================================================
# DATA PREP
# ==============================================================
def prep_data(df, target=None):
    """Split features/target, label-encode categoricals, and impute NaN/inf values."""
    if target is None:
        target = detect_target_column(df)
    y = df[target]
    X = df.drop(columns=[target])

    le = LabelEncoder()
    y = le.fit_transform(y.astype(str))

    for col in X.select_dtypes(include=["object", "bool"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    X = X.replace([np.inf, -np.inf], np.nan)
    X = pd.DataFrame(SimpleImputer(strategy="mean").fit_transform(X), columns=X.columns)
    X = sanitize_feature_names(X)
    return X, y, target, le
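
# --------------------------------------------------------------
# Example (illustrative, not executed): typical use of the
# loading/prep pipeline above. The file name and the "label"
# column are assumptions for this sketch, not requirements of
# the script.
#
#   df = load_dataset("data/network_flows.csv")
#   X, y, target, le = prep_data(df, target="label")
#   print(f"target={target}, X={X.shape}, classes={list(le.classes_)}")
# --------------------------------------------------------------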

# ==============================================================
# TRAIN BASE MODELS
# ==============================================================
def train_base_models(X_train, y_train, X_val):
    """Train the four base learners with safe GPU fallback; return fitted models,
    their predictions on X_val, and per-model training times."""
    try:
        import cupy
        gpu_ok = cupy.cuda.runtime.getDeviceCount() > 0
    except Exception:
        gpu_ok = False
    device = "gpu" if gpu_ok else "cpu"
    print(f"[train_base_models] Using {device.upper()} mode")

    models, preds, times = {}, {}, {}
    num_cls = len(np.unique(y_train))

    def safe_train(name, fn):
        try:
            start = time.perf_counter()
            print(f"[train_base_models] Training {name} ...")
            m = fn()
            dur = round(time.perf_counter() - start, 2)
            times[name.lower()] = dur
            print(f"[train_base_models] {name} done in {dur:.2f}s")
            return m
        except Exception as e:
            print(f"[train_base_models] {name} failed: {e}")
            times[name.lower()] = 0.0  # keep keys lowercase, matching the success path
            return None

    # XGBoost (use_label_encoder is ignored by recent XGBoost; kept for older versions)
    xgb_fn = lambda: XGBClassifier(
        n_estimators=50, learning_rate=0.3, max_depth=4,
        tree_method="gpu_hist" if gpu_ok else "hist",
        objective="binary:logistic" if num_cls == 2 else "multi:softmax",
        num_class=num_cls if num_cls > 2 else None,
        use_label_encoder=False, eval_metric="logloss",
        random_state=42, verbosity=0
    ).fit(X_train, y_train)
    xgb = safe_train("XGBoost", xgb_fn)
    if xgb:
        preds["xgboost"] = xgb.predict(X_val)
        models["xgboost"] = xgb

    # CatBoost
    cat_fn = lambda: CatBoostClassifier(
        iterations=100, learning_rate=0.1, depth=6,
        loss_function="Logloss" if num_cls == 2 else "MultiClass",
        task_type="GPU" if gpu_ok else "CPU",
        verbose=False, random_seed=42
    ).fit(X_train, y_train)
    cat = safe_train("CatBoost", cat_fn)
    if cat:
        preds["catboost"] = cat.predict(X_val)
        models["catboost"] = cat

    # LightGBM (num_class is inferred from y by the sklearn wrapper, so it is not passed)
    lgb_fn = lambda: lgb.LGBMClassifier(
        n_estimators=50, learning_rate=0.3, max_depth=4,
        device="gpu" if gpu_ok else "cpu",
        objective="binary" if num_cls == 2 else "multiclass",
        random_state=42
    ).fit(X_train, y_train)
    lgbm = safe_train("LightGBM", lgb_fn)
    if lgbm:
        preds["lightgbm"] = lgbm.predict(X_val)
        models["lightgbm"] = lgbm

    # AdaBoost (the `estimator` keyword requires scikit-learn >= 1.2)
    ada_fn = lambda: AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=3),
        n_estimators=50, random_state=42
    ).fit(X_train, y_train)
    ada = safe_train("AdaBoost", ada_fn)
    if ada:
        preds["adaboost"] = ada.predict(X_val)
        models["adaboost"] = ada

    gc.collect()
    return models, preds, times
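
# --------------------------------------------------------------
# Example (illustrative smoke test, not executed): exercising the
# base trainer on synthetic data. make_classification is only a
# stand-in for a real dataset here.
#
#   from sklearn.datasets import make_classification
#   Xs, ys = make_classification(n_samples=300, n_features=10, random_state=0)
#   Xs = pd.DataFrame(Xs, columns=[f"f{i}" for i in range(10)])
#   models, preds, times = train_base_models(Xs, ys, Xs)
#   # preds maps "xgboost"/"catboost"/"lightgbm"/"adaboost"
#   # to that model's predictions on the X_val argument
# --------------------------------------------------------------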

# ==============================================================
# OOF STACKING (WITH FULL METRICS)
# ==============================================================
def oof_stacking(X, y, n_folds=5):
    """Generate out-of-fold predictions per base model and collect per-fold metrics.
    Skipped folds leave zeros in the OOF arrays."""
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    oof, folds = {}, []
    for k in ["xgboost", "catboost", "lightgbm", "adaboost"]:
        oof[k] = np.zeros(len(y), dtype=np.int32)

    for i, (tr, val) in enumerate(skf.split(X, y), start=1):
        print(f"\n[oof_stacking] ==== Fold {i}/{n_folds} ====")
        X_tr, X_val, y_tr, y_val = X.iloc[tr], X.iloc[val], y[tr], y[val]
        try:
            models, preds, times = train_base_models(X_tr, y_tr, X_val)
        except Exception as e:
            print(f"[Fold {i}] Fold skipped: {e}")
            continue

        fold_metrics = {}
        for name, y_pred in preds.items():
            y_pred = np.ravel(y_pred)  # CatBoost returns shape (n, 1)
            oof[name][val] = y_pred
            acc = accuracy_score(y_val, y_pred)
            pre = precision_score(y_val, y_pred, average='weighted', zero_division=0)
            rec = recall_score(y_val, y_pred, average='weighted', zero_division=0)
            f1v = f1_score(y_val, y_pred, average='weighted', zero_division=0)
            total_v = int((y_pred != 0).sum())
            pct = round(total_v / len(y_pred) * 100, 4)
            is_vul = bool(total_v > 0)
            fold_metrics[name] = {
                "accuracy": float(acc),
                "precision": float(pre),
                "recall": float(rec),
                "f1": float(f1v),
                "total_vulnerable": total_v,
                "percentage": pct,
                "is_vulnerable": is_vul,
                "train_time_sec": float(times.get(name.lower(), 0.0))
            }
            print(f"[Fold {i}] {name}: acc={acc:.4f}, f1={f1v:.4f}, vuln={pct}%")
        folds.append({"fold": i, "metrics": fold_metrics})

    print("[oof_stacking] Completed all folds.")
    return oof, folds

# ==============================================================
# META MODEL & EVALUATION
# ==============================================================
def train_meta_model(oof_preds, y):
    """Fit the RandomForest meta model on the stacked out-of-fold base predictions."""
    meta_X = np.column_stack([oof_preds[k] for k in oof_preds])
    meta = RandomForestClassifier(n_estimators=50, random_state=42, max_features="sqrt")
    meta.fit(meta_X, y)
    return meta

def evaluate(models, meta, X_test, y_test, times):
    """Score each base model and the meta model on the held-out test set."""
    results = {}
    for name, m in models.items():
        y_pred = np.ravel(m.predict(X_test))  # flatten CatBoost's (n, 1) output
        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1v = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        total_v = int((y_pred != 0).sum())
        pct = round(total_v / len(y_pred) * 100, 4)
        is_vul = bool(total_v > 0)
        results[name] = {
            "accuracy": acc,
            "precision": pre,
            "recall": rec,
            "f1": f1v,
            "total_vulnerable": total_v,
            "percentage": pct,
            "is_vulnerable": is_vul,
            "train_time_sec": float(times.get(name.lower(), 0.0))
        }
        print(f"[evaluate] {name}: acc={acc:.4f}, f1={f1v:.4f}, vuln={pct}%")

    # Meta features must match the model order (and count) used during meta training
    meta_X = np.column_stack([np.ravel(models[k].predict(X_test)) for k in models])
    y_meta = meta.predict(meta_X)
    results["meta_model"] = {
        "accuracy": accuracy_score(y_test, y_meta),
        "precision": precision_score(y_test, y_meta, average='weighted', zero_division=0),
        "recall": recall_score(y_test, y_meta, average='weighted', zero_division=0),
        "f1": f1_score(y_test, y_meta, average='weighted', zero_division=0)
    }
    return results

# ==============================================================
# SAVE SUMMARY
# ==============================================================
def save_summary_json(outdir, target, nrows, class_labels, folds, results):
    """Write all fold and final metrics to summary.json, including robustness stats."""
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # Fold variance & robustness (guard against an empty fold list)
    fold_acc = [np.mean([m["accuracy"] for m in f["metrics"].values()]) for f in folds]
    fold_variance = float(np.var(fold_acc)) if fold_acc else 0.0
    robustness_score = float(1 - fold_variance)

    summary = {
        "target_column": target,
        "rows": int(nrows),
        "folds": folds,
        "final_results": results,
        "class_labels": list(class_labels),
        "fold_variance": round(fold_variance, 6),
        "robustness_score": round(robustness_score, 6)
    }
    path = outdir / "summary.json"
    with open(path, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[save_summary_json] Saved to {path}")

# ==============================================================
# SAVE MODELS LOCALLY
# ==============================================================
def save_models(models, meta_model, outdir):
    """Persist every base model and the meta model with joblib."""
    model_dir = os.path.join(outdir, "models")
    os.makedirs(model_dir, exist_ok=True)
    for name, model in models.items():
        joblib.dump(model, os.path.join(model_dir, f"{name}_model.pkl"))
    joblib.dump(meta_model, os.path.join(model_dir, "meta_model.pkl"))
    print(f"[save_models] All base and meta models saved to {model_dir}")
    return model_dir
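
# --------------------------------------------------------------
# Example (illustrative): reloading persisted models for later
# inference. "results/" is a placeholder outdir and X_new a
# placeholder feature frame; the stacked column order must match
# the order used when the meta model was trained.
#
#   base_names = ["xgboost", "catboost", "lightgbm", "adaboost"]
#   base = {k: joblib.load(f"results/models/{k}_model.pkl") for k in base_names}
#   meta = joblib.load("results/models/meta_model.pkl")
#   meta_X = np.column_stack([np.ravel(base[k].predict(X_new)) for k in base_names])
#   y_hat = meta.predict(meta_X)
# --------------------------------------------------------------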

# ==============================================================
# MAIN
# ==============================================================
def main(args):
    start = time.perf_counter()
    df = load_dataset(args.dataset)
    X, y, target, le = prep_data(df, args.target_label)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=42,
        stratify=y if len(np.unique(y)) > 1 else None
    )

    oof_preds, folds = oof_stacking(X_train, y_train, n_folds=args.n_folds)
    meta = train_meta_model(oof_preds, y_train)

    # Refit the base models on the full training split for final evaluation
    models, _, times = train_base_models(X_train, y_train, X_test)
    results = evaluate(models, meta, X_test, y_test, times)

    # === Save models and analysis results ===
    save_models(models, meta, args.outdir)
    save_summary_json(args.outdir, target, len(df), le.classes_, folds, results)

    # === Compute total training and evaluation time ===
    total_time = round(time.perf_counter() - start, 2)
    print(f"\nCompleted in {total_time} sec")

    # Append the total time to the already-written summary.json
    summary_path = Path(args.outdir) / "summary.json"
    if summary_path.exists():
        with open(summary_path, "r+") as f:
            data = json.load(f)
            data["total_train_time_sec"] = total_time
            f.seek(0)
            json.dump(data, f, indent=2)
            f.truncate()
        print(f"[save_summary_json] total_train_time_sec={total_time} saved.")

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--dataset", required=True)
    p.add_argument("--outdir", required=True)
    p.add_argument("--target-label", default=None)
    p.add_argument("--test-size", type=float, default=0.2)
    p.add_argument("--n-folds", type=int, default=5)
    args = p.parse_args()
    main(args)
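
# Example invocation (paths are placeholders):
#   python stacking_ensemble_safe.py --dataset data/train.csv --outdir results \
#       --target-label label --test-size 0.2 --n-folds 5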