Spaces:
Configuration error
Configuration error
File size: 5,596 Bytes
7377758 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | """
retrain.py — Firewall Log Classifier
Trains the Random Forest model on Dataset__log2_.csv and saves:
- model.joblib
- scaler.joblib
- label_encoder.joblib
Run: python retrain.py
Requires: pip install scikit-learn imbalanced-learn pandas joblib
"""
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
# ---- Config -----------------------------------------------------------------
# CSV file to train on. The path is relative, so it is resolved against the
# working directory the script is started from.
DATASET_PATH = "Dataset__log2_.csv"

# Column holding the label the classifier learns to predict.
TARGET_COL = "Action"

# Columns used as model inputs; the order here is the column order of the
# feature matrix built from the CSV.
FEATURE_COLS = [
    "Source Port",
    "Destination Port",
    "NAT Source Port",
    "NAT Destination Port",
    "Bytes",
    "Bytes Sent",
    "Bytes Received",
    "Packets",
    "Elapsed Time (sec)",
    "pkts_sent",
    "pkts_received",
]

# Artifacts are written into the directory that contains this script,
# independent of the current working directory.
_script_path = os.path.abspath(__file__)
OUTPUT_DIR = os.path.dirname(_script_path)
# ---- 1. Load ----------------------------------------------------------------
# Announce the file before reading so a failing read is easy to attribute.
print(f"Loading {DATASET_PATH} ...")
df = pd.read_csv(DATASET_PATH)

# Report the raw size and the header actually found in the file.
raw_row_count, raw_columns = df.shape[0], list(df.columns)
print(f" Raw rows: {raw_row_count:,} Columns: {raw_columns}")
# ---- 2. Verify columns exist ------------------------------------------------
# Fail fast with a readable message when the CSV header does not contain every
# feature column and the target column, rather than hitting a KeyError when
# the columns are selected later in the script.
required_cols = [*FEATURE_COLS, TARGET_COL]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    print("\nERROR — these columns are missing from the CSV:")
    for m in missing:
        print(f" '{m}'")
    # Show what the file does contain so a naming mismatch is easy to spot.
    print("\nAvailable columns:", list(df.columns))
    raise SystemExit(1)
# ---- 3. Preprocessing -------------------------------------------------------
# Remove rows that are duplicated across all columns, then report what is left.
df = df.drop_duplicates()
print(f" After dedup: {df.shape[0]:,} rows")

# No IQR outlier filtering is applied: it eliminates the rare reset-both class
# entirely. Class imbalance is handled by SMOTE further down instead.
y = df[TARGET_COL].values
X = df[FEATURE_COLS].values

# Per-class row counts, most frequent class first.
class_counts = pd.Series(y).value_counts()
print(f" Class distribution:\n{class_counts.to_string()}")
# ---- 4. Encode labels -------------------------------------------------------
# Learn the label -> integer mapping first, then apply it. The fitted encoder
# is saved at the end of the script together with the model.
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)
print(f" Classes: {list(le.classes_)}")
# ---- 5. Train / test split --------------------------------------------------
# Hold out 30% of the rows for testing. Stratifying on the encoded labels keeps
# the class proportions similar in both parts; the seed makes the split
# repeatable.
split_options = {"test_size": 0.30, "stratify": y_enc, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)
print(f" Train: {X_train.shape[0]:,} Test: {X_test.shape[0]:,}")
# ---- 6. SMOTE ---------------------------------------------------------------
# Oversample the training split only; the test split is never resampled.
print("Applying SMOTE ...")

# SMOTE builds synthetic samples from a point and its nearest same-class
# neighbours, so each resampled class needs more than `k_neighbors` training
# rows. With the library default of 5, a very rare class makes fit_resample
# fail with an opaque nearest-neighbours ValueError. Cap k at what the rarest
# class can support; when every class has at least 6 rows this is still 5.
_, train_class_sizes = np.unique(y_train, return_counts=True)
rarest_class_size = int(train_class_sizes.min())
if rarest_class_size < 2:
    # A single sample has no neighbour to interpolate towards.
    raise SystemExit(
        "ERROR: SMOTE needs at least 2 training samples in every class, "
        f"but the rarest class has {rarest_class_size}."
    )
smote_k = min(5, rarest_class_size - 1)
if smote_k < 5:
    print(
        f" Rarest class has {rarest_class_size} training samples; "
        f"using k_neighbors={smote_k}"
    )

sm = SMOTE(random_state=42, k_neighbors=smote_k)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print(f" After SMOTE train size: {len(X_train_res):,}")
# ---- 7. Scale ---------------------------------------------------------------
# The scaler is fitted on the resampled training data only; the same fitted
# scaler is then applied to both the training and the held-out test features.
scaler = StandardScaler()
scaler.fit(X_train_res)
X_train_sc = scaler.transform(X_train_res)
X_test_sc = scaler.transform(X_test)
# ---- 8. Train tuned Random Forest -------------------------------------------
print("Training Random Forest (n_estimators=200, max_depth=20) ...")

# Hyper-parameters are collected in one place; the fixed seed keeps training
# repeatable and n_jobs=-1 asks scikit-learn to use all available cores.
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 2,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)

# Fit on the scaled, SMOTE-resampled training data.
rf.fit(X_train_sc, y_train_res)
# ---- 9. Evaluate ------------------------------------------------------------
# Score the model on the held-out (scaled, not resampled) test split.
test_predictions = rf.predict(X_test_sc)
accuracy = accuracy_score(y_test, test_predictions)
macro_f1 = f1_score(y_test, test_predictions, average="macro")

print(f"\nTest Accuracy : {accuracy*100:.2f}%")
print(f"Macro F1 : {macro_f1:.4f}")

# Per-class breakdown, labelled with the original class names from the encoder.
report_text = classification_report(
    y_test, test_predictions, target_names=le.classes_
)
print("\nClassification Report:")
print(report_text)
# ---- 10. Save artifacts -----------------------------------------------------
# File name -> fitted object. Dict order is the order the files are written
# and listed: model, scaler, label encoder.
artifacts = {
    "model.joblib": rf,
    "scaler.joblib": scaler,
    "label_encoder.joblib": le,
}

written_paths = []
for file_name, fitted_obj in artifacts.items():
    destination = os.path.join(OUTPUT_DIR, file_name)
    joblib.dump(fitted_obj, destination)
    written_paths.append(destination)

print("\nSaved:")
for destination in written_paths:
    print(f" {destination}")
print("\nDone! Upload these 3 files to your repo alongside app.py.")