File size: 5,596 Bytes
7377758
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""

retrain.py — Firewall Log Classifier

Trains the Random Forest model on Dataset__log2_.csv and saves:

  - model.joblib

  - scaler.joblib

  - label_encoder.joblib



Run:  python retrain.py

Requires: pip install scikit-learn imbalanced-learn pandas joblib

"""

import os
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# ── Config ────────────────────────────────────────────────────────────────────
# CSV file the model is trained on (a relative path).
DATASET_PATH = "Dataset__log2_.csv"

# Input features, grouped by meaning. The concatenation order below is the
# column order of the feature matrix built from FEATURE_COLS.
_PORT_FEATURES = [
    "Source Port",
    "Destination Port",
    "NAT Source Port",
    "NAT Destination Port",
]
_VOLUME_FEATURES = [
    "Bytes",
    "Bytes Sent",
    "Bytes Received",
    "Packets",
    "Elapsed Time (sec)",
    "pkts_sent",
    "pkts_received",
]
FEATURE_COLS = _PORT_FEATURES + _VOLUME_FEATURES

# Column holding the label the classifier predicts.
TARGET_COL = "Action"

# Artifacts are written to the directory that contains this script.
_SCRIPT_FILE = os.path.abspath(__file__)
OUTPUT_DIR = os.path.dirname(_SCRIPT_FILE)

# ── 1. Load ───────────────────────────────────────────────────────────────────
# Announce the source file first so a failed read is easy to attribute, then
# pull the whole CSV into a DataFrame and report its size and header.
print(f"Loading {DATASET_PATH} ...")
df = pd.read_csv(DATASET_PATH)
print(
    f"  Raw rows: {df.shape[0]:,}"
    f"   Columns: {list(df.columns)}"
)

# ── 2. Verify columns exist ───────────────────────────────────────────────────
# Fail fast with a readable message if the CSV lacks any feature column or the
# target column, instead of hitting a KeyError during column selection later.
required_cols = [*FEATURE_COLS, TARGET_COL]
missing = [col for col in required_cols if col not in df.columns]
if missing:
    # The separator is a real em-dash; the previous literal contained the
    # mis-decoded byte sequence and printed garbage to the user.
    print("\nERROR — these columns are missing from the CSV:")
    for col in missing:
        print(f"  '{col}'")
    # Show what the file does contain so a renamed or misspelled header is
    # easy to spot.
    print("\nAvailable columns:", list(df.columns))
    raise SystemExit(1)

# ── 3. Preprocessing ──────────────────────────────────────────────────────────
# Drop rows that are exact duplicates across all columns.
df = df.drop_duplicates()
print(f"  After dedup: {len(df):,} rows")

# IQR filtering is skipped on purpose — it eliminates the rare reset-both
# class entirely. Class imbalance is handled by SMOTE instead.

# Feature matrix (columns in FEATURE_COLS order) and raw label vector.
y = df[TARGET_COL].values
X = df.loc[:, FEATURE_COLS].values

class_counts = pd.Series(y).value_counts()
print(f"  Class distribution:\n{class_counts.to_string()}")

# ── 4. Encode labels ──────────────────────────────────────────────────────────
# Map the raw label values to integer codes; the fitted encoder is saved later
# so predictions can be mapped back to label names.
le = LabelEncoder()
y_enc = le.fit_transform(y)
print(f"  Classes: {list(le.classes_)}")

# ── 5. Train / test split ─────────────────────────────────────────────────────
# 70/30 split, stratified on the encoded labels, with a fixed seed for
# reproducibility.
split_options = {
    "test_size": 0.30,
    "stratify": y_enc,
    "random_state": 42,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)
print(f"  Train: {X_train.shape[0]:,}   Test: {X_test.shape[0]:,}")

# ── 6. SMOTE ──────────────────────────────────────────────────────────────────
# Oversample the minority classes of the training split only; the test split
# is not resampled.
print("Applying SMOTE ...")

# SMOTE synthesises each new sample from a point and its k nearest neighbours
# of the same class, so k can be at most (class size - 1). With the library
# default of k=5, fit_resample fails when an oversampled class has fewer than
# 6 training rows. Cap k by the rarest class that will actually be oversampled
# (every class smaller than the largest one); when all such classes have at
# least 6 rows this is identical to the default.
SMOTE_DEFAULT_K = 5
_, train_class_counts = np.unique(y_train, return_counts=True)
oversampled_counts = train_class_counts[
    train_class_counts < train_class_counts.max()
]
smote_k = SMOTE_DEFAULT_K
if oversampled_counts.size:
    rarest = int(oversampled_counts.min())
    if rarest < 2:
        # A single sample has no same-class neighbour to interpolate with.
        print(
            "\nERROR: a class has only 1 training sample; "
            "SMOTE needs at least 2 samples per oversampled class."
        )
        raise SystemExit(1)
    smote_k = min(SMOTE_DEFAULT_K, rarest - 1)
    if smote_k < SMOTE_DEFAULT_K:
        print(
            f"  Rarest class has {rarest} training rows; "
            f"using k_neighbors={smote_k}"
        )

sm = SMOTE(random_state=42, k_neighbors=smote_k)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print(f"  After SMOTE train size: {len(X_train_res):,}")

# ── 7. Scale ──────────────────────────────────────────────────────────────────
# Fit the scaler on the resampled training features only, then apply the same
# fitted transform to both splits so the test data never influences it.
scaler = StandardScaler().fit(X_train_res)
X_train_sc, X_test_sc = (
    scaler.transform(features) for features in (X_train_res, X_test)
)

# ── 8. Train tuned Random Forest ──────────────────────────────────────────────
# Hyperparameters live in one mapping so the progress message and the model
# are built from the same values.
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 2,
    "random_state": 42,  # fixed seed for reproducible forests
    "n_jobs": -1,        # use every available CPU core
}
print(
    "Training Random Forest "
    f"(n_estimators={rf_params['n_estimators']}, "
    f"max_depth={rf_params['max_depth']}) ..."
)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_sc, y_train_res)

# ── 9. Evaluate ───────────────────────────────────────────────────────────────
# Score the model on the scaled hold-out split.
y_pred = rf.predict(X_test_sc)
acc = accuracy_score(y_test, y_pred)
mf1 = f1_score(y_test, y_pred, average="macro")

# Headline metrics first, then the per-class breakdown labelled with the
# original class names from the label encoder.
headline = [
    f"\nTest Accuracy : {acc * 100:.2f}%",
    f"Macro F1      : {mf1:.4f}",
    "\nClassification Report:",
]
print("\n".join(headline))
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

# ── 10. Save artifacts ────────────────────────────────────────────────────────
# All three files are written into OUTPUT_DIR.
model_path, scaler_path, le_path = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("model.joblib", "scaler.joblib", "label_encoder.joblib")
)

# Persist the classifier, the fitted scaler and the label encoder, in that
# order.
for artifact, destination in (
    (rf, model_path),
    (scaler, scaler_path),
    (le, le_path),
):
    joblib.dump(artifact, destination)

print(
    "\nSaved:",
    *(f"  {destination}" for destination in (model_path, scaler_path, le_path)),
    sep="\n",
)
print("\nDone! Upload these 3 files to your repo alongside app.py.")