yomnafarag95 committed on
Commit
7377758
·
verified ·
1 Parent(s): 72ad17f

Upload 6 files

Browse files
Files changed (4) hide show
  1. label_encoder.joblib +2 -2
  2. model.joblib +2 -2
  3. retrain.py +123 -0
  4. scaler.joblib +2 -2
label_encoder.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfe75e6d2a847020f3ca10fb09c74e2982dbf85616ef5c310225a4c13c6cee38
3
- size 508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e85e8150b11b1cb1427578abea1ea4bf17dfff23a3ad701bc0f7ddd8f91db1cc
3
+ size 507
model.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27dfde6db68fa8cabdb5ddaec8257b094c6d03d921103446cd1a113dea00bdfc
3
- size 8236641
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd5e3179a6b0ccd2bbb8a63d58459ab436254c8d88a60eb2e26e68c5f98205b4
3
+ size 43222057
retrain.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ retrain.py — Firewall Log Classifier
3
+ Trains the Random Forest model on Dataset__log2_.csv and saves:
4
+ - model.joblib
5
+ - scaler.joblib
6
+ - label_encoder.joblib
7
+
8
+ Run: python retrain.py
9
+ Requires: pip install scikit-learn imbalanced-learn pandas joblib
10
+ """
11
+
12
+ import os
13
+ import pandas as pd
14
+ import numpy as np
15
+ import joblib
16
+ from sklearn.ensemble import RandomForestClassifier
17
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
18
+ from sklearn.model_selection import train_test_split
19
+ from sklearn.metrics import accuracy_score, f1_score, classification_report
20
+ from imblearn.over_sampling import SMOTE
21
+
22
+ # ── Config ────────────────────────────────────────────────────────────────────
23
+ DATASET_PATH = "Dataset__log2_.csv"
24
+
25
+ FEATURE_COLS = [
26
+ "Source Port",
27
+ "Destination Port",
28
+ "NAT Source Port",
29
+ "NAT Destination Port",
30
+ "Bytes",
31
+ "Bytes Sent",
32
+ "Bytes Received",
33
+ "Packets",
34
+ "Elapsed Time (sec)",
35
+ "pkts_sent",
36
+ "pkts_received",
37
+ ]
38
+
39
+ TARGET_COL = "Action"
40
+
41
+ OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
42
+
43
+ # ── 1. Load ───────────────────────────────────────────────────────────────────
44
+ print(f"Loading {DATASET_PATH} ...")
45
+ df = pd.read_csv(DATASET_PATH)
46
+ print(f" Raw rows: {len(df):,} Columns: {list(df.columns)}")
47
+
48
+ # ── 2. Verify columns exist ───────────────────────────────────────────────────
49
+ missing = [c for c in FEATURE_COLS + [TARGET_COL] if c not in df.columns]
50
+ if missing:
51
+ print("\nERROR β€” these columns are missing from the CSV:")
52
+ for m in missing:
53
+ print(f" '{m}'")
54
+ print("\nAvailable columns:", list(df.columns))
55
+ raise SystemExit(1)
56
+
57
+ # ── 3. Preprocessing ──────────────────────────────────────────────────────────
58
+ df.drop_duplicates(inplace=True)
59
+ print(f" After dedup: {len(df):,} rows")
60
+ # IQR filtering skipped — it eliminates the rare reset-both class entirely.
61
+ # SMOTE handles class imbalance instead.
62
+
63
+ X = df[FEATURE_COLS].values
64
+ y = df[TARGET_COL].values
65
+
66
+ print(f" Class distribution:\n{pd.Series(y).value_counts().to_string()}")
67
+
68
+ # ── 4. Encode labels ──────────────────────────────────────────────────────────
69
+ le = LabelEncoder()
70
+ y_enc = le.fit_transform(y)
71
+ print(f" Classes: {list(le.classes_)}")
72
+
73
+ # ── 5. Train / test split ─────────────────────────────────────────────────────
74
+ X_train, X_test, y_train, y_test = train_test_split(
75
+ X, y_enc, test_size=0.30, stratify=y_enc, random_state=42
76
+ )
77
+ print(f" Train: {len(X_train):,} Test: {len(X_test):,}")
78
+
79
+ # ── 6. SMOTE ──────────────────────────────────────────────────────────────────
80
+ print("Applying SMOTE ...")
81
+ sm = SMOTE(random_state=42)
82
+ X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
83
+ print(f" After SMOTE train size: {len(X_train_res):,}")
84
+
85
+ # ── 7. Scale ──────────────────────────────────────────────────────────────────
86
+ scaler = StandardScaler()
87
+ X_train_sc = scaler.fit_transform(X_train_res)
88
+ X_test_sc = scaler.transform(X_test)
89
+
90
+ # ── 8. Train tuned Random Forest ──────────────────────────────────────────────
91
+ print("Training Random Forest (n_estimators=200, max_depth=20) ...")
92
+ rf = RandomForestClassifier(
93
+ n_estimators=200,
94
+ max_depth=20,
95
+ min_samples_split=2,
96
+ random_state=42,
97
+ n_jobs=-1,
98
+ )
99
+ rf.fit(X_train_sc, y_train_res)
100
+
101
+ # ── 9. Evaluate ───────────────────────────────────────────────────────────────
102
+ y_pred = rf.predict(X_test_sc)
103
+ acc = accuracy_score(y_test, y_pred)
104
+ mf1 = f1_score(y_test, y_pred, average="macro")
105
+ print(f"\nTest Accuracy : {acc*100:.2f}%")
106
+ print(f"Macro F1 : {mf1:.4f}")
107
+ print("\nClassification Report:")
108
+ print(classification_report(y_test, y_pred, target_names=le.classes_))
109
+
110
+ # ── 10. Save artifacts ────────────────────────────────────────────────────────
111
+ model_path = os.path.join(OUTPUT_DIR, "model.joblib")
112
+ scaler_path = os.path.join(OUTPUT_DIR, "scaler.joblib")
113
+ le_path = os.path.join(OUTPUT_DIR, "label_encoder.joblib")
114
+
115
+ joblib.dump(rf, model_path)
116
+ joblib.dump(scaler, scaler_path)
117
+ joblib.dump(le, le_path)
118
+
119
+ print(f"\nSaved:")
120
+ print(f" {model_path}")
121
+ print(f" {scaler_path}")
122
+ print(f" {le_path}")
123
+ print("\nDone! Upload these 3 files to your repo alongside app.py.")
scaler.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05ae874220fa3824577365214cfebea1e6b3cdca775a06bec5f74abe097feeb5
3
- size 863
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d52e80c3f5709cb89dd6358602bdf5574ed6fe350d5f0504c7e8d0ecd7314f68
3
+ size 831