muddasser committed on
Commit
e8aaee5
·
verified ·
1 Parent(s): 5b93b6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -109
app.py CHANGED
@@ -1,109 +1,133 @@
# ==========================================
# 🧠 NSL-KDD Anomaly Detection Comparison
# Isolation Forest vs One-Class SVM vs Logistic Regression
# ==========================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ---------------------------
# 1️⃣ Load Dataset
# ---------------------------
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"

# BUG FIX: KDDTrain+.txt has 43 columns (41 features + "label" + "difficulty").
# The original list omitted the trailing "difficulty" name, so pandas promoted
# the first data column to the index and shifted everything left — "label" then
# held the integer difficulty score, and the normal/attack mapping below marked
# every single row as an attack. Name the extra column and drop it instead.
col_names = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent",
    "hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login",
    "count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
    "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
    "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate",
    "label","difficulty"
]

df = pd.read_csv(url, names=col_names)
df = df.drop(columns=["difficulty"])  # difficulty score is metadata, not a feature
print("✅ Data Loaded Successfully:", df.shape)
print(df.head())

# ---------------------------
# 2️⃣ Preprocessing
# ---------------------------
# Integer-encode the three symbolic columns. Reusing one LabelEncoder is safe
# because fit_transform refits it from scratch on every column.
cat_cols = ["protocol_type", "service", "flag"]
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Convert labels: normal -> 0, attack -> 1
df["label"] = df["label"].apply(lambda x: 0 if x == "normal" else 1)

X = df.drop("label", axis=1)
y = df["label"]

# Scale to zero mean / unit variance — required for the RBF One-Class SVM and
# helps Logistic Regression converge within max_iter.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# 3️⃣ Train Models
# ---------------------------

# Isolation Forest (unsupervised)
iso = IsolationForest(contamination=0.1, random_state=42)
y_pred_iso = iso.fit_predict(X_scaled)
y_pred_iso = np.where(y_pred_iso == 1, 0, 1)  # 1=normal -> 0, -1=anomaly -> 1

# One-Class SVM (unsupervised)
ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
y_pred_svm = ocsvm.fit_predict(X_scaled)
y_pred_svm = np.where(y_pred_svm == 1, 0, 1)

# Logistic Regression (supervised baseline)
# NOTE(review): fitted and evaluated on the same data below, so its scores
# are in-sample and optimistic.
logreg = LogisticRegression(max_iter=2000)
logreg.fit(X_scaled, y)
y_pred_logreg = logreg.predict(X_scaled)

# ---------------------------
# 4️⃣ Evaluate Results
# ---------------------------
models = {
    "Isolation Forest": y_pred_iso,
    "One-Class SVM": y_pred_svm,
    "Logistic Regression": y_pred_logreg
}

results = {}

for name, preds in models.items():
    acc = accuracy_score(y, preds)
    results[name] = acc
    print(f"\n🔹 {name} Results 🔹")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y, preds))
    print("Classification Report:\n", classification_report(y, preds, target_names=["Normal", "Attack"]))

# ---------------------------
# 5️⃣ Compare Accuracies
# ---------------------------
plt.figure(figsize=(6,4))
sns.barplot(x=list(results.keys()), y=list(results.values()), palette="viridis")
plt.title("Model Accuracy Comparison on NSL-KDD Dataset")
plt.ylabel("Accuracy")
plt.ylim(0,1)
plt.grid(alpha=0.3)
plt.show()

# ---------------------------
# 6️⃣ Summary
# ---------------------------
print("\n✅ Summary:")
for model, acc in results.items():
    print(f"{model}: {acc:.4f}")
print("\nIsolation Forest & One-Class SVM = Unsupervised models")
print("Logistic Regression = Supervised baseline for comparison.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.ensemble import IsolationForest
8
+ from sklearn.svm import OneClassSVM
9
+ from sklearn.linear_model import LogisticRegression
10
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
11
+
12
# ----------------------------
# STREAMLIT APP TITLE
# ----------------------------
APP_TITLE = "🧠 NSL-KDD Anomaly Detection"
INTRO_MD = """
Compare **Isolation Forest**, **One-Class SVM**, and **Logistic Regression**
for detecting network intrusions using the **NSL-KDD dataset**.
"""

# Page header and a short description of what the app compares.
st.title(APP_TITLE)
st.markdown(INTRO_MD)
20
+
21
# ----------------------------
# LOAD DATA
# ----------------------------
@st.cache_data
def load_data():
    """Download the NSL-KDD KDDTrain+ split and return it as a DataFrame.

    Cached by Streamlit so the remote CSV is fetched only once per session.
    The file ships without a header row, so column names (41 features +
    label + difficulty) are supplied explicitly.
    """
    url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
    col_names = [
        "duration","protocol_type","service","flag","src_bytes","dst_bytes",
        "land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
        "num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
        "is_host_login","is_guest_login","count","srv_count","serror_rate",
        "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
        "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate",
        "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
        "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
        "dst_host_srv_rerror_rate","label","difficulty",
    ]
    return pd.read_csv(url, names=col_names)
38
+
39
df = load_data()
st.write("### Dataset Preview", df.head())

# ----------------------------
# DATA PREPROCESSING
# ----------------------------
# The NSL-KDD "difficulty" score is per-record metadata, not a feature.
df = df.drop('difficulty', axis=1)

# One-hot encode the three symbolic columns.
df = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'])

# Binary target: normal = 0, any attack type = 1.
df['label'] = df['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Separate the target from the feature matrix.
y = df['label']
X = df.drop(columns=['label'])

# Standardize so distance/kernel-based models (One-Class SVM) are not
# dominated by large-magnitude byte counters.
X_scaled = StandardScaler().fit_transform(X)

st.write("✅ Data successfully preprocessed and scaled.")
st.write("Feature shape:", X_scaled.shape)
66
# ----------------------------
# MODEL TRAINING
# ----------------------------
st.subheader("⚙️ Model Training and Evaluation")

models = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel='rbf', gamma='scale'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Models fitted without labels; they report anomalies with sklearn's
# -1 (outlier) / +1 (inlier) convention.
UNSUPERVISED_MODELS = {"Isolation Forest", "One-Class SVM"}

# Maps model name -> [accuracy, precision, recall, f1].
results = {}

for name, model in models.items():
    if name in UNSUPERVISED_MODELS:
        model.fit(X_scaled)
        preds = model.predict(X_scaled)
        # Convert -1 (anomaly) to 1 (attack), +1 to 0 (normal)
        preds = np.where(preds == -1, 1, 0)
    else:
        # NOTE(review): supervised model is fitted AND evaluated on the same
        # data, so its scores below are in-sample and optimistic.
        model.fit(X_scaled, y)
        preds = model.predict(X_scaled)

    # zero_division=0 keeps the metrics well-defined (and warning-free)
    # if a model ever predicts a single class; values are unchanged
    # otherwise, since sklearn's default also returns 0 in that case.
    results[name] = [
        accuracy_score(y, preds),
        precision_score(y, preds, zero_division=0),
        recall_score(y, preds, zero_division=0),
        f1_score(y, preds, zero_division=0),
    ]
95
# ----------------------------
# DISPLAY RESULTS
# ----------------------------
st.write("### 📊 Model Performance Comparison")

# One row per model, one column per metric (same layout as building the
# metric-indexed frame and transposing it).
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
results_df = pd.DataFrame.from_dict(results, orient='index', columns=metric_names)

# Highlight the best score within each metric column.
styled = results_df.style.highlight_max(axis=0, color='lightgreen')
st.dataframe(styled)
102
+
103
# ----------------------------
# CONFUSION MATRICES
# ----------------------------
st.write("### 🔍 Confusion Matrices")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (name, model) in zip(axes, models.items()):
    # FIX: all three models were already fitted in the training section, so
    # only predictions are needed here. The original loop called fit() again
    # for the unsupervised models, silently doubling their training cost
    # (results were identical — IsolationForest is seeded with
    # random_state=42 and One-Class SVM fitting is deterministic).
    preds = model.predict(X_scaled)
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Convert -1 (anomaly) to 1 (attack), +1 to 0 (normal)
        preds = np.where(preds == -1, 1, 0)
    cm = confusion_matrix(y, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
st.pyplot(fig)
122
+
123
# ----------------------------
# CONCLUSION
# ----------------------------
# Closing notes rendered as markdown at the bottom of the app.
SUMMARY_MD = """
### 🧾 Summary
- **Isolation Forest** and **One-Class SVM** are *unsupervised* models — useful when labels are unknown.
- **Logistic Regression** is *supervised* — it learns from labeled data.
- Typically, Isolation Forest performs better for anomaly detection on high-dimensional data.

📘 Dataset Source: [NSL-KDD Dataset](https://www.unb.ca/cic/datasets/nsl.html)
"""
st.markdown(SUMMARY_MD)