File size: 4,767 Bytes
e8aaee5
1571ebf
e8aaee5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pandas as pd

import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ----------------------------
# STREAMLIT APP TITLE
# ----------------------------
# NOTE: the original emoji literal was mojibake (UTF-8 bytes decoded as a
# single-byte Thai codepage); restored to the intended character.
st.title("🧠 NSL-KDD Anomaly Detection")
st.markdown("""
Compare **Isolation Forest**, **One-Class SVM**, and **Logistic Regression**  
for detecting network intrusions using the **NSL-KDD dataset**.
""")

# ----------------------------
# LOAD DATA
# ----------------------------
@st.cache_data
def load_data():
    """Download the NSL-KDD training split and return it as a DataFrame.

    The raw file has no header row, so the 43 canonical NSL-KDD column
    names are supplied explicitly. Streamlit caches the result, so the
    download happens at most once per session.
    """
    feature_columns = [
        "duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent",
        "hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login",
        "is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate",
        "same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
        "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty"
    ]
    source_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
    return pd.read_csv(source_url, names=feature_columns)

df = load_data()
st.write("### Dataset Preview", df.head())

# ----------------------------
# DATA PREPROCESSING
# ----------------------------
# 'difficulty' is dataset metadata (classification hardness score), not a
# network feature, so it is excluded from the model inputs.
df = df.drop(columns=['difficulty'])

# One-hot encode the three non-numeric features so every model input is numeric.
categorical_cols = ['protocol_type', 'service', 'flag']
df = pd.get_dummies(df, columns=categorical_cols)

# Collapse the multi-class label into a binary target:
# 'normal' traffic -> 0, any attack type -> 1.
df['label'] = df['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Split features and labels
X = df.drop(columns=['label'])
y = df['label']

# Standardize to zero mean / unit variance — matters especially for the
# kernel-based One-Class SVM.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# NOTE: restored mojibake emoji ("โœ…" was ✅ decoded through a Thai codepage).
st.write("✅ Data successfully preprocessed and scaled.")
st.write("Feature shape:", X_scaled.shape)

# ----------------------------
# MODEL TRAINING
# ----------------------------
# (Emoji restored from mojibake in the original source.)
st.subheader("⚙️ Model Training and Evaluation")

# Two unsupervised anomaly detectors plus one supervised baseline.
models = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel='rbf', gamma='scale'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Maps model name -> [accuracy, precision, recall, f1].
results = {}

for name, model in models.items():
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Unsupervised: fit on features only. sklearn's predict() returns
        # -1 for anomalies and +1 for inliers; remap to attack=1 / normal=0
        # so the labels line up with y for the metrics below.
        model.fit(X_scaled)
        preds = np.where(model.predict(X_scaled) == -1, 1, 0)
    else:
        # Supervised baseline. NOTE: it is fitted and evaluated on the same
        # data, so its scores are optimistic training metrics.
        model.fit(X_scaled, y)
        preds = model.predict(X_scaled)

    results[name] = [
        accuracy_score(y, preds),
        precision_score(y, preds),
        recall_score(y, preds),
        f1_score(y, preds),
    ]

# ----------------------------
# DISPLAY RESULTS
# ----------------------------
# (Emoji restored from mojibake in the original source.)
st.write("### 📊 Model Performance Comparison")

# Transpose so rows = models and columns = metrics; highlight the best
# value in each metric column.
results_df = pd.DataFrame(results, index=['Accuracy', 'Precision', 'Recall', 'F1-Score']).T
st.dataframe(results_df.style.highlight_max(axis=0, color='lightgreen'))

# ----------------------------
# CONFUSION MATRICES
# ----------------------------
# (Emoji restored from mojibake in the original source.)
st.write("### 🔍 Confusion Matrices")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (name, model) in zip(axes, models.items()):
    # Every model was already fitted in the training section above, so only
    # predict here. (The original code refitted the two unsupervised models,
    # needlessly doubling the most expensive step; with a fixed random_state
    # the refit produced identical predictions anyway.)
    preds = model.predict(X_scaled)
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Remap sklearn's -1 (anomaly) / +1 (inlier) to attack=1 / normal=0.
        preds = np.where(preds == -1, 1, 0)
    cm = confusion_matrix(y, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
st.pyplot(fig)

# ----------------------------
# CONCLUSION
# ----------------------------
# NOTE: emoji and em-dash characters below were mojibake in the original
# source (UTF-8 decoded through a Thai codepage); restored here.
st.markdown("""
### 🧾 Summary
- **Isolation Forest** and **One-Class SVM** are *unsupervised* models — useful when labels are unknown.  
- **Logistic Regression** is *supervised* — it learns from labeled data.
- Typically, Isolation Forest performs better for anomaly detection on high-dimensional data.

📘 Dataset Source: [NSL-KDD Dataset](https://www.unb.ca/cic/datasets/nsl.html)
""")