import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# ----------------------------
# STREAMLIT APP TITLE
# ----------------------------
# NOTE(review): the original title emoji was mojibake ("๐ง" = UTF-8 read as
# TIS-620); restored to the brain emoji it decodes to.
st.title("🧠 NSL-KDD Anomaly Detection")
st.markdown("""
Compare **Isolation Forest**, **One-Class SVM**, and **Logistic Regression**
for detecting network intrusions using the **NSL-KDD dataset**.
""")
# ----------------------------
# LOAD DATA
# ----------------------------
@st.cache_data
def load_data():
    """Download the NSL-KDD training split and return it as a DataFrame.

    The raw KDDTrain+.txt file ships without a header row, so the canonical
    NSL-KDD column names (41 features + label + difficulty) are supplied
    explicitly. Decorated with st.cache_data so the download runs once per
    session instead of on every rerun.
    """
    source_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
    columns = [
        "duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent",
        "hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login",
        "is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate",
        "same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
        "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty"
    ]
    return pd.read_csv(source_url, names=columns)
df = load_data()
st.write("### Dataset Preview", df.head())
# ----------------------------
# DATA PREPROCESSING
# ----------------------------
# 'difficulty' is metadata about how hard each record is to classify,
# not a network feature — drop it before modeling.
df = df.drop(columns=['difficulty'])
# One-hot encode the three categorical features
categorical_cols = ['protocol_type', 'service', 'flag']
df = pd.get_dummies(df, columns=categorical_cols)
# Binary target: 'normal' traffic -> 0, every attack type -> 1
df['label'] = df['label'].apply(lambda x: 0 if x == 'normal' else 1)
# Split features and labels
X = df.drop(columns=['label'])
y = df['label']
# Standardize to zero mean / unit variance — matters for the SVM especially
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# NOTE(review): this string literal was mojibake-garbled and split across two
# lines in the original (a syntax error); reconstructed with the ✅ emoji.
st.write("✅ Data successfully preprocessed and scaled.")
st.write("Feature shape:", X_scaled.shape)
# ----------------------------
# MODEL TRAINING
# ----------------------------
st.subheader("⚙️ Model Training and Evaluation")
models = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel='rbf', gamma='scale'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}
results = {}
# NOTE(review): all models are fitted and evaluated on the same data
# (no train/test split), so these scores are optimistic — confirm whether
# a held-out split is wanted before trusting the comparison.
for name, model in models.items():
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Unsupervised detectors: fit on features only (labels unseen)
        model.fit(X_scaled)
        preds = model.predict(X_scaled)
        # sklearn outlier convention is -1 = anomaly, +1 = inlier;
        # remap to our label convention (1 = attack, 0 = normal)
        preds = np.where(preds == -1, 1, 0)
    else:
        # Supervised baseline: learns from the labels directly
        model.fit(X_scaled, y)
        preds = model.predict(X_scaled)
    acc = accuracy_score(y, preds)
    prec = precision_score(y, preds)
    rec = recall_score(y, preds)
    f1 = f1_score(y, preds)
    results[name] = [acc, prec, rec, f1]
# ----------------------------
# DISPLAY RESULTS
# ----------------------------
st.write("### 📊 Model Performance Comparison")
# Transpose so each model is a row and each metric a column
results_df = pd.DataFrame(results, index=['Accuracy', 'Precision', 'Recall', 'F1-Score']).T
# Highlight the best score in each metric column
st.dataframe(results_df.style.highlight_max(axis=0, color='lightgreen'))
# ----------------------------
# CONFUSION MATRICES
# ----------------------------
st.write("### 🔍 Confusion Matrices")
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (name, model) in zip(axes, models.items()):
    # Every model was already fitted in the training section above, so we
    # only need to predict here (the original refit the unsupervised models
    # a second time — redundant work with identical results, since
    # IsolationForest is seeded and One-Class SVM fitting is deterministic).
    preds = model.predict(X_scaled)
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Remap sklearn's -1 (anomaly) / +1 (inlier) to 1 (attack) / 0 (normal)
        preds = np.where(preds == -1, 1, 0)
    cm = confusion_matrix(y, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
st.pyplot(fig)
# ----------------------------
# CONCLUSION
# ----------------------------
# NOTE(review): the original markdown contained mojibake (UTF-8 emoji and
# em dashes read as TIS-620); reconstructed with the intended characters.
st.markdown("""
### 🧾 Summary
- **Isolation Forest** and **One-Class SVM** are *unsupervised* models — useful when labels are unknown.
- **Logistic Regression** is *supervised* — it learns from labeled data.
- Typically, Isolation Forest performs better for anomaly detection on high-dimensional data.
📌 Dataset Source: [NSL-KDD Dataset](https://www.unb.ca/cic/datasets/nsl.html)
""")