"""Streamlit app: anomaly detection on the NSL-KDD dataset.

Compares two unsupervised outlier detectors (Isolation Forest, One-Class
SVM) against a supervised baseline (Logistic Regression) on the NSL-KDD
training split, reporting accuracy / precision / recall / F1 and a
confusion matrix per model.

NOTE(review): all metrics are computed on the same data the models were
fit on (in-sample). That matches the original script's behavior, but the
numbers are optimistic; a held-out test split would be more honest.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# Raw NSL-KDD training split (comma-separated, no header row).
DATA_URL = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"

# Canonical NSL-KDD column names: 41 features + class label + difficulty score.
COL_NAMES = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login", "count",
    "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty",
]


@st.cache_data
def load_data() -> pd.DataFrame:
    """Download the NSL-KDD training split and attach column names.

    Cached by Streamlit so the download happens once per session, not on
    every script rerun.
    """
    return pd.read_csv(DATA_URL, names=COL_NAMES)


def _preprocess(df: pd.DataFrame):
    """One-hot encode categoricals, binarize labels, scale features.

    Returns:
        (X_scaled, y): standardized feature matrix (ndarray) and a Series
        of binary labels — 0 for 'normal' traffic, 1 for any attack type.
    """
    # 'difficulty' is a per-record hardness score, not a feature.
    df = df.drop(columns=["difficulty"])
    df = pd.get_dummies(df, columns=["protocol_type", "service", "flag"])
    # Vectorized label encoding replaces the original per-row lambda.
    df["label"] = (df["label"] != "normal").astype(int)
    X = df.drop(columns=["label"])
    y = df["label"]
    X_scaled = StandardScaler().fit_transform(X)
    return X_scaled, y


def _fit_and_predict(models: dict, X_scaled, y) -> dict:
    """Fit every model once and return {model name: predicted labels}.

    Fitting happens exactly once per model; the original script refit the
    (very expensive) outlier detectors a second time just to redraw the
    confusion matrices.
    """
    predictions = {}
    for name, model in models.items():
        if isinstance(model, (IsolationForest, OneClassSVM)):
            # Unsupervised: fit on features only.
            model.fit(X_scaled)
            raw = model.predict(X_scaled)
            # sklearn outlier detectors emit -1 for anomalies and +1 for
            # inliers; remap to the dataset convention (1 = attack).
            predictions[name] = np.where(raw == -1, 1, 0)
        else:
            # Supervised baseline uses the labels.
            model.fit(X_scaled, y)
            predictions[name] = model.predict(X_scaled)
    return predictions


def _score(y, predictions: dict) -> dict:
    """Return {model name: [accuracy, precision, recall, f1]}."""
    return {
        name: [
            accuracy_score(y, preds),
            precision_score(y, preds),
            recall_score(y, preds),
            f1_score(y, preds),
        ]
        for name, preds in predictions.items()
    }


def main() -> None:
    """Render the full Streamlit page: data preview, metrics, matrices."""
    st.title("🧠 NSL-KDD Anomaly Detection")
    st.markdown("""
    Compare **Isolation Forest**, **One-Class SVM**, and **Logistic Regression**
    for detecting network intrusions using the **NSL-KDD dataset**.
    """)

    df = load_data()
    st.write("### Dataset Preview", df.head())

    X_scaled, y = _preprocess(df)
    st.write("✅ Data successfully preprocessed and scaled.")
    st.write("Feature shape:", X_scaled.shape)

    st.subheader("⚙️ Model Training and Evaluation")
    models = {
        "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
        "One-Class SVM": OneClassSVM(nu=0.1, kernel="rbf", gamma="scale"),
        "Logistic Regression": LogisticRegression(max_iter=1000),
    }
    predictions = _fit_and_predict(models, X_scaled, y)
    results = _score(y, predictions)

    st.write("### 📊 Model Performance Comparison")
    results_df = pd.DataFrame(
        results, index=["Accuracy", "Precision", "Recall", "F1-Score"]
    ).T
    st.dataframe(results_df.style.highlight_max(axis=0, color="lightgreen"))

    st.write("### 🔍 Confusion Matrices")
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    # Reuse cached predictions — no second (expensive) fit/predict pass.
    for ax, (name, preds) in zip(axes, predictions.items()):
        cm = confusion_matrix(y, preds)
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(name)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
    st.pyplot(fig)

    st.markdown("""
    ### 🧾 Summary
    - **Isolation Forest** and **One-Class SVM** are *unsupervised* models — useful when labels are unknown.
    - **Logistic Regression** is *supervised* — it learns from labeled data.
    - Typically, Isolation Forest performs better for anomaly detection on high-dimensional data.

    📘 Dataset Source: [NSL-KDD Dataset](https://www.unb.ca/cic/datasets/nsl.html)
    """)


# Streamlit executes the script with __name__ == "__main__", so this guard
# works both under `streamlit run` and for safe importing.
if __name__ == "__main__":
    main()