# NSL-KDD anomaly-detection demo (Streamlit app).
import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# ----------------------------
# STREAMLIT APP TITLE
# ----------------------------
# Emoji restored from mojibake ("π§ " was the ISO-8859-7 misdecoding of 🧠).
st.title("🧠 NSL-KDD Anomaly Detection")
st.markdown("""
Compare **Isolation Forest**, **One-Class SVM**, and **Logistic Regression**
for detecting network intrusions using the **NSL-KDD dataset**.
""")
| # ---------------------------- | |
| # LOAD DATA | |
| # ---------------------------- | |
@st.cache_data
def load_data():
    """Download the NSL-KDD training split and return it as a DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per connection record with the 41 NSL-KDD features plus
        the ``label`` (attack name or ``'normal'``) and ``difficulty``
        columns.

    The result is cached with ``st.cache_data`` so the ~18 MB file is
    fetched once per session instead of on every Streamlit rerun.
    """
    url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
    # The raw file has no header row, so the canonical NSL-KDD column
    # names are supplied explicitly.
    col_names = [
        "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
        "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
        "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
        "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
        "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
        "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
        "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty",
    ]
    df = pd.read_csv(url, names=col_names)
    return df
df = load_data()
st.write("### Dataset Preview", df.head())

# ----------------------------
# DATA PREPROCESSING
# ----------------------------
# Drop the difficulty score: it is dataset metadata, not a traffic feature.
df = df.drop(columns=['difficulty'])

# One-hot encode the three categorical features.
categorical_cols = ['protocol_type', 'service', 'flag']
df = pd.get_dummies(df, columns=categorical_cols)

# Encode labels (normal = 0, any attack type = 1) with a vectorized
# comparison instead of a per-row Python lambda.
df['label'] = (df['label'] != 'normal').astype(int)

# Split features and labels.
X = df.drop(columns=['label'])
y = df['label']

# Standardize features (zero mean, unit variance) — important for the
# SVM and Logistic Regression in particular.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

st.write("✅ Data successfully preprocessed and scaled.")
st.write("Feature shape:", X_scaled.shape)
# ----------------------------
# MODEL TRAINING
# ----------------------------
st.subheader("⚙️ Model Training and Evaluation")

models = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel='rbf', gamma='scale'),
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

# NOTE(review): metrics below are computed on the same data the models
# were fitted on (no train/test split), so they are training scores.
results = {}
for name, model in models.items():
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Unsupervised detectors: fit on the features only.
        model.fit(X_scaled)
        preds = model.predict(X_scaled)
        # Convert -1 (anomaly) to 1 (attack), +1 to 0 (normal).
        preds = np.where(preds == -1, 1, 0)
    else:
        # Supervised baseline: fit on features and labels.
        model.fit(X_scaled, y)
        preds = model.predict(X_scaled)
    # zero_division=0 keeps sklearn from emitting UndefinedMetricWarning
    # when a model predicts a single class; the reported value (0.0) is
    # the same as the default behavior.
    results[name] = [
        accuracy_score(y, preds),
        precision_score(y, preds, zero_division=0),
        recall_score(y, preds, zero_division=0),
        f1_score(y, preds, zero_division=0),
    ]
# ----------------------------
# DISPLAY RESULTS
# ----------------------------
st.write("### 📊 Model Performance Comparison")
# Rows = models, columns = metrics; highlight the best score per metric.
results_df = pd.DataFrame(results, index=['Accuracy', 'Precision', 'Recall', 'F1-Score']).T
st.dataframe(results_df.style.highlight_max(axis=0, color='lightgreen'))
# ----------------------------
# CONFUSION MATRICES
# ----------------------------
st.write("### 🔍 Confusion Matrices")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (name, model) in zip(axes, models.items()):
    # Every model was already fitted in the training loop above, so we
    # only predict here — the previous version redundantly refitted the
    # Isolation Forest and One-Class SVM, doubling their runtime for
    # identical results (same data, fixed random_state).
    preds = model.predict(X_scaled)
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Map the detectors' -1/+1 output to the 1/0 attack labels.
        preds = np.where(preds == -1, 1, 0)
    cm = confusion_matrix(y, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
st.pyplot(fig)
# ----------------------------
# CONCLUSION
# ----------------------------
# Mojibake repaired in the markdown below (em dashes and emoji were
# ISO-8859-7 misdecodings of the original UTF-8 characters).
st.markdown("""
### 🧾 Summary
- **Isolation Forest** and **One-Class SVM** are *unsupervised* models — useful when labels are unknown.
- **Logistic Regression** is *supervised* — it learns from labeled data.
- Typically, Isolation Forest performs better for anomaly detection on high-dimensional data.

🔗 Dataset Source: [NSL-KDD Dataset](https://www.unb.ca/cic/datasets/nsl.html)
""")