File size: 4,767 Bytes
e8aaee5
1571ebf
e8aaee5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pandas as pd

import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ----------------------------
# STREAMLIT APP TITLE
# ----------------------------
# NOTE: the original emoji literal was mojibake (UTF-8 bytes decoded as a
# single-byte Thai codepage); restored to the intended character.
st.title("🧠 NSL-KDD Anomaly Detection")
st.markdown("""
Compare **Isolation Forest**, **One-Class SVM**, and **Logistic Regression**  
for detecting network intrusions using the **NSL-KDD dataset**.
""")

# ----------------------------
# LOAD DATA
# ----------------------------
@st.cache_data
def load_data():
    """Download the NSL-KDD training split and return it as a DataFrame.

    The raw file has no header row, so the 43 canonical NSL-KDD column
    names are supplied explicitly. Streamlit caches the result, so the
    download happens at most once per session.
    """
    feature_columns = [
        "duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent",
        "hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login",
        "is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate",
        "same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
        "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty"
    ]
    source_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
    return pd.read_csv(source_url, names=feature_columns)

df = load_data()
st.write("### Dataset Preview", df.head())

# ----------------------------
# DATA PREPROCESSING
# ----------------------------
# 'difficulty' is dataset metadata (classification hardness score), not a
# network feature, so it is excluded from the model inputs.
df = df.drop(columns=['difficulty'])

# One-hot encode the three non-numeric features so every model input is numeric.
categorical_cols = ['protocol_type', 'service', 'flag']
df = pd.get_dummies(df, columns=categorical_cols)

# Collapse the multi-class label into a binary target:
# 'normal' traffic -> 0, any attack type -> 1.
df['label'] = df['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Split features and labels
X = df.drop(columns=['label'])
y = df['label']

# Standardize to zero mean / unit variance — matters especially for the
# kernel-based One-Class SVM.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# NOTE: restored mojibake emoji ("โœ…" was ✅ decoded through a Thai codepage).
st.write("✅ Data successfully preprocessed and scaled.")
st.write("Feature shape:", X_scaled.shape)

# ----------------------------
# MODEL TRAINING
# ----------------------------
# (Emoji restored from mojibake in the original source.)
st.subheader("⚙️ Model Training and Evaluation")

# Two unsupervised anomaly detectors plus one supervised baseline.
models = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel='rbf', gamma='scale'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Maps model name -> [accuracy, precision, recall, f1].
results = {}

for name, model in models.items():
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Unsupervised: fit on features only. sklearn's predict() returns
        # -1 for anomalies and +1 for inliers; remap to attack=1 / normal=0
        # so the labels line up with y for the metrics below.
        model.fit(X_scaled)
        preds = np.where(model.predict(X_scaled) == -1, 1, 0)
    else:
        # Supervised baseline. NOTE: it is fitted and evaluated on the same
        # data, so its scores are optimistic training metrics.
        model.fit(X_scaled, y)
        preds = model.predict(X_scaled)

    results[name] = [
        accuracy_score(y, preds),
        precision_score(y, preds),
        recall_score(y, preds),
        f1_score(y, preds),
    ]

# ----------------------------
# DISPLAY RESULTS
# ----------------------------
# (Emoji restored from mojibake in the original source.)
st.write("### 📊 Model Performance Comparison")

# Transpose so rows = models and columns = metrics; highlight the best
# value in each metric column.
results_df = pd.DataFrame(results, index=['Accuracy', 'Precision', 'Recall', 'F1-Score']).T
st.dataframe(results_df.style.highlight_max(axis=0, color='lightgreen'))

# ----------------------------
# CONFUSION MATRICES
# ----------------------------
# (Emoji restored from mojibake in the original source.)
st.write("### 🔍 Confusion Matrices")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (name, model) in zip(axes, models.items()):
    # Every model was already fitted in the training section above, so only
    # predict here. (The original code refitted the two unsupervised models,
    # needlessly doubling the most expensive step; with a fixed random_state
    # the refit produced identical predictions anyway.)
    preds = model.predict(X_scaled)
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Remap sklearn's -1 (anomaly) / +1 (inlier) to attack=1 / normal=0.
        preds = np.where(preds == -1, 1, 0)
    cm = confusion_matrix(y, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
st.pyplot(fig)

# ----------------------------
# CONCLUSION
# ----------------------------
# NOTE: emoji and em-dash characters below were mojibake in the original
# source (UTF-8 decoded through a Thai codepage); restored here.
st.markdown("""
### 🧾 Summary
- **Isolation Forest** and **One-Class SVM** are *unsupervised* models — useful when labels are unknown.  
- **Logistic Regression** is *supervised* — it learns from labeled data.
- Typically, Isolation Forest performs better for anomaly detection on high-dimensional data.

📘 Dataset Source: [NSL-KDD Dataset](https://www.unb.ca/cic/datasets/nsl.html)
""")