muddasser committed on
Commit
e8aaee5
·
verified ·
1 Parent(s): 5b93b6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -109
app.py CHANGED
@@ -1,109 +1,133 @@
# ==========================================
# 🧠 NSL-KDD Anomaly Detection Comparison
# Isolation Forest vs One-Class SVM vs Logistic Regression
# ==========================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ---------------------------
# 1️⃣ Load Dataset
# ---------------------------
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"

# BUG FIX: KDDTrain+.txt has 43 columns (41 features + "label" + "difficulty").
# The original list omitted the trailing "difficulty" name, so pandas promoted
# the first data column to the index and shifted everything left — "label" then
# held the integer difficulty score, and the normal/attack mapping below marked
# every single row as an attack. Name the extra column and drop it instead.
col_names = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent",
    "hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login",
    "count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
    "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
    "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate",
    "label","difficulty"
]

df = pd.read_csv(url, names=col_names)
df = df.drop(columns=["difficulty"])  # difficulty score is metadata, not a feature
print("✅ Data Loaded Successfully:", df.shape)
print(df.head())

# ---------------------------
# 2️⃣ Preprocessing
# ---------------------------
# Integer-encode the three symbolic columns. Reusing one LabelEncoder is safe
# because fit_transform refits it from scratch on every column.
cat_cols = ["protocol_type", "service", "flag"]
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Convert labels: normal -> 0, attack -> 1
df["label"] = df["label"].apply(lambda x: 0 if x == "normal" else 1)

X = df.drop("label", axis=1)
y = df["label"]

# Scale to zero mean / unit variance — required for the RBF One-Class SVM and
# helps Logistic Regression converge within max_iter.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# 3️⃣ Train Models
# ---------------------------

# Isolation Forest (unsupervised)
iso = IsolationForest(contamination=0.1, random_state=42)
y_pred_iso = iso.fit_predict(X_scaled)
y_pred_iso = np.where(y_pred_iso == 1, 0, 1)  # 1=normal -> 0, -1=anomaly -> 1

# One-Class SVM (unsupervised)
ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
y_pred_svm = ocsvm.fit_predict(X_scaled)
y_pred_svm = np.where(y_pred_svm == 1, 0, 1)

# Logistic Regression (supervised baseline)
# NOTE(review): fitted and evaluated on the same data below, so its scores
# are in-sample and optimistic.
logreg = LogisticRegression(max_iter=2000)
logreg.fit(X_scaled, y)
y_pred_logreg = logreg.predict(X_scaled)

# ---------------------------
# 4️⃣ Evaluate Results
# ---------------------------
models = {
    "Isolation Forest": y_pred_iso,
    "One-Class SVM": y_pred_svm,
    "Logistic Regression": y_pred_logreg
}

results = {}

for name, preds in models.items():
    acc = accuracy_score(y, preds)
    results[name] = acc
    print(f"\n🔹 {name} Results 🔹")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y, preds))
    print("Classification Report:\n", classification_report(y, preds, target_names=["Normal", "Attack"]))

# ---------------------------
# 5️⃣ Compare Accuracies
# ---------------------------
plt.figure(figsize=(6,4))
sns.barplot(x=list(results.keys()), y=list(results.values()), palette="viridis")
plt.title("Model Accuracy Comparison on NSL-KDD Dataset")
plt.ylabel("Accuracy")
plt.ylim(0,1)
plt.grid(alpha=0.3)
plt.show()

# ---------------------------
# 6️⃣ Summary
# ---------------------------
print("\n✅ Summary:")
for model, acc in results.items():
    print(f"{model}: {acc:.4f}")
print("\nIsolation Forest & One-Class SVM = Unsupervised models")
print("Logistic Regression = Supervised baseline for comparison.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.ensemble import IsolationForest
8
+ from sklearn.svm import OneClassSVM
9
+ from sklearn.linear_model import LogisticRegression
10
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
11
+
12
# ----------------------------
# STREAMLIT APP TITLE
# ----------------------------
APP_TITLE = "🧠 NSL-KDD Anomaly Detection"
INTRO_MD = """
Compare **Isolation Forest**, **One-Class SVM**, and **Logistic Regression**
for detecting network intrusions using the **NSL-KDD dataset**.
"""

# Page header and a short description of what the app compares.
st.title(APP_TITLE)
st.markdown(INTRO_MD)
20
+
21
# ----------------------------
# LOAD DATA
# ----------------------------
@st.cache_data
def load_data():
    """Download the NSL-KDD KDDTrain+ split and return it as a DataFrame.

    Cached by Streamlit so the remote CSV is fetched only once per session.
    The file ships without a header row, so column names (41 features +
    label + difficulty) are supplied explicitly.
    """
    url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
    col_names = [
        "duration","protocol_type","service","flag","src_bytes","dst_bytes",
        "land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
        "num_compromised","root_shell","su_attempted","num_root",
        "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
        "is_host_login","is_guest_login","count","srv_count","serror_rate",
        "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
        "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
        "dst_host_same_srv_rate","dst_host_diff_srv_rate",
        "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
        "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
        "dst_host_srv_rerror_rate","label","difficulty",
    ]
    return pd.read_csv(url, names=col_names)
38
+
39
df = load_data()
st.write("### Dataset Preview", df.head())

# ----------------------------
# DATA PREPROCESSING
# ----------------------------
# The NSL-KDD "difficulty" score is per-record metadata, not a feature.
df = df.drop('difficulty', axis=1)

# One-hot encode the three symbolic columns.
df = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'])

# Binary target: normal = 0, any attack type = 1.
df['label'] = df['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Separate the target from the feature matrix.
y = df['label']
X = df.drop(columns=['label'])

# Standardize so distance/kernel-based models (One-Class SVM) are not
# dominated by large-magnitude byte counters.
X_scaled = StandardScaler().fit_transform(X)

st.write("✅ Data successfully preprocessed and scaled.")
st.write("Feature shape:", X_scaled.shape)
66
# ----------------------------
# MODEL TRAINING
# ----------------------------
st.subheader("⚙️ Model Training and Evaluation")

models = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel='rbf', gamma='scale'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Models fitted without labels; they report anomalies with sklearn's
# -1 (outlier) / +1 (inlier) convention.
UNSUPERVISED_MODELS = {"Isolation Forest", "One-Class SVM"}

# Maps model name -> [accuracy, precision, recall, f1].
results = {}

for name, model in models.items():
    if name in UNSUPERVISED_MODELS:
        model.fit(X_scaled)
        preds = model.predict(X_scaled)
        # Convert -1 (anomaly) to 1 (attack), +1 to 0 (normal)
        preds = np.where(preds == -1, 1, 0)
    else:
        # NOTE(review): supervised model is fitted AND evaluated on the same
        # data, so its scores below are in-sample and optimistic.
        model.fit(X_scaled, y)
        preds = model.predict(X_scaled)

    # zero_division=0 keeps the metrics well-defined (and warning-free)
    # if a model ever predicts a single class; values are unchanged
    # otherwise, since sklearn's default also returns 0 in that case.
    results[name] = [
        accuracy_score(y, preds),
        precision_score(y, preds, zero_division=0),
        recall_score(y, preds, zero_division=0),
        f1_score(y, preds, zero_division=0),
    ]
95
# ----------------------------
# DISPLAY RESULTS
# ----------------------------
st.write("### 📊 Model Performance Comparison")

# One row per model, one column per metric (same layout as building the
# metric-indexed frame and transposing it).
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
results_df = pd.DataFrame.from_dict(results, orient='index', columns=metric_names)

# Highlight the best score within each metric column.
styled = results_df.style.highlight_max(axis=0, color='lightgreen')
st.dataframe(styled)
102
+
103
# ----------------------------
# CONFUSION MATRICES
# ----------------------------
st.write("### 🔍 Confusion Matrices")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (name, model) in zip(axes, models.items()):
    # FIX: all three models were already fitted in the training section, so
    # only predictions are needed here. The original loop called fit() again
    # for the unsupervised models, silently doubling their training cost
    # (results were identical — IsolationForest is seeded with
    # random_state=42 and One-Class SVM fitting is deterministic).
    preds = model.predict(X_scaled)
    if name in ["Isolation Forest", "One-Class SVM"]:
        # Convert -1 (anomaly) to 1 (attack), +1 to 0 (normal)
        preds = np.where(preds == -1, 1, 0)
    cm = confusion_matrix(y, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
st.pyplot(fig)
122
+
123
# ----------------------------
# CONCLUSION
# ----------------------------
# Closing notes rendered as markdown at the bottom of the app.
SUMMARY_MD = """
### 🧾 Summary
- **Isolation Forest** and **One-Class SVM** are *unsupervised* models — useful when labels are unknown.
- **Logistic Regression** is *supervised* — it learns from labeled data.
- Typically, Isolation Forest performs better for anomaly detection on high-dimensional data.

📘 Dataset Source: [NSL-KDD Dataset](https://www.unb.ca/cic/datasets/nsl.html)
"""
st.markdown(SUMMARY_MD)