Spaces:

MrUtakata
/

ids

Sleeping

App Files Files Community

MrUtakata committed on Apr 12, 2025

Commit

95c089b

verified ·

1 Parent(s): 46a856e

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -17

app.py CHANGED Viewed

@@ -1,17 +1,28 @@
 import streamlit as st
 import pandas as pd
 import gdown
 @st.cache_data
 def load_dataset_view():
-    # File IDs
     NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
     NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
     NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
     NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
     NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'
-    # Download URLs constructed with Google Drive sharing link format
     urls = {
         'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
         'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
@@ -20,40 +31,144 @@ def load_dataset_view():
         'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
     }
-    # Download all necessary files
     for filename, url in urls.items():
         st.write(f"Downloading {filename}...")
         gdown.download(url, filename, quiet=True)
-    # Load features to assign proper column names
     NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')
-    # Load datasets
     NB15_1 = pd.read_csv('NB15_1.csv', dtype=str, low_memory=False)
     NB15_2 = pd.read_csv('NB15_2.csv', dtype=str, low_memory=False)
     NB15_3 = pd.read_csv('NB15_3.csv', dtype=str, low_memory=False)
     NB15_4 = pd.read_csv('NB15_4.csv', dtype=str, low_memory=False)
-    # Assign feature names to each dataset
     NB15_1.columns = NB15_features['Name']
     NB15_2.columns = NB15_features['Name']
     NB15_3.columns = NB15_features['Name']
     NB15_4.columns = NB15_features['Name']
-    # Concatenate the datasets into a single DataFrame for a full view
     train_df = pd.concat([NB15_1, NB15_2, NB15_3, NB15_4], ignore_index=True)
     return train_df
-# --- Streamlit UI for "Intrusion Detection System" Dataset View ---
-st.title("Intrusion Detection System")
-st.header("Dataset View")
-df = load_dataset_view()
-# Display general information about the dataset
-st.write("**Dataset Columns:**", df.columns.tolist())
-st.write("**Dataset Shape:**", df.shape)
-# Display a sample of the dataset
-st.subheader("First 10 Rows of the Dataset")
-st.dataframe(df.head(10))

 import streamlit as st
 import pandas as pd
 import gdown
+import pickle
+import joblib
+import xgboost as xgb
+##############################################
+# Helper Functions
+##############################################
 @st.cache_data
 def load_dataset_view():
+    """
+    Downloads the NB15 datasets and constructs a view of the raw training data.
+    Useful for inspecting how the system was built.
+    """
+    # File IDs (NB15 and NB15_features)
     NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
     NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
     NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
     NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
     NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'
+    # Construct download URLs using Google Drive links
     urls = {
         'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
         'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
         'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
     }
+    # Download each file (progress messages help track the process)
     for filename, url in urls.items():
         st.write(f"Downloading {filename}...")
         gdown.download(url, filename, quiet=True)
+    # Load NB15_features to assign proper column names
     NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')
+    # Load NB15 datasets as strings
     NB15_1 = pd.read_csv('NB15_1.csv', dtype=str, low_memory=False)
     NB15_2 = pd.read_csv('NB15_2.csv', dtype=str, low_memory=False)
     NB15_3 = pd.read_csv('NB15_3.csv', dtype=str, low_memory=False)
     NB15_4 = pd.read_csv('NB15_4.csv', dtype=str, low_memory=False)
+    # Assign proper feature names based on NB15_features
     NB15_1.columns = NB15_features['Name']
     NB15_2.columns = NB15_features['Name']
     NB15_3.columns = NB15_features['Name']
     NB15_4.columns = NB15_features['Name']
+    # Concatenate data from all four files to form one full dataset view
     train_df = pd.concat([NB15_1, NB15_2, NB15_3, NB15_4], ignore_index=True)
     return train_df
@st.cache_resource
def load_model_artifacts():
    """
    Load the three saved artifacts used when testing the IDS.

    Files are read by relative path, so they are resolved against the
    current working directory:
      - 'features_to_drop.pkl'   (pickle): features dropped during training.
      - 'category_encodings.pkl' (pickle): encodings for categorical columns.
      - 'xgb_model.pkl'          (joblib): the pre-trained classifier.

    Returns:
        A tuple ``(features_to_drop, category_encodings, xgb_model)``.

    NOTE(review): unpickling executes whatever the file contains, so these
    artifacts must only ever come from a trusted source.
    """
    def _unpickle(path):
        # The context manager closes the handle as soon as the load finishes.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    dropped_features = _unpickle('features_to_drop.pkl')
    encodings = _unpickle('category_encodings.pkl')
    classifier = joblib.load('xgb_model.pkl')
    return dropped_features, encodings, classifier
def preprocess_input(df, features_to_drop, category_encodings):
    """
    Prepare an uploaded traffic DataFrame for prediction.

    Steps, in order:
      1. Check that the six required columns are present.
      2. Coerce those columns to numeric (unparseable values become NaN).
      3. Add the engineered features ``duration``, ``byte_ratio`` and
         ``pkt_ratio``.
      4. Drop every column named in ``features_to_drop`` that is present.
      5. Replace each column listed in ``category_encodings`` with the integer
         code of its value; values not in the category list get code -1.
      6. Fill any remaining missing values with 0.

    Args:
        df: Input DataFrame. It is copied; the caller's object is not modified.
        features_to_drop: Iterable of column names to remove (set, list,
            tuple, ...). ``None`` means "drop nothing".
        category_encodings: Mapping of column name -> sequence of categories,
            whose order defines the integer codes.

    Returns:
        The processed DataFrame, or ``None`` when one or more required columns
        are missing (each missing column is reported through ``st.error``).
    """
    required_cols = ("Stime", "Ltime", "sbytes", "dbytes", "Spkts", "Dpkts")

    # Validate up front and report every missing column, so the user can fix
    # the upload in one pass instead of discovering the gaps one at a time.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        for col in missing_cols:
            st.error(f"Missing required column: {col}")
        return None

    df = df.copy()
    for col in required_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Create engineered features. The +1 keeps the denominator non-zero when
    # the destination counter is 0.
    df['duration'] = df['Ltime'] - df['Stime']
    df['byte_ratio'] = df['sbytes'] / (df['dbytes'] + 1)
    df['pkt_ratio'] = df['Spkts'] / (df['Dpkts'] + 1)

    # Normalise to a set so a pickled list/tuple/Index works as well as a set;
    # iterating df.columns keeps the drop list in a deterministic order.
    drop_names = set(features_to_drop) if features_to_drop is not None else set()
    drop_cols = [col for col in df.columns if col in drop_names]
    if drop_cols:
        df = df.drop(columns=drop_cols)

    # Encode categorical variables with the saved category order.
    for col, categories in category_encodings.items():
        if col in df.columns:
            as_text = df[col].astype(str)
            df[col] = pd.Categorical(as_text, categories=categories).codes

    return df.fillna(0)
##############################################
# Streamlit User Interface - Testing IDS
##############################################
# Page setup: configuration first, then the title and the introductory text.
st.set_page_config(page_title="Intrusion Detection System - Test", layout="wide")
st.title("Intrusion Detection System (IDS) - Testing Interface")
st.markdown(
    """
    This interface allows you to test the Intrusion Detection System.
    Upload your network traffic CSV file (structured like the training data) to see the system's predictions.
    You can also view a sample of the training data to understand the features used in detection.
    """
)

# Two tabs: one to inspect the training data, one to run predictions.
sample_tab, test_tab = st.tabs(["Sample Data", "Test IDS"])

# -------- Tab 1: Sample Data --------
with sample_tab:
    st.header("Sample Training Data")
    st.markdown("Below is an overview of the underlying dataset used for training the IDS.")
    training_view = load_dataset_view()
    st.write("**Columns:**", training_view.columns.tolist())
    st.write("**Shape:**", training_view.shape)
    st.subheader("First 10 Rows of Training Data")
    st.dataframe(training_view.head(10))

# -------- Tab 2: Test IDS --------
with test_tab:
    st.header("Test the Intrusion Detection System")
    st.markdown(
        """
        **Instructions:**
        - Upload a CSV file containing network traffic data. The file should include key columns such as:
          `Stime, Ltime, sbytes, dbytes, Spkts, Dpkts` among others.
        - The system will preprocess the data and output prediction labels that indicate possible intrusions.
        """
    )
    uploaded_file = st.file_uploader("Upload Network Traffic CSV", type=["csv"])

    if not uploaded_file:
        # Nothing uploaded yet: show a hint instead of an empty tab.
        st.info("Awaiting CSV file upload for testing the Intrusion Detection System.")
    else:
        try:
            incoming = pd.read_csv(uploaded_file)
            st.subheader("Input Data Preview")
            st.dataframe(incoming.head(10))

            # Artifacts are loaded only once a file is present.
            features_to_drop, category_encodings, model = load_model_artifacts()
            prepared = preprocess_input(incoming, features_to_drop, category_encodings)

            if prepared is None:
                st.error("Preprocessing failed. Check that your data includes the necessary columns.")
            else:
                predictions = model.predict(prepared)
                st.subheader("IDS Predictions")
                st.markdown(
                    """
                    The prediction output is based on the model's encoding. Each row's prediction corresponds to an attack category.
                    For example, a prediction of 13 indicates a 'normal' traffic instance, whereas other values may indicate specific intrusion types.
                    """
                )
                st.write(predictions)
        except Exception as exc:
            # UI boundary: show any failure to the user rather than crashing the app.
            st.error(f"An error occurred while processing the file: {exc}")