Update app.py
Browse files
app.py
CHANGED
|
@@ -1,17 +1,28 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import gdown
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
@st.cache_data
|
| 6 |
def load_dataset_view():
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
|
| 9 |
NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
|
| 10 |
NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
|
| 11 |
NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
|
| 12 |
NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'
|
| 13 |
|
| 14 |
-
#
|
| 15 |
urls = {
|
| 16 |
'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
|
| 17 |
'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
|
|
@@ -20,40 +31,144 @@ def load_dataset_view():
|
|
| 20 |
'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
|
| 21 |
}
|
| 22 |
|
| 23 |
-
# Download
|
| 24 |
for filename, url in urls.items():
|
| 25 |
st.write(f"Downloading {filename}...")
|
| 26 |
gdown.download(url, filename, quiet=True)
|
| 27 |
|
| 28 |
-
# Load
|
| 29 |
NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')
|
| 30 |
|
| 31 |
-
# Load datasets
|
| 32 |
NB15_1 = pd.read_csv('NB15_1.csv', dtype=str, low_memory=False)
|
| 33 |
NB15_2 = pd.read_csv('NB15_2.csv', dtype=str, low_memory=False)
|
| 34 |
NB15_3 = pd.read_csv('NB15_3.csv', dtype=str, low_memory=False)
|
| 35 |
NB15_4 = pd.read_csv('NB15_4.csv', dtype=str, low_memory=False)
|
| 36 |
|
| 37 |
-
# Assign feature names
|
| 38 |
NB15_1.columns = NB15_features['Name']
|
| 39 |
NB15_2.columns = NB15_features['Name']
|
| 40 |
NB15_3.columns = NB15_features['Name']
|
| 41 |
NB15_4.columns = NB15_features['Name']
|
| 42 |
|
| 43 |
-
# Concatenate
|
| 44 |
train_df = pd.concat([NB15_1, NB15_2, NB15_3, NB15_4], ignore_index=True)
|
| 45 |
return train_df
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
|
|
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import gdown
|
| 4 |
+
import pickle
|
| 5 |
+
import joblib
|
| 6 |
+
import xgboost as xgb
|
| 7 |
+
|
| 8 |
+
##############################################
|
| 9 |
+
# Helper Functions
|
| 10 |
+
##############################################
|
| 11 |
|
| 12 |
@st.cache_data
|
| 13 |
def load_dataset_view():
|
| 14 |
+
"""
|
| 15 |
+
Downloads the NB15 datasets and constructs a view of the raw training data.
|
| 16 |
+
Useful for inspecting how the system was built.
|
| 17 |
+
"""
|
| 18 |
+
# File IDs (NB15 and NB15_features)
|
| 19 |
NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
|
| 20 |
NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
|
| 21 |
NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
|
| 22 |
NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
|
| 23 |
NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'
|
| 24 |
|
| 25 |
+
# Construct download URLs using Google Drive links
|
| 26 |
urls = {
|
| 27 |
'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
|
| 28 |
'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
|
|
|
|
| 31 |
'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
|
| 32 |
}
|
| 33 |
|
| 34 |
+
# Download each file (progress messages help track the process)
|
| 35 |
for filename, url in urls.items():
|
| 36 |
st.write(f"Downloading {filename}...")
|
| 37 |
gdown.download(url, filename, quiet=True)
|
| 38 |
|
| 39 |
+
# Load NB15_features to assign proper column names
|
| 40 |
NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')
|
| 41 |
|
| 42 |
+
# Load NB15 datasets as strings
|
| 43 |
NB15_1 = pd.read_csv('NB15_1.csv', dtype=str, low_memory=False)
|
| 44 |
NB15_2 = pd.read_csv('NB15_2.csv', dtype=str, low_memory=False)
|
| 45 |
NB15_3 = pd.read_csv('NB15_3.csv', dtype=str, low_memory=False)
|
| 46 |
NB15_4 = pd.read_csv('NB15_4.csv', dtype=str, low_memory=False)
|
| 47 |
|
| 48 |
+
# Assign proper feature names based on NB15_features
|
| 49 |
NB15_1.columns = NB15_features['Name']
|
| 50 |
NB15_2.columns = NB15_features['Name']
|
| 51 |
NB15_3.columns = NB15_features['Name']
|
| 52 |
NB15_4.columns = NB15_features['Name']
|
| 53 |
|
| 54 |
+
# Concatenate data from all four files to form one full dataset view
|
| 55 |
train_df = pd.concat([NB15_1, NB15_2, NB15_3, NB15_4], ignore_index=True)
|
| 56 |
return train_df
|
| 57 |
|
| 58 |
+
@st.cache_resource
def load_model_artifacts():
    """
    Read the saved artifacts used by the IDS test interface from the
    current working directory.

    Returns a 3-tuple ``(features_to_drop, category_encodings, xgb_model)``:
    - features_to_drop: contents of 'features_to_drop.pkl' (the features
      dropped during training).
    - category_encodings: contents of 'category_encodings.pkl' (the mapping
      used to encode categorical variables).
    - xgb_model: the object stored in 'xgb_model.pkl', loaded with joblib
      (the pre-trained XGBoost classifier).

    NOTE(review): pickle and joblib can execute arbitrary code while
    loading, so these files must only ever come from a trusted source.
    """
    def _unpickle(path):
        # Small local helper so both pickle files are read the same way.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    dropped_features = _unpickle('features_to_drop.pkl')
    encodings = _unpickle('category_encodings.pkl')
    classifier = joblib.load('xgb_model.pkl')
    return dropped_features, encodings, classifier
|
| 72 |
+
|
| 73 |
+
def preprocess_input(df, features_to_drop, category_encodings):
    """
    Prepare an uploaded traffic DataFrame for prediction.

    Steps, in order:
    - Verify that every required raw column is present.
    - Convert the required columns to numeric (unparseable values -> NaN).
    - Compute engineered features: duration, byte_ratio, and pkt_ratio.
    - Drop columns listed in ``features_to_drop`` that exist in the input.
    - Encode categorical columns as integer codes using the saved
      category lists; values not in the saved list are encoded as -1.
    - Replace any remaining NaN with 0.

    Parameters:
    - df: input DataFrame; it is not modified (a copy is processed).
    - features_to_drop: any iterable of column names (set, list, tuple, ...)
      or None for "drop nothing".
    - category_encodings: mapping of column name -> ordered list of known
      categories, or None for "encode nothing".

    Returns the processed DataFrame, or None (after reporting every missing
    column through ``st.error``) when a required column is absent.
    """
    required_cols = ["Stime", "Ltime", "sbytes", "dbytes", "Spkts", "Dpkts"]

    # Validate up front and report *all* missing columns, so the user can
    # fix the file in one pass instead of one column per upload.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        for col in missing_cols:
            st.error(f"Missing required column: {col}")
        return None

    df = df.copy()
    for col in required_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Create engineered features (+1 in the denominators avoids division by zero)
    df['duration'] = df['Ltime'] - df['Stime']
    df['byte_ratio'] = df['sbytes'] / (df['dbytes'] + 1)
    df['pkt_ratio'] = df['Spkts'] / (df['Dpkts'] + 1)

    # Drop features (if present in the input) that were filtered during
    # training. Coerce to a set so a pickled list/tuple works as well as a set.
    to_drop = set() if features_to_drop is None else set(features_to_drop)
    drop_cols = list(to_drop & set(df.columns))
    if drop_cols:
        df = df.drop(columns=drop_cols)

    # Encode categorical variables with the saved category order.
    encodings = {} if category_encodings is None else category_encodings
    for col, categories in encodings.items():
        if col in df.columns:
            as_text = df[col].astype(str)
            # Categorical assigns code -1 to any value not in `categories`.
            df[col] = pd.Categorical(as_text, categories=categories).codes

    return df.fillna(0)
|
| 108 |
+
|
| 109 |
+
##############################################
|
| 110 |
+
# Streamlit User Interface - Testing IDS
|
| 111 |
+
##############################################
|
| 112 |
+
|
| 113 |
+
st.set_page_config(page_title="Intrusion Detection System - Test", layout="wide")
|
| 114 |
+
st.title("Intrusion Detection System (IDS) - Testing Interface")
|
| 115 |
+
st.markdown(
|
| 116 |
+
"""
|
| 117 |
+
This interface allows you to test the Intrusion Detection System.
|
| 118 |
+
Upload your network traffic CSV file (structured like the training data) to see the system's predictions.
|
| 119 |
+
You can also view a sample of the training data to understand the features used in detection.
|
| 120 |
+
"""
|
| 121 |
+
)
|
| 122 |
|
| 123 |
+
# Create two sections: one for viewing sample/training data, and one for testing predictions.
|
| 124 |
+
tabs = st.tabs(["Sample Data", "Test IDS"])
|
| 125 |
|
| 126 |
+
# -------- Tab 1: Sample Data --------
|
| 127 |
+
with tabs[0]:
|
| 128 |
+
st.header("Sample Training Data")
|
| 129 |
+
st.markdown("Below is an overview of the underlying dataset used for training the IDS.")
|
| 130 |
+
df_view = load_dataset_view()
|
| 131 |
+
st.write("**Columns:**", df_view.columns.tolist())
|
| 132 |
+
st.write("**Shape:**", df_view.shape)
|
| 133 |
+
st.subheader("First 10 Rows of Training Data")
|
| 134 |
+
st.dataframe(df_view.head(10))
|
| 135 |
|
| 136 |
+
# -------- Tab 2: Test IDS --------
|
| 137 |
+
with tabs[1]:
|
| 138 |
+
st.header("Test the Intrusion Detection System")
|
| 139 |
+
st.markdown(
|
| 140 |
+
"""
|
| 141 |
+
**Instructions:**
|
| 142 |
+
- Upload a CSV file containing network traffic data. The file should include key columns such as:
|
| 143 |
+
`Stime, Ltime, sbytes, dbytes, Spkts, Dpkts` among others.
|
| 144 |
+
- The system will preprocess the data and output prediction labels that indicate possible intrusions.
|
| 145 |
+
"""
|
| 146 |
+
)
|
| 147 |
+
uploaded_file = st.file_uploader("Upload Network Traffic CSV", type=["csv"])
|
| 148 |
+
# Optionally, you can provide a button to load a sample test file if available.
|
| 149 |
+
if uploaded_file:
|
| 150 |
+
try:
|
| 151 |
+
test_df = pd.read_csv(uploaded_file)
|
| 152 |
+
st.subheader("Input Data Preview")
|
| 153 |
+
st.dataframe(test_df.head(10))
|
| 154 |
+
|
| 155 |
+
# Load the saved model artifacts needed for preprocessing and prediction.
|
| 156 |
+
features_to_drop, category_encodings, model = load_model_artifacts()
|
| 157 |
+
processed_test_df = preprocess_input(test_df, features_to_drop, category_encodings)
|
| 158 |
+
|
| 159 |
+
if processed_test_df is not None:
|
| 160 |
+
predictions = model.predict(processed_test_df)
|
| 161 |
+
st.subheader("IDS Predictions")
|
| 162 |
+
st.markdown(
|
| 163 |
+
"""
|
| 164 |
+
The prediction output is based on the model's encoding. Each row's prediction corresponds to an attack category.
|
| 165 |
+
For example, a prediction of 13 indicates a 'normal' traffic instance, whereas other values may indicate specific intrusion types.
|
| 166 |
+
"""
|
| 167 |
+
)
|
| 168 |
+
st.write(predictions)
|
| 169 |
+
else:
|
| 170 |
+
st.error("Preprocessing failed. Check that your data includes the necessary columns.")
|
| 171 |
+
except Exception as e:
|
| 172 |
+
st.error(f"An error occurred while processing the file: {e}")
|
| 173 |
+
else:
|
| 174 |
+
st.info("Awaiting CSV file upload for testing the Intrusion Detection System.")
|