MrUtakata committed on
Commit
95c089b
·
verified ·
1 Parent(s): 46a856e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -17
app.py CHANGED
@@ -1,17 +1,28 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import gdown
 
 
 
 
 
 
 
4
 
5
  @st.cache_data
6
  def load_dataset_view():
7
- # File IDs
 
 
 
 
8
  NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
9
  NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
10
  NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
11
  NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
12
  NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'
13
 
14
- # Download URLs constructed with Google Drive sharing link format
15
  urls = {
16
  'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
17
  'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
@@ -20,40 +31,144 @@ def load_dataset_view():
20
  'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
21
  }
22
 
23
- # Download all necessary files
24
  for filename, url in urls.items():
25
  st.write(f"Downloading {filename}...")
26
  gdown.download(url, filename, quiet=True)
27
 
28
- # Load features to assign proper column names
29
  NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')
30
 
31
- # Load datasets
32
  NB15_1 = pd.read_csv('NB15_1.csv', dtype=str, low_memory=False)
33
  NB15_2 = pd.read_csv('NB15_2.csv', dtype=str, low_memory=False)
34
  NB15_3 = pd.read_csv('NB15_3.csv', dtype=str, low_memory=False)
35
  NB15_4 = pd.read_csv('NB15_4.csv', dtype=str, low_memory=False)
36
 
37
- # Assign feature names to each dataset
38
  NB15_1.columns = NB15_features['Name']
39
  NB15_2.columns = NB15_features['Name']
40
  NB15_3.columns = NB15_features['Name']
41
  NB15_4.columns = NB15_features['Name']
42
 
43
- # Concatenate the datasets into a single DataFrame for a full view
44
  train_df = pd.concat([NB15_1, NB15_2, NB15_3, NB15_4], ignore_index=True)
45
  return train_df
46
 
47
- # --- Streamlit UI for "Intrusion Detection System" Dataset View ---
48
- st.title("Intrusion Detection System")
49
- st.header("Dataset View")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- df = load_dataset_view()
 
52
 
53
- # Display general information about the dataset
54
- st.write("**Dataset Columns:**", df.columns.tolist())
55
- st.write("**Dataset Shape:**", df.shape)
 
 
 
 
 
 
56
 
57
- # Display a sample of the dataset
58
- st.subheader("First 10 Rows of the Dataset")
59
- st.dataframe(df.head(10))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import gdown
4
+ import pickle
5
+ import joblib
6
+ import xgboost as xgb
7
+
8
+ ##############################################
9
+ # Helper Functions
10
+ ##############################################
11
 
12
@st.cache_data
def load_dataset_view():
    """
    Download the four UNSW-NB15 CSV parts plus the feature-description file
    from Google Drive and build a single DataFrame view of the raw training
    data. Useful for inspecting how the system was built.

    Returns:
        pd.DataFrame: All four NB15 parts concatenated (rows preserved,
        index reset), with column names taken from NB15_features 'Name'.
    """
    # Google Drive file IDs (NB15 parts and NB15_features)
    NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
    NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
    NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
    NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
    NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'

    # Construct download URLs using the Google Drive direct-download format
    urls = {
        'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
        'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
        'NB15_2.csv': f'https://drive.google.com/uc?id={NB15_2_file_id}',
        'NB15_3.csv': f'https://drive.google.com/uc?id={NB15_3_file_id}',
        'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
    }

    # Download each file; name the file in flight so progress is visible
    # (the original f-string had no placeholder, so every message was identical)
    for filename, url in urls.items():
        st.write(f"Downloading {filename}...")
        gdown.download(url, filename, quiet=True)

    # NB15_features holds the canonical column names for the data files
    NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')

    # Load every part as strings to avoid mixed-dtype inference problems,
    # then assign the proper feature names from NB15_features
    parts = []
    for i in range(1, 5):
        part = pd.read_csv(f'NB15_{i}.csv', dtype=str, low_memory=False)
        part.columns = NB15_features['Name']
        parts.append(part)

    # Concatenate into one full dataset view across all four files
    train_df = pd.concat(parts, ignore_index=True)
    return train_df
57
 
58
@st.cache_resource
def load_model_artifacts():
    """
    Load the saved artifacts needed to run the IDS on new data.

    Returns a 3-tuple:
        features_to_drop    -- the set of features dropped during training
        category_encodings  -- the mapping for encoding categorical variables
        xgb_model           -- the pre-trained XGBoost classifier
    """
    # NOTE(review): pickle/joblib deserialization can execute arbitrary code;
    # these artifact files must come from a trusted source.
    artifacts = []
    for pickle_path in ('features_to_drop.pkl', 'category_encodings.pkl'):
        with open(pickle_path, 'rb') as handle:
            artifacts.append(pickle.load(handle))
    artifacts.append(joblib.load('xgb_model.pkl'))
    return tuple(artifacts)
72
+
73
def preprocess_input(df, features_to_drop, category_encodings):
    """
    Prepare an uploaded test DataFrame for the XGBoost model.

    Steps:
    - Verify the core columns are present and convert them to numeric
      (bad values coerce to NaN).
    - Add engineered features: duration, byte_ratio, pkt_ratio.
    - Drop any columns that were removed during model training.
    - Encode categorical columns using the saved category mappings.

    Returns the processed DataFrame, or None when a required column is absent
    (an error is also surfaced in the Streamlit UI in that case).
    """
    frame = df.copy()

    required = ("Stime", "Ltime", "sbytes", "dbytes", "Spkts", "Dpkts")
    for name in required:
        if name not in frame.columns:
            st.error(f"Missing required column: {name}")
            return None
        frame[name] = pd.to_numeric(frame[name], errors='coerce')

    # Engineered features (the +1 in the ratios guards against division by zero)
    frame['duration'] = frame['Ltime'] - frame['Stime']
    frame['byte_ratio'] = frame['sbytes'] / (frame['dbytes'] + 1)
    frame['pkt_ratio'] = frame['Spkts'] / (frame['Dpkts'] + 1)

    # Remove training-time dropped features that appear in the input
    to_remove = [col for col in frame.columns if col in features_to_drop]
    if to_remove:
        frame = frame.drop(columns=to_remove)

    # Encode categoricals with the saved category order; unseen values code to -1
    for name, categories in category_encodings.items():
        if name in frame.columns:
            frame[name] = pd.Categorical(
                frame[name].astype(str), categories=categories
            ).codes

    # Coerced NaNs (unparsable numerics) become 0
    return frame.fillna(0)
108
+
109
##############################################
# Streamlit User Interface - Testing IDS
##############################################

st.set_page_config(page_title="Intrusion Detection System - Test", layout="wide")
st.title("Intrusion Detection System (IDS) - Testing Interface")
st.markdown(
    """
This interface allows you to test the Intrusion Detection System.
Upload your network traffic CSV file (structured like the training data) to see the system's predictions.
You can also view a sample of the training data to understand the features used in detection.
"""
)

# Two sections: one for viewing sample/training data, one for testing predictions.
sample_tab, test_tab = st.tabs(["Sample Data", "Test IDS"])

# -------- Tab 1: Sample Data --------
with sample_tab:
    st.header("Sample Training Data")
    st.markdown("Below is an overview of the underlying dataset used for training the IDS.")
    sample_df = load_dataset_view()
    st.write("**Columns:**", sample_df.columns.tolist())
    st.write("**Shape:**", sample_df.shape)
    st.subheader("First 10 Rows of Training Data")
    st.dataframe(sample_df.head(10))

# -------- Tab 2: Test IDS --------
with test_tab:
    st.header("Test the Intrusion Detection System")
    st.markdown(
        """
**Instructions:**
- Upload a CSV file containing network traffic data. The file should include key columns such as:
  `Stime, Ltime, sbytes, dbytes, Spkts, Dpkts` among others.
- The system will preprocess the data and output prediction labels that indicate possible intrusions.
"""
    )
    uploaded_file = st.file_uploader("Upload Network Traffic CSV", type=["csv"])
    # Optionally, a button loading a bundled sample test file could be added here.
    if not uploaded_file:
        st.info("Awaiting CSV file upload for testing the Intrusion Detection System.")
    else:
        try:
            raw_df = pd.read_csv(uploaded_file)
            st.subheader("Input Data Preview")
            st.dataframe(raw_df.head(10))

            # Artifacts saved at training time drive preprocessing and prediction.
            features_to_drop, category_encodings, model = load_model_artifacts()
            processed = preprocess_input(raw_df, features_to_drop, category_encodings)

            if processed is None:
                st.error("Preprocessing failed. Check that your data includes the necessary columns.")
            else:
                predictions = model.predict(processed)
                st.subheader("IDS Predictions")
                st.markdown(
                    """
The prediction output is based on the model's encoding. Each row's prediction corresponds to an attack category.
For example, a prediction of 13 indicates a 'normal' traffic instance, whereas other values may indicate specific intrusion types.
"""
                )
                st.write(predictions)
        except Exception as e:
            st.error(f"An error occurred while processing the file: {e}")