MrUtakata committed on
Commit 588821a · verified · 1 Parent(s): 5a22b15

Update app.py

Files changed (1)
  1. app.py +49 -99
app.py CHANGED
@@ -1,109 +1,59 @@
  import streamlit as st
  import pandas as pd
- import numpy as np
- import pickle
- import joblib
- import xgboost as xgb
-
- # --- Helper Functions ---
-
- @st.cache(allow_output_mutation=True)
- def load_artifacts():
-     """
-     Loads pre-saved artifacts:
-       - features_to_drop.pkl: A set of columns to drop.
-       - category_encodings.pkl: A dictionary containing encodings for categorical columns.
-       - xgb_model.pkl: The trained XGBoost model.
-     """
-     with open("features_to_drop.pkl", "rb") as f:
-         features_to_drop = pickle.load(f)
-     with open("category_encodings.pkl", "rb") as f:
-         category_encodings = pickle.load(f)
-     xgb_model = joblib.load("xgb_model.pkl")
-     return features_to_drop, category_encodings, xgb_model
-
- def preprocess_input(df, features_to_drop, category_encodings):
-     """
-     Preprocess incoming data to match training conditions.
-
-     Expected input columns (at least) for feature engineering:
-       - 'Ltime', 'Stime': Used to compute duration.
-       - 'sbytes', 'dbytes': Used to compute byte_ratio.
-       - 'Spkts', 'Dpkts': Used to compute pkt_ratio.
-
-     It also drops the columns flagged as highly correlated and applies
-     the same categorical encoding used in training.
-     """
-     df = df.copy()
-
-     # Convert expected numeric columns (if not already numeric)
-     for col in ['Ltime', 'Stime', 'sbytes', 'dbytes', 'Spkts', 'Dpkts']:
-         if col in df.columns:
-             df[col] = pd.to_numeric(df[col], errors='coerce')
-         else:
-             st.error(f"Column '{col}' not found in the input data.")
-             return None
-
-     # Feature engineering: compute new features
-     df["duration"] = df["Ltime"] - df["Stime"]
-     df["byte_ratio"] = df["sbytes"] / (df["dbytes"] + 1)
-     df["pkt_ratio"] = df["Spkts"] / (df["Dpkts"] + 1)
-
-     # Drop features (if present in the dataframe)
-     drop_cols = list(features_to_drop.intersection(set(df.columns)))
-     if drop_cols:
-         df = df.drop(columns=drop_cols)
-
-     # Encode categorical variables using the saved category encodings
-     for col, categories in category_encodings.items():
-         if col in df.columns:
-             # Cast to string first so the conversion to categorical works properly.
-             df[col] = df[col].astype(str)
-             df[col] = pd.Categorical(df[col], categories=categories)
-             # .cat.codes assigns -1 to unknown categories.
-             df[col] = df[col].cat.codes
-
-     # Fill any remaining missing values if needed (this is customizable)
-     df = df.fillna(0)
-     return df
-
- # --- Load Artifacts ---
- features_to_drop, category_encodings, model = load_artifacts()
-
- # --- Streamlit Interface ---
- st.title("XGBoost Prediction App")
- st.markdown(
-     """
-     This app lets you upload a CSV file of network data, applies the same preprocessing
-     steps used during training (feature engineering, dropping of highly correlated
-     features, categorical encoding), and runs a trained XGBoost model to generate
-     predictions.
-     """
- )
-
- st.header("Upload Input CSV")
- uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
-
- if uploaded_file is not None:
-     try:
-         # Read the CSV data
-         input_df = pd.read_csv(uploaded_file)
-         st.subheader("Raw Input Data")
-         st.dataframe(input_df.head())
-
-         # Preprocess the data to create model features
-         preprocessed_df = preprocess_input(input_df, features_to_drop, category_encodings)
-         if preprocessed_df is not None:
-             st.subheader("Preprocessed Data")
-             st.dataframe(preprocessed_df.head())
-
-             # Predict using the loaded XGBoost model
-             predictions = model.predict(preprocessed_df)
-             # If the model was trained for multiclass classification, the predictions may be encoded labels.
-             st.subheader("Predictions")
-             st.write(predictions)
-         else:
-             st.error("Preprocessing failed. Please check the input data columns.")
-     except Exception as e:
-         st.error(f"Error processing file: {e}")
- else:
-     st.info("Awaiting CSV file upload.")
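
Aside on the removed loader: `st.cache(allow_output_mutation=True)` is deprecated in current Streamlit releases. A minimal sketch of how the model-loading step could be written against the modern caching API (`load_model` is a hypothetical helper, not part of this commit; the `xgb_model.pkl` path comes from the removed code):

    import joblib
    import streamlit as st

    # st.cache_resource is Streamlit's current cache for objects that should not
    # be copied on each rerun (models, DB connections); it replaces
    # st.cache(allow_output_mutation=True).
    @st.cache_resource
    def load_model(path: str = "xgb_model.pkl"):
        # Deserialize the trained XGBoost model saved with joblib
        return joblib.load(path)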
 
  import streamlit as st
  import pandas as pd
+ import gdown
+
+ @st.cache_data
+ def load_dataset_view():
+     # Google Drive file IDs
+     NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
+     NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
+     NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
+     NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
+     NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'
+
+     # Download URLs constructed with the Google Drive sharing-link format
+     urls = {
+         'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
+         'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
+         'NB15_2.csv': f'https://drive.google.com/uc?id={NB15_2_file_id}',
+         'NB15_3.csv': f'https://drive.google.com/uc?id={NB15_3_file_id}',
+         'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
+     }
+
+     # Download all necessary files
+     for filename, url in urls.items():
+         st.write(f"Downloading {filename}...")
+         gdown.download(url, filename, quiet=True)
+
+     # Load the feature table to assign proper column names
+     NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')
+
+     # Load the datasets
+     NB15_1 = pd.read_csv('NB15_1.csv', dtype=str, low_memory=False)
+     NB15_2 = pd.read_csv('NB15_2.csv', dtype=str, low_memory=False)
+     NB15_3 = pd.read_csv('NB15_3.csv', dtype=str, low_memory=False)
+     NB15_4 = pd.read_csv('NB15_4.csv', dtype=str, low_memory=False)
+
+     # Assign feature names to each dataset
+     NB15_1.columns = NB15_features['Name']
+     NB15_2.columns = NB15_features['Name']
+     NB15_3.columns = NB15_features['Name']
+     NB15_4.columns = NB15_features['Name']
+
+     # Concatenate the datasets into a single DataFrame for a full view
+     train_df = pd.concat([NB15_1, NB15_2, NB15_3, NB15_4], ignore_index=True)
+     return train_df
+
+ # --- Streamlit UI for the "Intrusion Detection System" Dataset View ---
+ st.title("Intrusion Detection System")
+ st.header("Dataset View")
+
+ df = load_dataset_view()
+
+ # Display general information about the dataset
+ st.write("**Dataset Columns:**", df.columns.tolist())
+ st.write("**Dataset Shape:**", df.shape)
+
+ # Display a sample of the dataset
+ st.subheader("First 10 Rows of the Dataset")
+ st.dataframe(df.head(10))
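
Aside on the new loader: the four data CSVs are read with `dtype=str`, so every column, including packet counters and timestamps, arrives as text. A minimal sketch of the numeric conversion a downstream analysis step would need (the column names are taken from the removed preprocessing code; that they appear in `NB15_features['Name']` is an assumption):

    # Convert known numeric columns back from strings (sketch, not part of this commit)
    numeric_cols = ['Stime', 'Ltime', 'sbytes', 'dbytes', 'Spkts', 'Dpkts']
    for col in numeric_cols:
        if col in df.columns:
            # errors='coerce' turns unparseable values into NaN instead of raising
            df[col] = pd.to_numeric(df[col], errors='coerce')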