MrUtakata committed on
Commit 588821a · verified · 1 Parent(s): 5a22b15

Update app.py

Files changed (1)
  1. app.py +49 -99
app.py CHANGED
@@ -1,109 +1,59 @@
  import streamlit as st
  import pandas as pd
- import numpy as np
- import pickle
- import joblib
- import xgboost as xgb
-
- # --- Helper Functions ---
-
- @st.cache(allow_output_mutation=True)
- def load_artifacts():
-     """
-     Loads pre-saved artifacts:
-       - features_to_drop.pkl: A set of columns to drop.
-       - category_encodings.pkl: A dictionary containing encodings for categorical columns.
-       - xgb_model.pkl: The trained XGBoost model.
-     """
-     with open("features_to_drop.pkl", "rb") as f:
-         features_to_drop = pickle.load(f)
-     with open("category_encodings.pkl", "rb") as f:
-         category_encodings = pickle.load(f)
-     xgb_model = joblib.load("xgb_model.pkl")
-     return features_to_drop, category_encodings, xgb_model
-
- def preprocess_input(df, features_to_drop, category_encodings):
-     """
-     Preprocess incoming data to match training conditions.
-
-     Expected input columns (at least) for feature engineering:
-       - 'Ltime', 'Stime': Used to compute duration.
-       - 'sbytes', 'dbytes': Used to compute byte_ratio.
-       - 'Spkts', 'Dpkts': Used to compute pkt_ratio.
-
-     It also drops the columns flagged as highly correlated and applies
-     the same categorical encoding used in training.
-     """
-     df = df.copy()
-
-     # Convert expected numeric columns (if not already numeric)
-     for col in ['Ltime', 'Stime', 'sbytes', 'dbytes', 'Spkts', 'Dpkts']:
-         if col in df.columns:
-             df[col] = pd.to_numeric(df[col], errors='coerce')
-         else:
-             st.error(f"Column '{col}' not found in the input data.")
-             return None
-
-     # Feature engineering: compute new features
-     df["duration"] = df["Ltime"] - df["Stime"]
-     df["byte_ratio"] = df["sbytes"] / (df["dbytes"] + 1)
-     df["pkt_ratio"] = df["Spkts"] / (df["Dpkts"] + 1)
-
-     # Drop features (if present in the dataframe)
-     drop_cols = list(features_to_drop.intersection(set(df.columns)))
-     if drop_cols:
-         df = df.drop(columns=drop_cols)
-
-     # Encode categorical variables using the saved category encodings
-     for col, categories in category_encodings.items():
-         if col in df.columns:
-             # Cast to string first so the conversion to categorical works properly.
-             df[col] = df[col].astype(str)
-             df[col] = pd.Categorical(df[col], categories=categories)
-             # .cat.codes assigns -1 to unknown categories.
-             df[col] = df[col].cat.codes
-
-     # Fill any remaining missing values if needed (this is customizable)
-     df = df.fillna(0)
-     return df
-
- # --- Load Artifacts ---
- features_to_drop, category_encodings, model = load_artifacts()
-
- # --- Streamlit Interface ---
- st.title("XGBoost Prediction App")
- st.markdown(
-     """
-     This app lets you upload a CSV file of network data, applies the same preprocessing
-     steps used during training (feature engineering, dropping of highly correlated
-     features, categorical encoding), and runs a trained XGBoost model to generate
-     predictions.
-     """
- )
-
- st.header("Upload Input CSV")
- uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
-
- if uploaded_file is not None:
-     try:
-         # Read the CSV data
-         input_df = pd.read_csv(uploaded_file)
-         st.subheader("Raw Input Data")
-         st.dataframe(input_df.head())
-
-         # Preprocess the data to create model features
-         preprocessed_df = preprocess_input(input_df, features_to_drop, category_encodings)
-         if preprocessed_df is not None:
-             st.subheader("Preprocessed Data")
-             st.dataframe(preprocessed_df.head())
-
-             # Predict using the loaded XGBoost model
-             predictions = model.predict(preprocessed_df)
-             # If the model was trained for multiclass classification, the predictions may be encoded labels.
-             st.subheader("Predictions")
-             st.write(predictions)
-         else:
-             st.error("Preprocessing failed. Please check the input data columns.")
-     except Exception as e:
-         st.error(f"Error processing file: {e}")
- else:
-     st.info("Awaiting CSV file upload.")
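
Aside on the removed loader: `st.cache(allow_output_mutation=True)` is deprecated in current Streamlit releases. A minimal sketch of how the model-loading step could be written against the modern caching API (`load_model` is a hypothetical helper, not part of this commit; the `xgb_model.pkl` path comes from the removed code):

    import joblib
    import streamlit as st

    # st.cache_resource is Streamlit's current cache for objects that should not
    # be copied on each rerun (models, DB connections); it replaces
    # st.cache(allow_output_mutation=True).
    @st.cache_resource
    def load_model(path: str = "xgb_model.pkl"):
        # Deserialize the trained XGBoost model saved with joblib
        return joblib.load(path)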
 
  import streamlit as st
  import pandas as pd
+ import gdown
+
+ @st.cache_data
+ def load_dataset_view():
+     # Google Drive file IDs
+     NB15_features_file_id = '1CgOl-fuxrluSxPMsL-vTuB4uPraTko-W'
+     NB15_1_file_id = '1letlWY_VIVLEkrfCexpNAysnPMRDXEbA'
+     NB15_2_file_id = '1QzwdKNEqDKGECWCNtz9K3DAWMMPpn2NN'
+     NB15_3_file_id = '19NV-RSuAD6F_zBDiDPZa5Pe5_Z7Sjynl'
+     NB15_4_file_id = '1_cOQOoqKthkSzevzqxBGHUpS-BFHuJPz'
+
+     # Download URLs constructed with the Google Drive sharing-link format
+     urls = {
+         'NB15_features.csv': f'https://drive.google.com/uc?id={NB15_features_file_id}',
+         'NB15_1.csv': f'https://drive.google.com/uc?id={NB15_1_file_id}',
+         'NB15_2.csv': f'https://drive.google.com/uc?id={NB15_2_file_id}',
+         'NB15_3.csv': f'https://drive.google.com/uc?id={NB15_3_file_id}',
+         'NB15_4.csv': f'https://drive.google.com/uc?id={NB15_4_file_id}',
+     }
+
+     # Download all necessary files
+     for filename, url in urls.items():
+         st.write(f"Downloading {filename}...")
+         gdown.download(url, filename, quiet=True)
+
+     # Load the feature table to assign proper column names
+     NB15_features = pd.read_csv('NB15_features.csv', encoding='cp1252')
+
+     # Load the datasets
+     NB15_1 = pd.read_csv('NB15_1.csv', dtype=str, low_memory=False)
+     NB15_2 = pd.read_csv('NB15_2.csv', dtype=str, low_memory=False)
+     NB15_3 = pd.read_csv('NB15_3.csv', dtype=str, low_memory=False)
+     NB15_4 = pd.read_csv('NB15_4.csv', dtype=str, low_memory=False)
+
+     # Assign feature names to each dataset
+     NB15_1.columns = NB15_features['Name']
+     NB15_2.columns = NB15_features['Name']
+     NB15_3.columns = NB15_features['Name']
+     NB15_4.columns = NB15_features['Name']
+
+     # Concatenate the datasets into a single DataFrame for a full view
+     train_df = pd.concat([NB15_1, NB15_2, NB15_3, NB15_4], ignore_index=True)
+     return train_df
+
+ # --- Streamlit UI for the "Intrusion Detection System" Dataset View ---
+ st.title("Intrusion Detection System")
+ st.header("Dataset View")
+
+ df = load_dataset_view()
+
+ # Display general information about the dataset
+ st.write("**Dataset Columns:**", df.columns.tolist())
+ st.write("**Dataset Shape:**", df.shape)
+
+ # Display a sample of the dataset
+ st.subheader("First 10 Rows of the Dataset")
+ st.dataframe(df.head(10))
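
Aside on the new loader: the four data CSVs are read with `dtype=str`, so every column, including packet counters and timestamps, arrives as text. A minimal sketch of the numeric conversion a downstream analysis step would need (the column names are taken from the removed preprocessing code; that they appear in `NB15_features['Name']` is an assumption):

    # Convert known numeric columns back from strings (sketch, not part of this commit)
    numeric_cols = ['Stime', 'Ltime', 'sbytes', 'dbytes', 'Spkts', 'Dpkts']
    for col in numeric_cols:
        if col in df.columns:
            # errors='coerce' turns unparseable values into NaN instead of raising
            df[col] = pd.to_numeric(df[col], errors='coerce')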