Create app.py
app.py
ADDED
@@ -0,0 +1,109 @@
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import joblib
import xgboost as xgb

# --- Helper Functions ---

@st.cache(allow_output_mutation=True)
def load_artifacts():
    """
    Loads pre-saved artifacts:
      - features_to_drop.pkl: A set of columns to drop.
      - category_encodings.pkl: A dictionary containing encodings for categorical columns.
      - xgb_model.pkl: The trained XGBoost model.
    """
    with open("features_to_drop.pkl", "rb") as f:
        features_to_drop = pickle.load(f)
    with open("category_encodings.pkl", "rb") as f:
        category_encodings = pickle.load(f)
    xgb_model = joblib.load("xgb_model.pkl")
    return features_to_drop, category_encodings, xgb_model

def preprocess_input(df, features_to_drop, category_encodings):
    """
    Preprocess incoming data to match training conditions.

    Expected input columns (at least) for feature engineering:
      - 'Ltime', 'Stime': used to compute duration.
      - 'sbytes', 'dbytes': used to compute byte_ratio.
      - 'Spkts', 'Dpkts': used to compute pkt_ratio.

    It also drops the columns that were flagged as highly correlated and
    applies the same categorical encoding used in training.
    """
    df = df.copy()

    # Convert expected numeric columns (if not already numeric)
    for col in ['Ltime', 'Stime', 'sbytes', 'dbytes', 'Spkts', 'Dpkts']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            st.error(f"Column '{col}' not found in the input data.")
            return None

    # Feature engineering: calculate new features
    df["duration"] = df["Ltime"] - df["Stime"]
    df["byte_ratio"] = df["sbytes"] / (df["dbytes"] + 1)
    df["pkt_ratio"] = df["Spkts"] / (df["Dpkts"] + 1)

    # Drop features (if present in the dataframe)
    drop_cols = list(features_to_drop.intersection(set(df.columns)))
    if drop_cols:
        df = df.drop(columns=drop_cols)

    # Encode categorical variables using the saved category encodings
    for col, categories in category_encodings.items():
        if col in df.columns:
            # Cast to string first so the conversion to a categorical dtype works reliably.
            df[col] = df[col].astype(str)
            df[col] = pd.Categorical(df[col], categories=categories)
            # .cat.codes assigns -1 to categories not seen during training.
            df[col] = df[col].cat.codes

    # Fill any remaining missing values if needed (this is customizable)
    df = df.fillna(0)
    return df

# --- Load Artifacts ---
features_to_drop, category_encodings, model = load_artifacts()

# --- Streamlit Interface ---
st.title("XGBoost Prediction App")
st.markdown(
    """
    This app lets you upload a CSV file of network data, applies the same preprocessing steps
    used during training (feature engineering, dropping of highly correlated features,
    categorical encoding), and then runs the trained XGBoost model to generate predictions.
    """
)

st.header("Upload Input CSV")
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])

if uploaded_file is not None:
    try:
        # Read the CSV data
        input_df = pd.read_csv(uploaded_file)
        st.subheader("Raw Input Data")
        st.dataframe(input_df.head())

        # Preprocess the data to create model features
        preprocessed_df = preprocess_input(input_df, features_to_drop, category_encodings)
        if preprocessed_df is not None:
            st.subheader("Preprocessed Data")
            st.dataframe(preprocessed_df.head())

            # Predict using the loaded XGBoost model
            predictions = model.predict(preprocessed_df)
            # For multiclass models, these predictions may be encoded labels.
            st.subheader("Predictions")
            st.write(predictions)
        else:
            st.error("Preprocessing failed. Please check the input data columns.")
    except Exception as e:
        st.error(f"Error processing file: {e}")
else:
    st.info("Awaiting CSV file upload.")
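
The categorical step above relies on pandas mapping values that were never seen during training to -1. A minimal, self-contained illustration of that behaviour (the category list and sample values below are made up for the example):

# Demonstration of the encoding behaviour app.py relies on: values absent from
# the saved category list are mapped to -1 by the categorical codes.
import pandas as pd

saved_categories = ["tcp", "udp", "arp"]        # as it might appear in category_encodings.pkl
incoming = pd.Series(["udp", "tcp", "icmp"])    # "icmp" was never seen in training
codes = pd.Categorical(incoming, categories=saved_categories).codes
print(codes.tolist())  # [1, 0, -1]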
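
The app expects features_to_drop.pkl, category_encodings.pkl, and xgb_model.pkl to sit next to app.py; they are not part of this commit. Purely as a hedged sketch of the contract those files satisfy, here is one way such artifacts could be written at training time, using a tiny synthetic dataset and assumed column choices (the dropped columns and the "proto" categorical column are illustrative, not taken from the real training pipeline):

# Hypothetical training-side sketch (not part of this commit): produces the
# three artifact files app.py loads. All data and column choices are synthetic.
import pickle
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
n = 200
train_df = pd.DataFrame({
    "Stime": rng.integers(0, 1_000, n),
    "Ltime": rng.integers(1_000, 2_000, n),
    "sbytes": rng.integers(0, 10_000, n),
    "dbytes": rng.integers(0, 10_000, n),
    "Spkts": rng.integers(1, 100, n),
    "Dpkts": rng.integers(1, 100, n),
    "proto": rng.choice(["tcp", "udp", "arp"], n),  # assumed categorical column
    "label": rng.integers(0, 2, n),
})

# 1) Set of columns flagged for dropping (assumed here, e.g. highly correlated ones).
features_to_drop = {"Stime", "Ltime"}
with open("features_to_drop.pkl", "wb") as f:
    pickle.dump(features_to_drop, f)

# 2) Categories seen during training, keyed by column name, so the app can
#    rebuild identical integer codes with pd.Categorical(...).cat.codes.
category_encodings = {
    "proto": list(pd.Categorical(train_df["proto"].astype(str)).categories)
}
with open("category_encodings.pkl", "wb") as f:
    pickle.dump(category_encodings, f)

# 3) A trained XGBoost model, saved with joblib as app.py expects.
X = train_df.drop(columns=["label"]).assign(
    duration=train_df["Ltime"] - train_df["Stime"],
    byte_ratio=train_df["sbytes"] / (train_df["dbytes"] + 1),
    pkt_ratio=train_df["Spkts"] / (train_df["Dpkts"] + 1),
)
X = X.drop(columns=list(features_to_drop))
X["proto"] = pd.Categorical(X["proto"], categories=category_encodings["proto"]).codes
model = xgb.XGBClassifier(n_estimators=10, max_depth=3)
model.fit(X, train_df["label"])
joblib.dump(model, "xgb_model.pkl")

In practice these artifacts come from the real training pipeline, and the feature columns (and their order) must match what preprocess_input produces for the uploaded CSV. Once they are in place next to app.py, the app can be started locally with `streamlit run app.py`.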