MrUtakata committed on
Commit
9b77caa
·
verified ·
1 Parent(s): c52ee84

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import pickle
5
+ import joblib
6
+ import xgboost as xgb
7
+
8
+ # --- Helper Functions ---
9
+
10
# st.cache was deprecated in Streamlit 1.18 and removed in 1.36; st.cache_resource
# is the recommended cache for unserializable resources such as ML models and
# makes allow_output_mutation unnecessary.
@st.cache_resource
def load_artifacts():
    """Load and cache the artifacts saved at training time.

    Returns:
        tuple: ``(features_to_drop, category_encodings, xgb_model)`` where
            - ``features_to_drop`` (set of str): columns to drop before prediction.
            - ``category_encodings`` (dict): per-column category lists used to
              reproduce the training-time categorical encoding.
            - ``xgb_model``: the trained XGBoost model loaded via joblib.

    Raises:
        FileNotFoundError: if any of the artifact files is missing.

    NOTE(review): ``pickle.load`` / ``joblib.load`` execute arbitrary code from
    the artifact files — deploy only with trusted artifacts.
    """
    with open("features_to_drop.pkl", "rb") as f:
        features_to_drop = pickle.load(f)
    with open("category_encodings.pkl", "rb") as f:
        category_encodings = pickle.load(f)
    xgb_model = joblib.load("xgb_model.pkl")
    return features_to_drop, category_encodings, xgb_model
24
+
25
def preprocess_input(df, features_to_drop, category_encodings):
    """Apply the training-time preprocessing pipeline to a raw dataframe.

    Mirrors the steps performed when the model was fit:
      1. coerce the required raw columns to numeric,
      2. derive 'duration', 'byte_ratio' and 'pkt_ratio',
      3. drop the highly correlated columns listed in *features_to_drop*,
      4. encode categorical columns to integer codes via *category_encodings*
         (categories unseen at training time become -1),
      5. fill any remaining missing values with 0.

    Returns the transformed copy of *df*, or None (after reporting a
    Streamlit error) when a required raw column is missing.
    """
    frame = df.copy()

    # The raw columns the engineered features are built from.
    required = ('Ltime', 'Stime', 'sbytes', 'dbytes', 'Spkts', 'Dpkts')
    for name in required:
        if name not in frame.columns:
            st.error(f"Column '{name}' not found in the input data.")
            return None
        frame[name] = pd.to_numeric(frame[name], errors='coerce')

    # Engineered features expected by the model.
    frame["duration"] = frame["Ltime"] - frame["Stime"]
    frame["byte_ratio"] = frame["sbytes"] / (frame["dbytes"] + 1)
    frame["pkt_ratio"] = frame["Spkts"] / (frame["Dpkts"] + 1)

    # Remove the columns flagged as highly correlated during training.
    present_drops = [c for c in frame.columns if c in features_to_drop]
    if present_drops:
        frame = frame.drop(columns=present_drops)

    # Re-apply the training-time categorical encoding.
    for column, known_categories in category_encodings.items():
        if column not in frame.columns:
            continue
        # Cast to str first so conversion to Categorical behaves consistently;
        # values outside known_categories receive code -1.
        as_text = frame[column].astype(str)
        frame[column] = pd.Categorical(as_text, categories=known_categories).codes

    # NaNs left over from the numeric coercion default to 0.
    return frame.fillna(0)
69
+
70
# --- Load Artifacts ---
# Loaded once and cached by load_artifacts(); `model` is the trained XGBoost
# estimator used for predictions below.
features_to_drop, category_encodings, model = load_artifacts()

# --- Streamlit Interface ---
st.title("XGBoost Prediction App")
st.markdown(
    """
    This app allows you to upload a CSV file of network data and then performs the same preprocessing steps
    used during training (feature engineering, dropping of highly correlated features, categorical encoding),
    and then applies a trained XGBoost model to generate predictions.
    """
)

st.header("Upload Input CSV")
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])

if uploaded_file is not None:
    try:
        # Read the CSV data
        input_df = pd.read_csv(uploaded_file)
        st.subheader("Raw Input Data")
        st.dataframe(input_df.head())

        # Preprocess the data to create model features; returns None when a
        # required raw column is missing (handled in the else branch below).
        preprocessed_df = preprocess_input(input_df, features_to_drop, category_encodings)
        if preprocessed_df is not None:
            st.subheader("Preprocessed Data")
            st.dataframe(preprocessed_df.head())

            # Predict using the loaded XGBoost model.
            # NOTE(review): assumes the preprocessed columns match the names/order
            # the model was trained with — verify against the training pipeline.
            predictions = model.predict(preprocessed_df)
            # If your model is trained for multiclass classification, the predictions might be encoded labels.
            st.subheader("Predictions")
            st.write(predictions)
        else:
            st.error("Preprocessing failed. Please check the input data columns.")
    except Exception as e:
        # Boundary handler: surface read/predict failures in the UI instead of
        # crashing the app.
        st.error(f"Error processing file: {e}")
else:
    st.info("Awaiting CSV file upload.")