Spaces:
Sleeping
Sleeping
File size: 8,408 Bytes
35b93a0 7ec9a06 35b93a0 10d2035 35b93a0 10d2035 35b93a0 efea443 6fff34e efea443 9449905 efea443 9449905 efea443 3202eb8 efea443 773c904 efea443 35b93a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import base64
import pickle
import numpy as np
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Example model
from sklearn.preprocessing import StandardScaler
# Page header for the Streamlit app.
st.title('ITI105 Team Project')
st.subheader('Machine Learning Project for Phishing web site prediction App')

# One-time flag in session state: marks that previously rendered
# prediction output should be cleared on the next rerun.
if 'clear_output' not in st.session_state:
    st.session_state.clear_output = False


def clear_previous_output():
    """Request that previous prediction output be cleared on the next rerun."""
    st.session_state.clear_output = True
# Load the pre-uploaded sample dataset, used as a fallback when the
# user does not upload their own CSV file.
default_file_path = 'https://raw.githubusercontent.com/JimmyYehtut/ITI105Files/main/test_dataset.csv'
df_new = pd.read_csv(default_file_path)

# Upload the CSV file
uploaded_file = st.file_uploader("Choose a CSV file with website data", type="csv")


def _render_selection_ui(df):
    """Render the URL/model selectors and the selected row's feature table.

    The uploaded and pre-loaded code paths previously duplicated this
    logic verbatim; it is shared here instead.

    Args:
        df: DataFrame whose first column is 'url' and last column is the
            target label, with feature columns in between.

    Returns:
        Tuple (row_index, features_df, selected_url, selected_model).
        row_index and features_df are None when df has too few columns
        (it needs at least the url column, one feature and the target).
    """
    # Display the dropdown with URL options
    selected_url = st.selectbox("Select URL for Prediction", df['url'].tolist())
    # Display the list of models (index=4 pre-selects Random Forest)
    selected_model = st.selectbox(
        "Select Model for Prediction",
        ['Logistic Regression', 'Decision Tree', 'KNN', 'XGBoost', 'Random Forest', 'SVM'],
        index=4,
    )

    # Remove the first (non-numeric url) and last (target) columns
    if df.shape[1] > 2:  # Ensure there are enough columns to remove
        features_df = df.iloc[:, 1:-1]
        row_index = df[df['url'] == selected_url].index[0]
        # Display the selected row's features in a table
        st.subheader("List of selected website features:")
        st.table(df.iloc[row_index, :].to_frame().T)
        return row_index, features_df, selected_url, selected_model

    st.write("The dataset does not have enough columns after removing the first and last columns.")
    return None, None, selected_url, selected_model


if uploaded_file is not None:
    # Read the user-supplied CSV file
    df = pd.read_csv(uploaded_file)
else:
    st.write("Using pre-uploaded sample data:")
    df = df_new

# Assign explicitly at module level; the Predict handler below reads these.
row_index, features_df, selected_url, selected_model = _render_selection_ui(df)
# Maps each UI model choice to its pickled model artifact on disk.
_MODEL_FILES = {
    'Logistic Regression': 'best_logreg_model.pkl',
    'Decision Tree': 'decision_tree_model.pkl',
    'KNN': 'knn_model.pkl',
    'XGBoost': 'xgboost_model.pkl',
    'Random Forest': 'best_rf_model.pkl',
    'SVM': 'best_svm_model.pkl',
}

if st.button("Predict"):
    # Clear previous st.success, st.error, and st.markdown elements
    clear_previous_output()

    if row_index is not None:
        # Read the result images once, inlined below as base64 data URLs.
        # (Only read when a prediction will actually run.)
        with open("It'ok.webp", "rb") as image_file:
            data_url_ok = base64.b64encode(image_file.read()).decode("utf-8")
        with open("Warning.gif", "rb") as image_file:
            data_url_warning = base64.b64encode(image_file.read()).decode("utf-8")

        # Feature vector of the row the user picked (url/target excluded).
        input_values = features_df.iloc[row_index].values
        single_sample = np.array(input_values)

        # Show progress spinner while making predictions
        with st.spinner('Making prediction...'):
            try:
                # NOTE(review): pickle.load is only safe on trusted,
                # locally produced artifacts — never on user uploads.
                # Load the pre-trained scaler and scale the new data.
                with open('scaler.pkl', 'rb') as f:
                    scaler = pickle.load(f)
                X_new_scaled = scaler.transform(single_sample.reshape(1, -1))

                # Load the selected model via the filename lookup table.
                with open(_MODEL_FILES[selected_model], 'rb') as f:
                    model = pickle.load(f)

                # Make predictions
                prediction = model.predict(X_new_scaled)
                y_pred_proba = model.predict_proba(X_new_scaled)

                # Display the prediction using st.write
                st.write(f"Prediction for the selected URL ({selected_url}): **{prediction[0]}**")

                # Display the prediction result. Models may label the
                # legitimate class as 0 or the string "legitimate".
                if prediction[0] == 0 or prediction[0] == "legitimate":
                    st.success("The website is not a phishing website.")
                    st.markdown(f'<img src="data:image/gif;base64,{data_url_ok}" alt="cat gif">', unsafe_allow_html=True,)
                else:
                    st.error("The website is a phishing website.")
                    st.markdown(f'<img src="data:image/gif;base64,{data_url_warning}" alt="cat gif">', unsafe_allow_html=True,)

                # Visualize prediction confidence scores as a bar chart
                st.write("Prediction Confidence Scores:")
                plt.figure(figsize=(8, 4))
                sns.barplot(x=model.classes_, y=y_pred_proba[0])
                plt.title("Prediction Confidence Scores")
                plt.xlabel("Class")
                plt.ylabel("Probability")
                st.pyplot(plt)
            except FileNotFoundError as e:
                st.error(f"Model file for {selected_model} not found: {str(e)}")
            except Exception as e:
                st.error(f"An error occurred while loading the model: {str(e)}")
    else:
        st.error("ERROR!!! Please provide web site information for prediction !!!")
# Consume the clear-output request set by the Predict button: reset the
# flag so it does not persist across subsequent reruns.
if st.session_state.clear_output:
    st.session_state.clear_output = False
|