import base64
import pickle
import numpy as np
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Example model
from sklearn.preprocessing import StandardScaler
# --- Page header ---
st.title('ITI105 Team Project')
st.subheader('Machine Learning Project for Phishing web site prediction App')

# Session flag used to signal that previously rendered output should be cleared
# on the next rerun (Streamlit re-executes the whole script per interaction).
if 'clear_output' not in st.session_state:
    st.session_state['clear_output'] = False
def clear_previous_output():
    """Mark the session so stale success/error/markdown output is cleared on rerun."""
    st.session_state['clear_output'] = True
# Fallback sample dataset, fetched from GitHub, used when the user uploads nothing.
default_file_path = 'https://raw.githubusercontent.com/JimmyYehtut/ITI105Files/main/test_dataset.csv'
df_new = pd.read_csv(default_file_path)  # pandas reads directly from the URL
# Available model choices; index=4 preselects 'Random Forest' in the dropdown.
MODEL_OPTIONS = ['Logistic Regression', 'Decision Tree', 'KNN', 'XGBoost', 'Random Forest', 'SVM']


def _render_selection_widgets(df):
    """Render the URL/model dropdowns and the selected row's feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Website dataset; must contain a 'url' column. Layout is assumed to be
        [url, feature..., target] — first and last columns are stripped to get
        the feature matrix (TODO confirm against the training pipeline).

    Returns
    -------
    tuple(selected_url, selected_model, features_df, row_index)
        features_df and row_index are None when df has too few columns.
    """
    url_list = df['url'].tolist()
    selected_url = st.selectbox("Select URL for Prediction", url_list)
    selected_model = st.selectbox("Select Model for Prediction", MODEL_OPTIONS, index=4)

    features_df = None
    row_index = None
    if df.shape[1] > 2:  # Ensure there are enough columns to remove first/last
        features_df = df.iloc[:, 1:-1]  # drop non-numeric url col and target col
        # NOTE(review): .index[0] yields an index *label*, later used positionally
        # with .iloc — equivalent only for the default RangeIndex of read_csv.
        row_index = df[df['url'] == selected_url].index[0]
        selected_row = df.iloc[row_index, :]
        st.subheader("List of selected website features:")
        st.table(selected_row.to_frame().T)
    else:
        st.write("The dataset does not have enough columns after removing the first and last columns.")
    return selected_url, selected_model, features_df, row_index


# Upload the CSV file; fall back to the pre-loaded sample data if none given.
uploaded_file = st.file_uploader("Choose a CSV file with website data", type="csv")
row_index = None
features_df = None
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
else:
    st.write("Using pre-uploaded sample data:")
    df = df_new

selected_url, selected_model, features_df, row_index = _render_selection_widgets(df)
# Map each UI model name to its pickled model file on disk.
MODEL_FILES = {
    'Logistic Regression': 'best_logreg_model.pkl',
    'Decision Tree': 'decision_tree_model.pkl',
    'KNN': 'knn_model.pkl',
    'XGBoost': 'xgboost_model.pkl',
    'Random Forest': 'best_rf_model.pkl',
    'SVM': 'best_svm_model.pkl',
}


def _file_to_base64(path):
    """Return the contents of *path* base64-encoded as a UTF-8 string.

    Used to embed result images inline in HTML via a data: URL.
    The file handle is closed via the context manager (original code leaked
    the handle on a read error).
    """
    with open(path, "rb") as fh:
        return base64.b64encode(fh.read()).decode("utf-8")


if st.button("Predict"):
    # Clear previous st.success / st.error / st.markdown elements.
    clear_previous_output()

    # Pre-encode the result images (shown inline depending on the verdict).
    data_url_ok = _file_to_base64("It'ok.webp")
    data_url_warning = _file_to_base64("Warning.gif")

    if row_index is not None:
        # Feature vector for the selected row (url/target columns already dropped).
        input_values = features_df.iloc[row_index].values
        single_sample = np.array(input_values)

        with st.spinner('Making prediction...'):
            try:
                # Load the pre-trained scaler and scale the single sample.
                with open('scaler.pkl', 'rb') as f:
                    scaler = pickle.load(f)
                X_new_scaled = scaler.transform(single_sample.reshape(1, -1))

                # Load the selected model. An unknown model name raises KeyError,
                # which falls into the generic handler below (the original code
                # similarly failed on an undefined `model` inside this try).
                # NOTE(review): pickle.load on local artifact files only — do not
                # point these paths at untrusted input.
                with open(MODEL_FILES[selected_model], 'rb') as f:
                    model = pickle.load(f)

                # Make predictions.
                prediction = model.predict(X_new_scaled)
                y_pred_proba = model.predict_proba(X_new_scaled)

                st.write(f"Prediction for the selected URL ({selected_url}): **{prediction[0]}**")

                # Display the verdict with an inline image (class 0 / "legitimate"
                # means the site is not phishing).
                if prediction[0] == 0 or prediction[0] == "legitimate":
                    st.success("The website is not a phishing website.")
                    st.markdown(
                        f'<img src="data:image/webp;base64,{data_url_ok}" alt="ok">',
                        unsafe_allow_html=True,
                    )
                else:
                    st.error("The website is a phishing website.")
                    st.markdown(
                        f'<img src="data:image/gif;base64,{data_url_warning}" alt="warning">',
                        unsafe_allow_html=True,
                    )

                # Visualize prediction confidence scores as a bar chart.
                st.write("Prediction Confidence Scores:")
                class_names = model.classes_
                plt.figure(figsize=(8, 4))
                sns.barplot(x=class_names, y=y_pred_proba[0])
                plt.title("Prediction Confidence Scores")
                plt.xlabel("Class")
                plt.ylabel("Probability")
                st.pyplot(plt)
            except FileNotFoundError as e:
                st.error(f"Model file for {selected_model} not found: {str(e)}")
            except Exception as e:
                st.error(f"An error occurred while loading the model: {str(e)}")
    else:
        st.error("ERROR!!! Please provide web site information for prediction !!!")
# Reset the clear-output flag once a prediction pass has consumed it, so the
# next script rerun starts from a clean state.
if st.session_state['clear_output']:
    st.session_state['clear_output'] = False