# YEHTUT's picture
# Update app.py
# 10d2035 verified
import base64
import pickle
import numpy as np
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Example model
from sklearn.preprocessing import StandardScaler
# Page header: project title plus a one-line description of the app.
st.title('ITI105 Team Project')
st.subheader('Machine Learning Project for Phishing web site prediction App')

# Make sure the "clear previous output" flag exists before anything reads it;
# session state survives the script reruns Streamlit triggers on each interaction.
if 'clear_output' not in st.session_state:
    st.session_state['clear_output'] = False
def clear_previous_output():
    """Raise the session-level ``clear_output`` flag.

    The flag is only recorded here; whatever reacts to it lives elsewhere in
    the script.
    """
    st.session_state['clear_output'] = True
# Location of the bundled sample dataset, used when the user uploads nothing.
default_file_path = 'https://raw.githubusercontent.com/JimmyYehtut/ITI105Files/main/test_dataset.csv'


def _load_dataset(uploaded):
    """Return the DataFrame to run predictions on.

    Args:
        uploaded: The value returned by ``st.file_uploader`` (``None`` when
            the user has not uploaded anything).

    Returns:
        The uploaded CSV parsed with ``pd.read_csv``; otherwise the sample
        dataset at ``default_file_path``. The sample is fetched only in the
        fallback case, so a network problem cannot break the upload path.
    """
    if uploaded is not None:
        return pd.read_csv(uploaded)
    st.write("Using pre-uploaded sample data:")
    return pd.read_csv(default_file_path)


def _render_selection(df):
    """Render the URL/model pickers and the feature table of the chosen row.

    Args:
        df: Dataset whose ``url`` column feeds the URL dropdown. The first and
            last columns are dropped to obtain the feature frame.

    Returns:
        A tuple ``(selected_url, selected_model, features_df, row_index)``.
        ``row_index`` is the *positional* index of the first row whose ``url``
        equals the selection, or ``None`` when no row can be selected.
        ``features_df`` is ``None`` when the dataset has too few columns.
    """
    if 'url' not in df.columns:
        st.error("The dataset must contain a 'url' column.")
        return None, None, None, None

    # Display the dropdown with URL options
    url_list = df['url'].tolist()
    selected_url = st.selectbox("Select URL for Prediction", url_list)
    # Display the list of models ('Random Forest' is preselected)
    selected_model = st.selectbox(
        "Select Model for Prediction",
        ['Logistic Regression', 'Decision Tree', 'KNN', 'XGBoost', 'Random Forest', 'SVM'],
        index=4,
    )

    # Ensure there are enough columns to remove
    if df.shape[1] <= 2:
        st.write("The dataset does not have enough columns after removing the first and last columns.")
        return selected_url, selected_model, None, None

    # Drop the first and last columns to keep only the features
    features_df = df.iloc[:, 1:-1]

    # Positional lookup: the result is consumed with .iloc, so a label taken
    # from df.index would only be correct for a default RangeIndex.
    matches = np.flatnonzero((df['url'] == selected_url).to_numpy())
    if matches.size == 0:
        # Empty dataset or a value that never compares equal (e.g. NaN).
        return selected_url, selected_model, features_df, None
    row_index = int(matches[0])

    # Display the selected row's features in a table
    selected_row = df.iloc[row_index, :]
    st.subheader("List of selected website features:")
    st.table(selected_row.to_frame().T)
    return selected_url, selected_model, features_df, row_index


# Upload the CSV file
uploaded_file = st.file_uploader("Choose a CSV file with website data", type="csv")

try:
    df = _load_dataset(uploaded_file)
except (OSError, ValueError) as exc:
    # OSError covers network/file failures, ValueError covers pandas parse errors.
    st.error(f"Could not read the dataset: {exc}")
    st.stop()

selected_url, selected_model, features_df, row_index = _render_selection(df)
# Pickled estimator shipped with the app for each entry of the model dropdown.
_MODEL_FILES = {
    'Logistic Regression': 'best_logreg_model.pkl',
    'Decision Tree': 'decision_tree_model.pkl',
    'KNN': 'knn_model.pkl',
    'XGBoost': 'xgboost_model.pkl',
    'Random Forest': 'best_rf_model.pkl',
    'SVM': 'best_svm_model.pkl',
}


def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    NOTE(security): ``pickle.load`` can execute arbitrary code. Only use this
    on files shipped with the app, never on user-supplied data.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _show_status_image(path, mime):
    """Embed the image at *path* in the page as a base64 ``data:`` URL.

    A missing or unreadable image produces a warning instead of aborting the
    prediction output, since the picture is purely decorative.
    """
    try:
        with open(path, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode("utf-8")
    except OSError as exc:
        st.warning(f"Status image could not be loaded: {exc}")
        return
    st.markdown(
        f'<img src="data:{mime};base64,{encoded}" alt="cat gif">',
        unsafe_allow_html=True,
    )


def _show_confidence_chart(model, X_scaled):
    """Plot the per-class probabilities for *X_scaled*, if the model offers them."""
    if not hasattr(model, 'predict_proba'):
        # Some estimators do not expose probability estimates.
        st.info("The selected model does not provide prediction confidence scores.")
        return
    y_pred_proba = model.predict_proba(X_scaled)
    st.write("Prediction Confidence Scores:")
    # Use an explicit figure and close it afterwards so reruns do not
    # accumulate open figures in pyplot's global registry.
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(x=model.classes_, y=y_pred_proba[0], ax=ax)
    ax.set_title("Prediction Confidence Scores")
    ax.set_xlabel("Class")
    ax.set_ylabel("Probability")
    st.pyplot(fig)
    plt.close(fig)


if st.button("Predict"):
    # Flag that previous st.success, st.error, and st.markdown output is stale
    clear_previous_output()

    if row_index is not None:
        # Feature vector of the selected row, as a 1-D array
        single_sample = np.asarray(features_df.iloc[row_index].values)

        # Show progress spinner while making predictions
        with st.spinner('Making prediction...'):
            try:
                # Scale the sample with the pre-trained scaler
                scaler = _load_pickle('scaler.pkl')
                X_new_scaled = scaler.transform(single_sample.reshape(1, -1))

                # Load the selected model and predict
                model = _load_pickle(_MODEL_FILES[selected_model])
                prediction = model.predict(X_new_scaled)

                st.write(f"Prediction for the selected URL ({selected_url}): **{prediction[0]}**")

                # Label 0 / "legitimate" means the site is safe
                if prediction[0] == 0 or prediction[0] == "legitimate":
                    st.success("The website is not a phishing website.")
                    _show_status_image("It'ok.webp", "image/webp")
                else:
                    st.error("The website is a phishing website.")
                    _show_status_image("Warning.gif", "image/gif")

                _show_confidence_chart(model, X_new_scaled)
            except FileNotFoundError as e:
                st.error(f"Model file for {selected_model} not found: {str(e)}")
            except Exception as e:
                # Top-level UI boundary: report the failure instead of a traceback.
                st.error(f"An error occurred while loading the model: {str(e)}")
    else:
        st.error("ERROR!!! Please provide web site information for prediction !!!")
# Lower the "clear output" flag again once it has been raised by the Predict
# button. Nothing is cleared explicitly here: only the flag itself is reset.
if st.session_state['clear_output']:
    st.session_state['clear_output'] = False