Spaces:

YEHTUT
/

PhishingWebDetector

Sleeping

App Files Files Community

PhishingWebDetector / app.py

YEHTUT

Update app.py

10d2035 verified over 1 year ago

raw

history blame contribute delete

8.41 kB

	import base64
	import pickle
	import numpy as np
	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestClassifier # Example model
	from sklearn.preprocessing import StandardScaler
	# Page heading: project title followed by a one-line description.
	st.title('ITI105 Team Project')
	st.subheader('Machine Learning Project for Phishing web site prediction App')

	# Create the 'clear_output' flag on the first run of a session so later
	# reads of it never fail; on subsequent reruns the stored value is kept.
	if 'clear_output' not in st.session_state:
	    st.session_state['clear_output'] = False

	# Raise the session flag that marks the previous output as stale.
	def clear_previous_output():
	    """Set the ``clear_output`` flag in Streamlit's session state to True.

	    The function only flips the flag; it does not remove any element from
	    the page itself.
	    """
	    st.session_state['clear_output'] = True

	# Location of the sample dataset used when the user uploads nothing.
	default_file_path = 'https://raw.githubusercontent.com/JimmyYehtut/ITI105Files/main/test_dataset.csv'

	# Models offered in the dropdown; index 4 ('Random Forest') is the default.
	MODEL_CHOICES = ['Logistic Regression', 'Decision Tree', 'KNN', 'XGBoost', 'Random Forest', 'SVM']


	def _read_dataset(source):
	    """Read a CSV from *source* (path, URL or uploaded file) into a DataFrame.

	    Returns the DataFrame, or None after showing an error in the page when
	    the source cannot be fetched or parsed. ``OSError`` covers file and
	    network failures; ``ValueError`` covers pandas' empty-data and parser
	    errors, which derive from it.
	    """
	    try:
	        return pd.read_csv(source)
	    except (OSError, ValueError) as exc:
	        st.error(f"Could not read the dataset: {exc}")
	        return None


	def _choose_prediction_inputs(df):
	    """Render the URL/model dropdowns and the selected row for *df*.

	    Returns a tuple ``(selected_url, selected_model, features_df, row_index)``.
	    ``features_df`` is *df* without its first and last columns, and
	    ``row_index`` is the position of the first row whose 'url' equals the
	    chosen URL. ``row_index`` is None whenever no row can be used for a
	    prediction (missing 'url' column, too few columns, or no matching row).
	    """
	    if 'url' not in df.columns:
	        st.error("The dataset must contain a 'url' column.")
	        return None, None, None, None

	    # Extract the URL column to display in the dropdown
	    url_list = df['url'].tolist()

	    # Display the dropdown with URL options
	    selected_url = st.selectbox("Select URL for Prediction", url_list)

	    # Display the list of models
	    selected_model = st.selectbox("Select Model for Prediction", MODEL_CHOICES, index=4)

	    # The first (non-numeric) and last (target) columns are dropped, so at
	    # least three columns are needed for any feature to remain.
	    if df.shape[1] <= 2:
	        st.write("The dataset does not have enough columns after removing the first and last columns.")
	        return selected_url, selected_model, None, None

	    features_df = df.iloc[:, 1:-1]

	    # Use a positional index: the value is consumed with ``iloc`` below and
	    # by the prediction step, so an index label would be wrong for any
	    # DataFrame whose index is not the default 0..n-1 range.
	    matches = np.flatnonzero((df['url'] == selected_url).to_numpy())
	    if matches.size == 0:
	        st.write("No row matches the selected URL.")
	        return selected_url, selected_model, features_df, None
	    row_index = int(matches[0])

	    # Display the selected row's features in a table
	    selected_row = df.iloc[row_index, :]
	    st.subheader("List of selected website features:")
	    st.table(selected_row.to_frame().T)
	    return selected_url, selected_model, features_df, row_index


	# Upload the CSV file
	uploaded_file = st.file_uploader("Choose a CSV file with website data", type="csv")

	# Bind every name the prediction step reads, so it can rely on
	# ``row_index is None`` meaning "nothing to predict".
	row_index = None
	features_df = None
	selected_url = None
	selected_model = None
	df_new = None

	if uploaded_file is not None:
	    df = _read_dataset(uploaded_file)
	else:
	    st.write("Using pre-uploaded sample data:")
	    # Fetch the sample dataset only when it is actually needed, so an
	    # uploaded file does not trigger (or depend on) a network request.
	    df_new = _read_dataset(default_file_path)
	    df = df_new

	if df is not None:
	    selected_url, selected_model, features_df, row_index = _choose_prediction_inputs(df)

	# Pickle file that stores each model offered in the dropdown.
	MODEL_FILES = {
	    'Logistic Regression': 'best_logreg_model.pkl',
	    'Decision Tree': 'decision_tree_model.pkl',
	    'KNN': 'knn_model.pkl',
	    'XGBoost': 'xgboost_model.pkl',
	    'Random Forest': 'best_rf_model.pkl',
	    'SVM': 'best_svm_model.pkl',
	}


	def _load_pickle(path):
	    """Return the object stored in the pickle file at *path*.

	    NOTE: unpickling can execute arbitrary code. This must only be used on
	    files deployed with the app, never on user-supplied data.
	    """
	    with open(path, 'rb') as fh:
	        return pickle.load(fh)


	def _show_result_image(path, mime_type):
	    """Embed the image at *path* in the page as a base64 data URL.

	    The image is decorative, so a missing or unreadable file produces a
	    warning instead of aborting the prediction output.
	    """
	    try:
	        with open(path, "rb") as fh:
	            encoded = base64.b64encode(fh.read()).decode("utf-8")
	    except OSError as exc:
	        st.warning(f"Could not load result image: {exc}")
	        return
	    st.markdown(f'<img src="data:{mime_type};base64,{encoded}" alt="cat gif">', unsafe_allow_html=True,)


	if st.button("Predict"):

	    # Mark the previous output as stale
	    clear_previous_output()

	    if row_index is not None:
	        # Feature values of the selected row, as a 1-D array
	        single_sample = np.array(features_df.iloc[row_index].values)

	        # Show progress spinner while making predictions
	        with st.spinner('Making prediction...'):
	            try:
	                # Load the pre-trained scaler and scale the single sample;
	                # reshape(1, -1) turns it into a one-row 2-D array.
	                scaler = _load_pickle('scaler.pkl')
	                X_new_scaled = scaler.transform(single_sample.reshape(1, -1))

	                # Load the selected model
	                model = _load_pickle(MODEL_FILES[selected_model])

	                # Make predictions
	                prediction = model.predict(X_new_scaled)
	                st.write(f"Prediction for the selected URL ({selected_url}): {prediction[0]}")

	                # Display the prediction result; the label may be numeric
	                # (0) or textual ("legitimate").
	                if prediction[0] == 0 or prediction[0] == "legitimate":
	                    st.success("The website is not a phishing website.")
	                    _show_result_image("It'ok.webp", "image/webp")
	                else:
	                    st.error("The website is a phishing website.")
	                    _show_result_image("Warning.gif", "image/gif")

	                # Not every estimator exposes predict_proba (e.g. an SVC
	                # fitted without probability=True); in that case the
	                # prediction above is still valid and only the chart is
	                # skipped.
	                if hasattr(model, "predict_proba"):
	                    y_pred_proba = model.predict_proba(X_new_scaled)

	                    # Visualize prediction confidence scores as a bar chart
	                    st.write("Prediction Confidence Scores:")
	                    fig, ax = plt.subplots(figsize=(8, 4))
	                    sns.barplot(x=model.classes_, y=y_pred_proba[0], ax=ax)
	                    ax.set_title("Prediction Confidence Scores")
	                    ax.set_xlabel("Class")
	                    ax.set_ylabel("Probability")
	                    st.pyplot(fig)
	                    # Close the figure so reruns do not accumulate figures
	                    plt.close(fig)
	                else:
	                    st.info("The selected model does not provide confidence scores.")

	            except FileNotFoundError as e:
	                st.error(f"Model file for {selected_model} not found: {str(e)}")
	            except Exception as e:
	                # Top-level boundary of the UI handler: report instead of
	                # crashing the page.
	                st.error(f"An error occurred while loading the model or making the prediction: {str(e)}")

	    else:
	        st.error("ERROR!!! Please provide web site information for prediction !!!")

	# Lower the 'clear_output' flag again once it has been raised by the
	# Predict handler. Only the flag is reset here; no page element is
	# removed by this block.
	if st.session_state['clear_output']:
	    st.session_state['clear_output'] = False