# Hugging Face Spaces page header (scrape residue, not program code): "Spaces: Sleeping"
import base64
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Page heading for the phishing-website prediction app.
st.title('ITI105 Team Project')
st.subheader('Machine Learning Project for Phishing web site prediction App')

# One-shot flag: set when "Predict" runs so stale result widgets can be
# cleared on the next rerun (see the block at the bottom of the script).
if 'clear_output' not in st.session_state:
    st.session_state.clear_output = False
def clear_previous_output():
    """Mark the session so previously rendered prediction output is cleared.

    Only sets the ``clear_output`` flag in ``st.session_state``; the flag is
    consumed (reset) by the block at the end of the script on the next rerun.
    """
    st.session_state.clear_output = True
# Pre-uploaded sample dataset, used as a fallback when the visitor does not
# upload their own CSV file.
default_file_path = 'https://raw.githubusercontent.com/JimmyYehtut/ITI105Files/main/test_dataset.csv'
df_new = pd.read_csv(default_file_path)
def _select_inputs(df):
    """Render the URL/model pickers for *df* and show the chosen row.

    Returns a tuple ``(selected_url, selected_model, features_df, row_index)``.
    ``features_df`` (all columns except the first ``url`` column and the last
    target column) and ``row_index`` are ``None`` when the frame does not have
    enough columns to strip the first and last ones.
    """
    # The first column is assumed to be 'url' (the dataset schema used here).
    url_list = df['url'].tolist()
    selected_url = st.selectbox("Select URL for Prediction", url_list)
    selected_model = st.selectbox(
        "Select Model for Prediction",
        ['Logistic Regression', 'Decision Tree', 'KNN', 'XGBoost', 'Random Forest', 'SVM'],
        index=4,  # default to Random Forest
    )
    features_df = None
    row_index = None
    if df.shape[1] > 2:  # enough columns to drop the first (url) and last (target)
        features_df = df.iloc[:, 1:-1]
        # Positional row of the chosen URL (read_csv gives a RangeIndex, so
        # label index == positional index here).
        row_index = df[df['url'] == selected_url].index[0]
        selected_row = df.iloc[row_index, :]
        st.subheader("List of selected website features:")
        st.table(selected_row.to_frame().T)
    else:
        st.write("The dataset does not have enough columns after removing the first and last columns.")
    return selected_url, selected_model, features_df, row_index


# Let the visitor upload their own CSV; otherwise fall back to the sample data.
uploaded_file = st.file_uploader("Choose a CSV file with website data", type="csv")
row_index = None
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
else:
    st.write("Using pre-uploaded sample data:")
    df = df_new

# Single code path for both sources (previously duplicated verbatim in each branch).
selected_url, selected_model, features_df, row_index = _select_inputs(df)
# Filename of the pickled estimator for each selectable model name.
_MODEL_FILES = {
    'Logistic Regression': 'best_logreg_model.pkl',
    'Decision Tree': 'decision_tree_model.pkl',
    'KNN': 'knn_model.pkl',
    'XGBoost': 'xgboost_model.pkl',
    'Random Forest': 'best_rf_model.pkl',
    'SVM': 'best_svm_model.pkl',
}


def _image_data_url(path):
    """Read a local image file and return its base64-encoded contents.

    Uses a context manager so the file handle is closed even if reading fails
    (the original open/read/close sequence leaked the handle on error).
    """
    with open(path, "rb") as fh:
        return base64.b64encode(fh.read()).decode("utf-8")


if st.button("Predict"):
    # Clear previous st.success / st.error / st.markdown output on rerun.
    clear_previous_output()
    # Result images shown under the success / error message.
    data_url_ok = _image_data_url("It'ok.webp")
    data_url_warning = _image_data_url("Warning.gif")
    if row_index is not None:
        # Feature vector of the selected row (url and target columns excluded).
        input_values = features_df.iloc[row_index].values
        single_sample = np.array(input_values)
        # Show a progress spinner while making the prediction.
        with st.spinner('Making prediction...'):
            try:
                # NOTE(security): pickle.load executes arbitrary code from the
                # file — only load model/scaler files bundled with this app.
                with open('scaler.pkl', 'rb') as f:
                    scaler = pickle.load(f)
                # Scale the sample with the scaler fitted at training time.
                X_new_scaled = scaler.transform(single_sample.reshape(1, -1))
                # Load the selected model (dict lookup replaces a six-way
                # if/elif chain and cannot leave `model` undefined).
                with open(_MODEL_FILES[selected_model], 'rb') as f:
                    model = pickle.load(f)
                # Make predictions.
                prediction = model.predict(X_new_scaled)
                y_pred_proba = model.predict_proba(X_new_scaled)
                st.write(f"Prediction for the selected URL ({selected_url}): **{prediction[0]}**")
                # Models may encode the negative class as 0 or "legitimate".
                if prediction[0] == 0 or prediction[0] == "legitimate":
                    st.success("The website is not a phishing website.")
                    st.markdown(f'<img src="data:image/gif;base64,{data_url_ok}" alt="cat gif">', unsafe_allow_html=True,)
                else:
                    st.error("The website is a phishing website.")
                    st.markdown(f'<img src="data:image/gif;base64,{data_url_warning}" alt="cat gif">', unsafe_allow_html=True,)
                # Visualize prediction confidence scores as a bar chart.
                st.write("Prediction Confidence Scores:")
                class_names = model.classes_
                plt.figure(figsize=(8, 4))
                sns.barplot(x=class_names, y=y_pred_proba[0])
                plt.title("Prediction Confidence Scores")
                plt.xlabel("Class")
                plt.ylabel("Probability")
                st.pyplot(plt)
            except FileNotFoundError as e:
                st.error(f"Model file for {selected_model} not found: {str(e)}")
            except Exception as e:
                st.error(f"An error occurred while loading the model: {str(e)}")
    else:
        st.error("ERROR!!! Please provide web site information for prediction !!!")

# This block clears the elements only if the prediction button is pressed.
if st.session_state.clear_output:
    st.session_state.clear_output = False