File size: 8,408 Bytes
35b93a0
 
 
 
 
7ec9a06
 
35b93a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10d2035
35b93a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10d2035
35b93a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efea443
 
 
 
 
 
 
 
 
6fff34e
efea443
 
 
 
 
 
 
 
 
 
 
9449905
efea443
 
9449905
efea443
 
 
 
 
3202eb8
 
efea443
773c904
efea443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35b93a0
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import base64
import pickle
import numpy as np
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.preprocessing import StandardScaler
# --- Page header ---------------------------------------------------------
st.title('ITI105 Team Project')
st.subheader('Machine Learning Project for Phishing web site prediction App')

# One-shot flag used to signal that previously rendered result widgets
# (success/error banners, images) should be cleared on the next rerun.
if 'clear_output' not in st.session_state:
    st.session_state['clear_output'] = False

def clear_previous_output():
    """Raise the session flag that marks earlier output widgets for clearing."""
    st.session_state['clear_output'] = True

# Pre-uploaded sample dataset, used as a fallback when the visitor does not
# upload their own CSV file.
default_file_path = (
    'https://raw.githubusercontent.com/JimmyYehtut/ITI105Files/main/test_dataset.csv'
)
df_new = pd.read_csv(default_file_path)


# Let the visitor upload a CSV of website features; fall back to the sample
# data otherwise. The two original branches duplicated the entire dropdown /
# feature-extraction flow verbatim — only the dataframe source differs, so
# that choice is made first and the shared logic runs once.
uploaded_file = st.file_uploader("Choose a CSV file with website data", type="csv")
row_index = None

if uploaded_file is not None:
    # Read the visitor-supplied CSV file.
    df = pd.read_csv(uploaded_file)
else:
    st.write("Using pre-uploaded sample data:")
    df = df_new

# Extract the URL column to populate the selection dropdown.
url_list = df['url'].tolist()
selected_url = st.selectbox("Select URL for Prediction", url_list)

# Model choice; index=4 pre-selects 'Random Forest'.
selected_model = st.selectbox(
    "Select Model for Prediction",
    ['Logistic Regression', 'Decision Tree', 'KNN', 'XGBoost', 'Random Forest', 'SVM'],
    index=4,
)

# Drop the first (url) and last (target) columns to obtain the feature matrix.
if df.shape[1] > 2:  # Ensure there are enough columns to remove
    features_df = df.iloc[:, 1:-1]

    # Locate the row matching the chosen URL and preview all of its columns.
    row_index = df[df['url'] == selected_url].index[0]
    selected_row = df.iloc[row_index, :]
    st.subheader("List of selected website features:")
    st.table(selected_row.to_frame().T)
else:
    st.write("The dataset does not have enough columns after removing the first and last columns.")

# Maps each dropdown model name to its pickled model file on disk.
_MODEL_FILES = {
    'Logistic Regression': 'best_logreg_model.pkl',
    'Decision Tree': 'decision_tree_model.pkl',
    'KNN': 'knn_model.pkl',
    'XGBoost': 'xgboost_model.pkl',
    'Random Forest': 'best_rf_model.pkl',
    'SVM': 'best_svm_model.pkl',
}

if st.button("Predict"):

    # Clear previous st.success, st.error, and st.markdown elements.
    clear_previous_output()

    # Embed the result images as base64 data URLs. `with` guarantees the
    # handles are closed even if read/encode raises (originals leaked on error).
    # NOTE(review): "It'ok.webp" is WebP but is embedded with a GIF MIME type
    # below; browsers usually sniff the real format — confirm it renders.
    with open("It'ok.webp", "rb") as img:
        data_url_ok = base64.b64encode(img.read()).decode("utf-8")
    with open("Warning.gif", "rb") as img:
        data_url_warning = base64.b64encode(img.read()).decode("utf-8")

    if row_index is not None:
        # Feature vector for the chosen URL (url/target columns already dropped).
        input_values = features_df.iloc[row_index].values
        single_sample = np.array(input_values)

        # Show progress spinner while making predictions.
        with st.spinner('Making prediction...'):
            try:
                # Load the pre-trained scaler and scale the single sample.
                with open('scaler.pkl', 'rb') as f:
                    scaler = pickle.load(f)
                X_new_scaled = scaler.transform(single_sample.reshape(1, -1))

                # Load the model selected in the dropdown. A table lookup
                # replaces the original six-way if/elif chain and cannot
                # leave `model` unbound.
                with open(_MODEL_FILES[selected_model], 'rb') as f:
                    model = pickle.load(f)

                # Predict the class and the per-class probabilities.
                prediction = model.predict(X_new_scaled)
                y_pred_proba = model.predict_proba(X_new_scaled)
                st.write(f"Prediction for the selected URL ({selected_url}): **{prediction[0]}**")
                # Models label the negative class either 0 or "legitimate".
                if prediction[0] == 0 or prediction[0] == "legitimate":
                    st.success("The website is not a phishing website.")
                    st.markdown(f'<img src="data:image/gif;base64,{data_url_ok}" alt="cat gif">', unsafe_allow_html=True,)
                else:
                    st.error("The website is a phishing website.")
                    st.markdown(f'<img src="data:image/gif;base64,{data_url_warning}" alt="cat gif">', unsafe_allow_html=True,)

                # Visualize prediction confidence scores as a bar chart.
                st.write("Prediction Confidence Scores:")
                class_names = model.classes_
                plt.figure(figsize=(8, 4))
                sns.barplot(x=class_names, y=y_pred_proba[0])
                plt.title("Prediction Confidence Scores")
                plt.xlabel("Class")
                plt.ylabel("Probability")
                st.pyplot(plt)

            except FileNotFoundError as e:
                st.error(f"Model file for {selected_model} not found: {str(e)}")
            except Exception as e:
                st.error(f"An error occurred while loading the model: {str(e)}")

    else:
        st.error("ERROR!!! Please provide web site information for prediction !!!")

# Reset the one-shot flag on the rerun that follows a Predict click so the
# next interaction starts from a clean state.
if st.session_state['clear_output']:
    st.session_state['clear_output'] = False