Spaces:
Build error
Build error
File size: 9,733 Bytes
import streamlit as st
import pandas as pd
import numpy as np
import pickle
# Load every serialized training artifact from disk at import time.
def _load_pickle(path):
    """Deserialize one pickle file and return the stored object."""
    with open(path, "rb") as fh:
        return pickle.load(fh)

model = _load_pickle("model.pkl")                # trained classifier
scaler = _load_pickle("scaler.pkl")              # fitted numeric scaler
encoder = _load_pickle("encoder.pkl")            # fitted one-hot encoder
column_names = _load_pickle("column_names.pkl")  # training-time column order
def app():
    """Render the detection form and, on submit, classify the described
    website as malicious or benign with the pre-trained XGBoost model.

    Reads the module-level ``model``, ``scaler``, ``encoder`` and
    ``column_names`` artifacts loaded from pickle files at import time.
    """
    with st.form('from_website_data'):
        # Short description about the model.
        st.write('''
# **Malicious or Benign Website Detection**
- The model used for this detection is `XGBoost` Classifier which Hyperparameter have been tuned.
- The model also used `SMOTENC` to handle imbalanced data during training.
- This model achieved `0.93` recall score on the test set to detect malicious website.
''')
        url = st.text_input('URL', 'https://www.google.com', help='The URL that will be analyzed')
        # Lexical features derived directly from the URL string.
        URL_LENGTH = len(url)
        NUMBER_SPECIAL_CHARACTERS = sum(not x.isalnum() for x in url)
        # Category options — presumably the values the encoder saw during
        # training (TODO confirm against the training notebook).
        charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
        server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server', 5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0', 9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5', 12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)', 14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0', 16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
        whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other", 6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ", 11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
        WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV", 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT", 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California", 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah", 18: "WC1N"}
        CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()), help='The character encoding standard (also called character set)')
        SERVER = st.selectbox("Select Server", options=list(server_choice.values()), help='The operative system of the server got from the packet response')
        CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50, help='The content size of the HTTP header')
        WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()), help='The countries we got from the server response (specifically, our script used the API of Whois)')
        WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()), help='The states we got from the server response (specifically, our script used the API of Whois)')
        WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', help='Whois provides the server registration date')
        WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', help='Through the Whois we got the last update date from the server analyzed')
        TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50, help='This variable is the number of TCP packets exchanged between the server and our honeypot client')
        DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0, help='It is the number of the ports detected and different to TCP')
        REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0, help='This variable has the total number of IPs connected to the honeypot')
        APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50, help='This is the number of bytes transferred')
        REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes received from the server')
        SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes sent to the server')
        SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50, help='Packets sent from the honeypot to the server')
        REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50, help='Packets received from the server')
        APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50, help='This is the total number of IP packets generated during the communication between the honeypot and the server')
        DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=20, value=0, help='This is the number of DNS packets generated during the communication between the honeypot and the server')
        # Submit button
        submitted = st.form_submit_button('Predict')

    # Single-row inference frame assembled from the form inputs.
    data_inf = {
        'URL_LENGTH': URL_LENGTH,
        'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
        'CONTENT_LENGTH': CONTENT_LENGTH,
        'WHOIS_REGDATE': WHOIS_REGDATE,
        'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
        'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
        'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
        'REMOTE_IPS': REMOTE_IPS,
        'APP_BYTES': APP_BYTES,
        'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
        'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
        'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
        'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
        'APP_PACKETS': APP_PACKETS,
        'DNS_QUERY_TIMES': DNS_QUERY_TIMES,
        'CHARSET': CHARSET,
        'SERVER': SERVER,
        'WHOIS_COUNTRY': WHOIS_COUNTRY,
        'WHOIS_STATEPRO': WHOIS_STATEPRO
    }
    data_inf = pd.DataFrame([data_inf])
    # st.dataframe(data_inf)

    def encode_and_create_dataframe_train(df, column):
        """One-hot encode a single categorical column of *df*; return the
        encoded DataFrame and the encoder used."""
        # NOTE(review): this re-FITS the loaded encoder on the one-row
        # inference frame instead of calling transform() with the
        # training-time categories. The reindex(columns=column_names) +
        # fillna(0) further down aligns the result by column name, so the
        # prediction still works, but categories unseen in training end up
        # as an all-zero row — confirm this matches the training pipeline.
        transformed_data = encoder.fit_transform(df[[column]])
        # Feature names for the encoded column (e.g. 'CHARSET_UTF-8').
        feature_names = encoder.get_feature_names_out(input_features=[column])
        transformed_df = pd.DataFrame(transformed_data.toarray(),
                                      index=df.index,
                                      columns=feature_names)
        return transformed_df, encoder

    # Logic executed once the user submits the form.
    if submitted:
        # Split between numerical and categorical columns. The .copy() makes
        # data_inf_num an independent frame so the date-to-year assignments
        # below cannot raise SettingWithCopyWarning or silently miss.
        data_inf_num = data_inf[['URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH',
                                 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE', 'TCP_CONVERSATION_EXCHANGE',
                                 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES', 'SOURCE_APP_PACKETS',
                                 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES', 'APP_PACKETS',
                                 'DNS_QUERY_TIMES']].copy()
        data_inf_cat = data_inf[['CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO']]
        # Reduce the Whois dates to their year (integer), as the numeric
        # pipeline expects.
        data_inf_num['WHOIS_REGDATE'] = pd.to_datetime(data_inf_num['WHOIS_REGDATE']).dt.year
        data_inf_num['WHOIS_UPDATED_DATE'] = pd.to_datetime(data_inf_num['WHOIS_UPDATED_DATE']).dt.year
        # Scale numeric features with the scaler fitted during training.
        data_inf_num_scaled = scaler.transform(data_inf_num)
        data_inf_num_scaled = pd.DataFrame(data_inf_num_scaled, columns=data_inf_num.columns)
        # One-hot encode every categorical column.
        capped_CHARSET, ohe_CHARSET = encode_and_create_dataframe_train(data_inf_cat, 'CHARSET')
        capped_SERVER, ohe_SERVER = encode_and_create_dataframe_train(data_inf_cat, 'SERVER')
        capped_WHOIS_COUNTRY, ohe_WHOIS_COUNTRY = encode_and_create_dataframe_train(data_inf_cat, 'WHOIS_COUNTRY')
        capped_WHOIS_STATEPRO, ohe_WHOIS_STATEPRO = encode_and_create_dataframe_train(data_inf_cat, 'WHOIS_STATEPRO')
        # Concatenate scaled numeric and encoded categorical features.
        data_inf_final = pd.concat([data_inf_num_scaled, capped_CHARSET, capped_SERVER, capped_WHOIS_COUNTRY, capped_WHOIS_STATEPRO], axis=1)
        # Sanity checks for duplicate column names before reindexing.
        if len(column_names) != len(set(column_names)):
            st.write("column_names contains duplicates")
        if len(data_inf_final.columns) != len(set(data_inf_final.columns)):
            st.write("data_inf_final has duplicate column names")
        # Reindex to the training column order; categories absent from this
        # row become NaN and are filled with zeros.
        data_inf_final = data_inf_final.reindex(columns=column_names)
        data_inf_final = data_inf_final.fillna(0)
        # Predict with the XGBoost classifier.
        y_pred_inf = model.predict(data_inf_final)
        st.dataframe(data_inf)
        # Index the first (only) prediction rather than truth-testing the
        # returned array; 0 means benign in this model's label encoding.
        if y_pred_inf[0] == 0:
            # Benign → green headline.
            st.markdown("<h1 style='text-align: center; color: green;'>Predicted Class: Benign</h1>", unsafe_allow_html=True)
        else:
            st.markdown("<h1 style='text-align: center; color: red;'>Predicted Class: Malicious</h1>", unsafe_allow_html=True)
# Entry point: render the page when the script is executed directly
# (e.g. via `streamlit run`).
if __name__ == '__main__':
    app()