Spaces:

xyncz
/

malicious-website-detection

Build error

App Files Files Community

malicious-website-detection / prediction.py

xyncz

Upload prediction.py

b0ce48c verified about 2 years ago

raw

history blame contribute delete

9.73 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import pickle

	# load all files

	def _load_pickle(path):
	    """Deserialize and return the object stored in the pickle file at *path*."""
	    with open(path, "rb") as handle:
	        return pickle.load(handle)


	# Estimator whose ``predict`` method produces the final class label.
	model = _load_pickle("model.pkl")

	# Scaler whose ``transform`` method is applied to the numerical features.
	scaler = _load_pickle("scaler.pkl")

	# Encoder object used to one-hot encode the categorical features.
	encoder = _load_pickle("encoder.pkl")

	# Column names the inference frame is reindexed to before prediction.
	column_names = _load_pickle('column_names.pkl')

	def app():
	    """Render the detection form and, once submitted, show the prediction.

	    The URL and the network/WHOIS features are collected through Streamlit
	    widgets inside a form. After the user presses "Predict", the single
	    input row is preprocessed (year extraction, scaling, one-hot encoding,
	    alignment to ``column_names``), passed to ``model.predict`` and the
	    predicted class is displayed together with the raw input row.
	    """
	    # Option lists for the categorical select boxes (display order matters).
	    charset_options = [
	        "ISO-8859-1", "UTF-8", "utf-8", "us-ascii",
	        "iso-8859-1", "unknown", "windows-1252", "windows-1251",
	    ]
	    server_options = [
	        'Apache', 'cloudflare-nginx', 'other', 'Server', 'GSE', 'nginx',
	        'unknown', 'Microsoft-HTTPAPI/2.0', 'nginx/1.8.0', 'nginx/1.10.1',
	        'Microsoft-IIS/7.5', 'YouTubeFrontEnd', 'Apache/2.2.22 (Debian)',
	        'nginx/1.12.0', 'Microsoft-IIS/6.0',
	        'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4',
	        'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch',
	    ]
	    country_options = [
	        "AU", "CA", "ES", "US", "other", "unknown", "PA", "FR",
	        "KR", "CZ", "JP", "ru", "UK", "CN", "GB", "UY",
	    ]
	    state_options = [
	        "other", "Barcelona", "CA", "NV", "Washington", "unknown",
	        "Arizona", "UT", "NY", "ON", "PA", "FL", "California", "PRAHA",
	        "WA", "Krasnoyarsk", "Utah", "WC1N",
	    ]

	    with st.form('from_website_data'):
	        # Short description of the model shown at the top of the form.
	        st.write('''
	# Malicious or Benign Website Detection
	- The model used for this detection is `XGBoost` Classifier which Hyperparameter have been tuned.
	- The model also used `SMOTENC` to handle imbalanced data during training.
	- This model achieved `0.93` recall score on the test set to detect malicious website.
	''')

	        url = st.text_input('URL', 'https://www.google.com', help='The URL that will be analyzed')

	        charset = st.selectbox("Select Charset", options=charset_options, help='The character encoding standard (also called character set)')
	        server = st.selectbox("Select Server", options=server_options, help='The operative system of the server got from the packet response')
	        content_length = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50, help='The content size of the HTTP header')
	        whois_country = st.selectbox("Select Country", options=country_options, help='The countries we got from the server response (specifically, our script used the API of Whois)')
	        whois_statepro = st.selectbox("Select States", options=state_options, help='The states we got from the server response (specifically, our script used the API of Whois)')
	        whois_regdate = st.date_input('WHOIS_REGDATE', help='Whois provides the server registration date')
	        whois_updated_date = st.date_input('WHOIS_UPDATED_DATE', help='Through the Whois we got the last update date from the server analyzed')
	        tcp_conversation_exchange = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50, help='This variable is the number of TCP packets exchanged between the server and our honeypot client')
	        dist_remote_tcp_port = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0, help='It is the number of the ports detected and different to TCP')
	        remote_ips = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0, help='This variable has the total number of IPs connected to the honeypot')
	        app_bytes = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50, help='This is the number of bytes transferred')
	        remote_app_bytes = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes received from the server')
	        source_app_bytes = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes sent to the server')
	        source_app_packets = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50, help='Packets sent from the honeypot to the server')
	        remote_app_packets = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50, help='Packets received from the server')
	        app_packets = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50, help='This is the total number of IP packets generated during the communication between the honeypot and the server')
	        dns_query_times = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=20, value=0, help='This is the number of DNS packets generated during the communication between the honeypot and the server')

	        # Submit button; True only on the rerun triggered by the click.
	        submitted = st.form_submit_button('Predict')

	    # Single-row frame holding the raw user input. The two URL features
	    # are derived from the URL text: its length and the count of
	    # non-alphanumeric characters.
	    data_inf = pd.DataFrame([{
	        'URL_LENGTH': len(url),
	        'NUMBER_SPECIAL_CHARACTERS': sum(not ch.isalnum() for ch in url),
	        'CONTENT_LENGTH': content_length,
	        'WHOIS_REGDATE': whois_regdate,
	        'WHOIS_UPDATED_DATE': whois_updated_date,
	        'TCP_CONVERSATION_EXCHANGE': tcp_conversation_exchange,
	        'DIST_REMOTE_TCP_PORT': dist_remote_tcp_port,
	        'REMOTE_IPS': remote_ips,
	        'APP_BYTES': app_bytes,
	        'SOURCE_APP_PACKETS': source_app_packets,
	        'REMOTE_APP_PACKETS': remote_app_packets,
	        'SOURCE_APP_BYTES': source_app_bytes,
	        'REMOTE_APP_BYTES': remote_app_bytes,
	        'APP_PACKETS': app_packets,
	        'DNS_QUERY_TIMES': dns_query_times,
	        'CHARSET': charset,
	        'SERVER': server,
	        'WHOIS_COUNTRY': whois_country,
	        'WHOIS_STATEPRO': whois_statepro,
	    }])

	    # Nothing to predict until the user submits the form.
	    if not submitted:
	        return

	    features = _prepare_features(data_inf)

	    # Diagnostics: duplicated names would make the reindex below fail or
	    # silently misalign features, so surface them in the UI.
	    if len(column_names) != len(set(column_names)):
	        st.write("column_names contains duplicates")
	    if len(features.columns) != len(set(features.columns)):
	        st.write("data_inf_final has duplicate column names")

	    # Align to the stored column list; columns absent from this single row
	    # (e.g. one-hot columns of categories not selected) become NaN after
	    # the reindex and are filled with zeros.
	    features = features.reindex(columns=column_names).fillna(0)

	    # The model receives one row, so take the first (only) prediction as a
	    # scalar instead of relying on the truthiness of a one-element array.
	    predicted_class = model.predict(features)[0]

	    st.dataframe(data_inf)

	    if predicted_class == 0:
	        # Benign result rendered in green.
	        st.markdown("<h1 style='text-align: center; color: green;'>Predicted Class: Benign</h1>", unsafe_allow_html=True)
	    else:
	        st.markdown("<h1 style='text-align: center; color: red;'>Predicted Class: Malicious</h1>", unsafe_allow_html=True)


	def _one_hot_encode_column(df, column):
	    """Return a one-hot encoded DataFrame for ``df[column]``, indexed like *df*."""
	    # NOTE(review): this re-fits the module-level encoder on the inference
	    # data instead of only transforming it, so the output presumably holds
	    # columns just for the categories present in *df*; the caller's
	    # reindex/fillna supplies the remaining columns as zeros. Confirm the
	    # generated feature names match those stored in column_names.
	    encoded = encoder.fit_transform(df[[column]])
	    feature_names = encoder.get_feature_names_out(input_features=[column])
	    return pd.DataFrame(encoded.toarray(), index=df.index, columns=feature_names)


	def _prepare_features(data_inf):
	    """Scale the numerical columns, one-hot encode the categorical ones and
	    return them concatenated column-wise (not yet aligned to column_names)."""
	    numeric_columns = [
	        'URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH',
	        'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE', 'TCP_CONVERSATION_EXCHANGE',
	        'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES', 'SOURCE_APP_PACKETS',
	        'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES', 'APP_PACKETS',
	        'DNS_QUERY_TIMES',
	    ]
	    categorical_columns = ['CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO']

	    # Explicit copy: the date columns are overwritten below, and assigning
	    # into a column slice of data_inf triggers SettingWithCopyWarning.
	    numeric = data_inf[numeric_columns].copy()
	    categorical = data_inf[categorical_columns]

	    # Only the calendar year of each WHOIS date is used as a feature.
	    for date_column in ('WHOIS_REGDATE', 'WHOIS_UPDATED_DATE'):
	        numeric[date_column] = pd.to_datetime(numeric[date_column]).dt.year

	    scaled = pd.DataFrame(scaler.transform(numeric), columns=numeric.columns)

	    encoded_frames = [
	        _one_hot_encode_column(categorical, name) for name in categorical_columns
	    ]
	    return pd.concat([scaled, *encoded_frames], axis=1)


	# Script entry point: build the page only when this file is run directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    app()