import pickle

import numpy as np
import pandas as pd
import streamlit as st

# ---------------------------------------------------------------------------
# Load the serialized artifacts produced at training time.
# ---------------------------------------------------------------------------
with open("model.pkl", "rb") as f:
    model = pickle.load(f)  # tuned XGBoost classifier
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)  # numerical-feature scaler fitted on the training set
with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)  # OneHotEncoder instance (re-fitted per column below)
with open('column_names.pkl', 'rb') as f:
    column_names = pickle.load(f)  # exact feature order the model was trained on


def app():
    """Render the input form and, on submit, preprocess one row for prediction.

    Collects URL-derived, Whois, and honeypot-traffic features, scales the
    numerical columns with the training-time scaler, one-hot encodes the
    categorical columns, and aligns the result to ``column_names`` so the
    loaded model can consume it.
    """
    with st.form('from_website_data'):
        # Short description of the model, shown above the inputs.
        st.write('''
        # **Malicious or Benign Website Detection**
        - The model used for this detection is `XGBoost` Classifier which Hyperparameter have been tuned.
        - The model also used `SMOTENC` to handle imbalanced data during training.
        - This model achieved `0.93` recall score on the test set to detect malicious website.
        ''')

        url = st.text_input('URL', 'https://www.google.com', help='The URL that will be analyzed')
        # Features derived directly from the URL string itself.
        URL_LENGTH = len(url)
        NUMBER_SPECIAL_CHARACTERS = sum(not x.isalnum() for x in url)

        # Category values seen during training; the integer keys are only an
        # enumeration — the selectboxes use the values.
        charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii",
                          5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
        server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server',
                         5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0',
                         9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5',
                         12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)',
                         14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0',
                         16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4',
                         17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
        whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other",
                                6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ",
                                11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
        WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV",
                                 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT",
                                 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California",
                                 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah",
                                 18: "WC1N"}

        CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()), help='The character encoding standard (also called character set)')
        SERVER = st.selectbox("Select Server", options=list(server_choice.values()), help='The operative system of the server got from the packet response')
        CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50, help='The content size of the HTTP header')
        WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()), help='The countries we got from the server response (specifically, our script used the API of Whois)')
        WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()), help='The states we got from the server response (specifically, our script used the API of Whois)')
        WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', help='Whois provides the server registration date')
        WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', help='Through the Whois we got the last update date from the server analyzed')
        TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50, help='This variable is the number of TCP packets exchanged between the server and our honeypot client')
        DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0, help='It is the number of the ports detected and different to TCP')
        REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0, help='This variable has the total number of IPs connected to the honeypot')
        APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50, help='This is the number of bytes transferred')
        REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes received from the server')
        SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes sent to the server')
        SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50, help='Packets sent from the honeypot to the server')
        REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50, help='Packets received from the server')
        APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50, help='This is the total number of IP packets generated during the communication between the honeypot and the server')
        DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=20, value=0, help='This is the number of DNS packets generated during the communication between the honeypot and the server')

        # Submit button for the form.
        submitted = st.form_submit_button('Predict')

    # Assemble one inference row from the form values.
    data_inf = {
        'URL_LENGTH': URL_LENGTH,
        'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
        'CONTENT_LENGTH': CONTENT_LENGTH,
        'WHOIS_REGDATE': WHOIS_REGDATE,
        'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
        'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
        'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
        'REMOTE_IPS': REMOTE_IPS,
        'APP_BYTES': APP_BYTES,
        'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
        'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
        'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
        'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
        'APP_PACKETS': APP_PACKETS,
        'DNS_QUERY_TIMES': DNS_QUERY_TIMES,
        'CHARSET': CHARSET,
        'SERVER': SERVER,
        'WHOIS_COUNTRY': WHOIS_COUNTRY,
        'WHOIS_STATEPRO': WHOIS_STATEPRO
    }
    data_inf = pd.DataFrame([data_inf])

    def _one_hot_encode_column(df, column):
        """One-hot encode a single categorical column, returned as a DataFrame.

        NOTE(review): this re-fits the shared ``encoder`` on the single
        inference row instead of reusing its training-time fit; the
        ``reindex`` to ``column_names`` (plus ``fillna(0)``) below is what
        puts the resulting column into the right slot. Confirm this matches
        the training pipeline before changing it to ``transform``.
        """
        transformed_data = encoder.fit_transform(df[[column]])
        feature_names = encoder.get_feature_names_out(input_features=[column])
        return pd.DataFrame(transformed_data.toarray(), index=df.index, columns=feature_names)

    # Runs only after the user presses "Predict".
    if submitted:
        # Split numerical vs. categorical columns; the numerical order must
        # match the order the scaler was fitted with. ``.copy()`` avoids a
        # SettingWithCopyWarning on the date-column assignments below.
        data_inf_num = data_inf[['URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH',
                                 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE', 'TCP_CONVERSATION_EXCHANGE',
                                 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES',
                                 'SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES',
                                 'REMOTE_APP_BYTES', 'APP_PACKETS', 'DNS_QUERY_TIMES']].copy()
        data_inf_cat = data_inf[['CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO']]

        # Reduce both Whois dates to their year (integer), as done in training.
        data_inf_num['WHOIS_REGDATE'] = pd.to_datetime(data_inf_num['WHOIS_REGDATE']).dt.year
        data_inf_num['WHOIS_UPDATED_DATE'] = pd.to_datetime(data_inf_num['WHOIS_UPDATED_DATE']).dt.year

        # Scale numerical features with the training-time scaler.
        data_inf_num_scaled = pd.DataFrame(scaler.transform(data_inf_num),
                                           columns=data_inf_num.columns)

        # One-hot encode each categorical column separately.
        capped_CHARSET = _one_hot_encode_column(data_inf_cat, 'CHARSET')
        capped_SERVER = _one_hot_encode_column(data_inf_cat, 'SERVER')
        capped_WHOIS_COUNTRY = _one_hot_encode_column(data_inf_cat, 'WHOIS_COUNTRY')
        capped_WHOIS_STATEPRO = _one_hot_encode_column(data_inf_cat, 'WHOIS_STATEPRO')

        # Combine scaled numericals with the encoded categoricals.
        data_inf_final = pd.concat([data_inf_num_scaled, capped_CHARSET, capped_SERVER,
                                    capped_WHOIS_COUNTRY, capped_WHOIS_STATEPRO], axis=1)

        # Sanity checks: duplicated column names would make the reindex ambiguous.
        if len(column_names) != len(set(column_names)):
            st.write("column_names contains duplicates")
        if len(data_inf_final.columns) != len(set(data_inf_final.columns)):
            st.write("data_inf_final has duplicate column names")

        # Align with the training columns; categories absent from this single
        # row come back as NaN and are zero-filled so the model sees a full
        # one-hot vector.
        data_inf_final = data_inf_final.reindex(columns=column_names)
        data_inf_final = data_inf_final.fillna(0)
#predict using linear reg model y_pred_inf = model.predict(data_inf_final) st.dataframe(data_inf) if y_pred_inf == 0: # write with green color st.markdown("