File size: 9,733 Bytes
c2e943b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f30b20
 
 
b0ce48c
6f30b20
 
 
c2e943b
6f30b20
 
 
 
 
 
c2e943b
6f30b20
4c97072
c2e943b
4c97072
6f30b20
4c97072
c2e943b
6f30b20
 
 
 
 
 
 
 
 
 
 
 
 
c2e943b
6f30b20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2e943b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import streamlit as st
import pandas as pd
import numpy as np
import pickle

# Load the serialized training artifacts used at inference time.
# NOTE(review): pickle.load executes arbitrary code from the file — these
# artifacts must come from a trusted source (presumably the training run).

with open("model.pkl", "rb") as f:  # fitted classifier (XGBoost per the UI text)
    model = pickle.load(f)
    
with open("scaler.pkl", "rb") as f:  # fitted scaler for the numeric features
    scaler = pickle.load(f)

with open("encoder.pkl", "rb") as f:  # one-hot encoder for the categorical features
    encoder = pickle.load(f)

with open('column_names.pkl', 'rb') as f:  # column order the model was trained on
    column_names = pickle.load(f)

def app():
    """Render the detection form, preprocess the submitted features the same
    way as training (scale numerics, one-hot encode categoricals, reindex to
    the training column order), and display the model's prediction.

    Uses the module-level ``model``, ``scaler``, ``encoder`` and
    ``column_names`` loaded from the pickled training artifacts.
    """

    with st.form('from_website_data'):
        # Short description of the model shown above the inputs.
        st.write('''
        # **Malicious or Benign Website Detection**
        - The model used for this detection is `XGBoost` Classifier which Hyperparameter have been tuned.
        - The model also used `SMOTENC` to handle imbalanced data during training.
        - This model achieved `0.93` recall score on the test set to detect malicious website.
        ''')

        url = st.text_input('URL', 'https://www.google.com', help='The URL that will be analyzed')

        # Features derived directly from the URL text.
        URL_LENGTH = len(url)
        NUMBER_SPECIAL_CHARACTERS = sum(not ch.isalnum() for ch in url)

        # Categorical options, in the order originally offered to the user
        # (values must match the categories seen during training).
        charset_options = ["ISO-8859-1", "UTF-8", "utf-8", "us-ascii", "iso-8859-1", "unknown", "windows-1252", "windows-1251"]

        server_options = ['Apache', 'cloudflare-nginx', 'other', 'Server', 'GSE', 'nginx', 'unknown', 'Microsoft-HTTPAPI/2.0', 'nginx/1.8.0', 'nginx/1.10.1', 'Microsoft-IIS/7.5', 'YouTubeFrontEnd', 'Apache/2.2.22 (Debian)', 'nginx/1.12.0', 'Microsoft-IIS/6.0', 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch']

        country_options = ["AU", "CA", "ES", "US", "other", "unknown", "PA", "FR", "KR", "CZ", "JP", "ru", "UK", "CN", "GB", "UY"]

        statepro_options = ["other", "Barcelona", "CA", "NV", "Washington", "unknown", "Arizona", "UT", "NY", "ON", "PA", "FL", "California", "PRAHA", "WA", "Krasnoyarsk", "Utah", "WC1N"]

        CHARSET = st.selectbox("Select Charset", options=charset_options, help='The character encoding standard (also called character set)')

        SERVER = st.selectbox("Select Server", options=server_options, help='The operative system of the server got from the packet response')

        CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50, help='The content size of the HTTP header')

        WHOIS_COUNTRY = st.selectbox("Select Country", options=country_options, help='The countries we got from the server response (specifically, our script used the API of Whois)')

        WHOIS_STATEPRO = st.selectbox("Select States", options=statepro_options, help='The states we got from the server response (specifically, our script used the API of Whois)')

        WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', help='Whois provides the server registration date')

        WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', help='Through the Whois we got the last update date from the server analyzed')

        TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50, help='This variable is the number of TCP packets exchanged between the server and our honeypot client')

        DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0, help='It is the number of the ports detected and different to TCP')

        REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0, help='This variable has the total number of IPs connected to the honeypot')

        APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50, help='This is the number of bytes transferred')

        REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes received from the server')

        SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes sent to the server')

        SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50, help='Packets sent from the honeypot to the server')

        REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50, help='Packets received from the server')

        APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50, help='This is the total number of IP packets generated during the communication between the honeypot and the server')

        DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=20, value=0, help='This is the number of DNS packets generated during the communication between the honeypot and the server')

        # Submit button — the form body above only re-runs on submission.
        submitted = st.form_submit_button('Predict')

    # Collect the raw inputs into a single-row inference frame.
    data_inf = pd.DataFrame([{
        'URL_LENGTH': URL_LENGTH,
        'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
        'CONTENT_LENGTH': CONTENT_LENGTH,
        'WHOIS_REGDATE': WHOIS_REGDATE,
        'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
        'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
        'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
        'REMOTE_IPS': REMOTE_IPS,
        'APP_BYTES': APP_BYTES,
        'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
        'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
        'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
        'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
        'APP_PACKETS': APP_PACKETS,
        'DNS_QUERY_TIMES': DNS_QUERY_TIMES,
        'CHARSET': CHARSET,
        'SERVER': SERVER,
        'WHOIS_COUNTRY': WHOIS_COUNTRY,
        'WHOIS_STATEPRO': WHOIS_STATEPRO
    }])

    def _encode_category(df, column):
        """One-hot encode a single categorical column and return it as a
        DataFrame whose columns are named ``<column>_<value>``.

        NOTE(review): this re-fits the shared global encoder on the single
        inference row, so only the selected category gets a column; it works
        here because the frame is later reindexed to ``column_names`` and
        missing dummies are filled with 0 — but a pre-fitted
        ``encoder.transform`` would be the conventional approach. Confirm how
        encoder.pkl was fitted.
        """
        transformed = encoder.fit_transform(df[[column]])
        feature_names = encoder.get_feature_names_out(input_features=[column])
        return pd.DataFrame(transformed.toarray(), index=df.index, columns=feature_names)

    # Run preprocessing + prediction only after the user submits the form.
    if submitted:
        # Split into numeric and categorical feature sets. .copy() avoids
        # pandas SettingWithCopyWarning on the assignments below.
        data_inf_num = data_inf[['URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH',
                                 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE', 'TCP_CONVERSATION_EXCHANGE',
                                 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES', 'SOURCE_APP_PACKETS',
                                 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES', 'APP_PACKETS',
                                 'DNS_QUERY_TIMES']].copy()
        data_inf_cat = data_inf[['CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO']]

        # Reduce the Whois dates to their year, matching the training features.
        data_inf_num['WHOIS_REGDATE'] = pd.to_datetime(data_inf_num['WHOIS_REGDATE']).dt.year
        data_inf_num['WHOIS_UPDATED_DATE'] = pd.to_datetime(data_inf_num['WHOIS_UPDATED_DATE']).dt.year

        # Scale numerics with the training-time scaler, keep column labels.
        data_inf_num_scaled = pd.DataFrame(scaler.transform(data_inf_num),
                                           columns=data_inf_num.columns)

        # One-hot encode each categorical column.
        capped_CHARSET = _encode_category(data_inf_cat, 'CHARSET')
        capped_SERVER = _encode_category(data_inf_cat, 'SERVER')
        capped_WHOIS_COUNTRY = _encode_category(data_inf_cat, 'WHOIS_COUNTRY')
        capped_WHOIS_STATEPRO = _encode_category(data_inf_cat, 'WHOIS_STATEPRO')

        # Assemble the full feature row.
        data_inf_final = pd.concat([data_inf_num_scaled, capped_CHARSET, capped_SERVER,
                                    capped_WHOIS_COUNTRY, capped_WHOIS_STATEPRO], axis=1)

        # Sanity checks: duplicate labels would make the reindex ambiguous.
        if len(column_names) != len(set(column_names)):
            st.write("column_names contains duplicates")

        if len(data_inf_final.columns) != len(set(data_inf_final.columns)):
            st.write("data_inf_final has duplicate column names")

        # Align to the training column order; dummies for categories the user
        # did not pick are absent and become NaN, so fill them with 0.
        data_inf_final = data_inf_final.reindex(columns=column_names).fillna(0)

        # Predict and display the single-row result.
        y_pred_inf = model.predict(data_inf_final)

        st.dataframe(data_inf)

        if y_pred_inf[0] == 0:
            # Benign — rendered in green.
            st.markdown("<h1 style='text-align: center; color: green;'>Predicted Class: Benign</h1>", unsafe_allow_html=True)
        else:
            # Malicious — rendered in red.
            st.markdown("<h1 style='text-align: center; color: red;'>Predicted Class: Malicious</h1>", unsafe_allow_html=True)
        
        
# Script entry point: render the Streamlit page when run directly.
if __name__ == '__main__':
    app()