# Spaces: xyncz/malicious-website-detection
# Build error
# File size: 5,209 Bytes

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image

# Column groups offered by the three histogram select boxes, in display order.
_HISTOGRAM_COLUMN_GROUPS = (
    ('URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH', 'APP_PACKETS', 'DNS_QUERY_TIMES'),
    ('TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES'),
    ('SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES'),
)


def _render_figure(draw, *args, **kwargs):
    """Create a 15x5 matplotlib figure, draw on it, show it, then close it.

    ``draw`` is called as ``draw(*args, **kwargs)`` while the new figure is
    the current pyplot figure. Figures created with ``plt.figure`` stay
    registered in pyplot until they are closed, so the figure is closed in a
    ``finally`` clause; otherwise each Streamlit rerun would leave another
    set of figures allocated.
    """
    fig = plt.figure(figsize=(15, 5))
    try:
        draw(*args, **kwargs)
        st.pyplot(fig)
    finally:
        plt.close(fig)


def app():
    """Render the exploratory-data-analysis page of the Streamlit app.

    The page shows, in order: a title and banner image, the raw dataset, a
    textual description of the dataset columns, a pie chart for a selectable
    categorical column, three selectable histograms, a box plot of
    ``URL_LENGTH`` per ``Type`` and a scatter plot of ``URL_LENGTH`` against
    ``CONTENT_LENGTH`` coloured by ``Type``.

    ``web.jpg`` and ``dataset.csv`` are opened through relative paths. The
    dataset must contain the columns offered in the select boxes as well as
    ``Type``. Errors raised while opening either file or while accessing a
    column are not handled here and propagate to the caller.
    """
    # title
    st.title('Malicious or Benign Website Detection')

    # subheader
    st.subheader('EDA for Malicious or Benign Website Detection')

    # add image; the context manager closes the underlying file once rendered
    with Image.open('web.jpg') as image:
        st.image(image, caption='Malicious or Benign Website')

    # horizontal rule
    st.markdown('----')

    # load the pandas dataframe and show it
    df = pd.read_csv('dataset.csv')
    st.dataframe(df)

    # dataset explanation (per the original note, taken directly from the
    # dataset's website, so the text is reproduced unchanged)
    st.write('#### Dataset Explanation')
    st.write('''
    - **URL**: It is the anonymous identification of the URL analyzed in the study.
    - **URL_LENGTH**: It is the number of characters in the URL.
    - **NUMBER_SPECIAL_CHARACTERS**: It is the number of special characters identified in the URL, such as, “/”, “%”, “#”, “&”, “. “, “=”.
    - **CHARSET**: It is a categorical value and its meaning is the character encoding standard (also called character set).
    - **SERVER**: It is a categorical value and its meaning is the operative system of the server got from the packet response.
    - **CONTENT_LENGTH**: It represents the content size of the HTTP header.
    - **WHOIS_COUNTRY**: It is a categorical variable, its values are the countries we got from the server response (specifically, our script used the API of Whois).
    - **WHOIS_STATEPRO**: It is a categorical variable, its values are the states we got from the server response (specifically, our script used the API of Whois).
    - **WHOIS_REGDATE**: Whois provides the server registration date, so, this variable has date values with format DD/MM/YYY HH:MM
    - **WHOIS_UPDATED_DATE**: Through the Whois we got the last update date from the server analyzed.
    - **TCP_CONVERSATION_EXCHANGE**: This variable is the number of TCP packets exchanged between the server and our honeypot client.
    - **DIST_REMOTE_TCP_PORT**: It is the number of the ports detected and different to TCP.
    - **REMOTE_IPS**: This variable has the total number of IPs connected to the honeypot.
    - **APP_BYTES**: This is the number of bytes transferred.
    - **SOURCE_APP_PACKETS**: Packets sent from the honeypot to the server.
    - **REMOTE_APP_PACKETS**: Packets received from the server.
    - **APP_PACKETS**: This is the total number of IP packets generated during the communication between the honeypot and the server.
    - **DNS_QUERY_TIMES**: This is the number of DNS packets generated during the communication between the honeypot and the server.
    - **TYPE**: This is a categorical variable, its values represent the type of web page analyzed, specifically, 1 is for malicious websites and 0 is for benign websites.
    ''')

    # pie chart of the value frequencies of one categorical column
    st.write('#### Plot Categorical Columns using Pie Chart')
    option_cat = st.selectbox('Select Column:', ('CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO'))
    category_counts = df[option_cat].value_counts()  # computed once, reused for sizes and labels
    _render_figure(
        plt.pie,
        category_counts,
        labels=category_counts.index,
        autopct='%1.1f%%',
        startangle=180,
    )

    # NOTE: a line plot of WHOIS_REGDATE / WHOIS_UPDATED_DATE against Type was
    # drafted for this spot but is currently disabled.

    # one histogram (30 bins, KDE overlay) per group of numerical columns
    st.write('#### Plot Numerical Columns')
    for column_choices in _HISTOGRAM_COLUMN_GROUPS:
        selected_column = st.selectbox('Select Column:', column_choices)
        _render_figure(sns.histplot, df[selected_column], bins=30, kde=True)

    # box plot of URL_LENGTH for each Type value, coloured by Type
    st.write('#### Plot Type Column Count with Boxplot')
    _render_figure(sns.boxplot, x='Type', y='URL_LENGTH', data=df, hue='Type')

    # sort the dataframe by 'Type' before drawing the scatter plot
    df = df.sort_values('Type')

    # scatter plot; the heading says "Plotly" but the chart is drawn with seaborn
    st.write('#### Plotly Plot - URL_LENGTH vs CONTENT_LENGTH')
    _render_figure(sns.scatterplot, x='URL_LENGTH', y='CONTENT_LENGTH', data=df, hue='Type')
    

# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    app()