File size: 5,209 Bytes
c2e943b
 
 
 
 
 
 
 
 
3b8c9cd
c2e943b
 
3b8c9cd
c2e943b
 
 
 
 
 
 
 
 
 
 
 
 
 
6f30b20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2e943b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image

# Width/height (inches) shared by every matplotlib chart on this page.
_FIGSIZE = (15, 5)


def _render_and_close(fig):
    """Send *fig* to Streamlit, then close it.

    pyplot keeps a reference to every figure it creates until the figure is
    closed, so without the explicit close each rerun of the page would leave
    more figures behind.
    """
    try:
        st.pyplot(fig)
    finally:
        plt.close(fig)


def _histogram_picker(df, columns):
    """Show a column selectbox and a 30-bin histogram (with KDE) of the choice.

    Args:
        df: DataFrame holding the columns listed in ``columns``.
        columns: Column names offered in the selectbox.
    """
    column = st.selectbox('Select Column:', columns)
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.histplot(df[column], bins=30, kde=True, ax=ax)
    _render_and_close(fig)


def app():
    """Render the exploratory-data-analysis page of the Streamlit app.

    Reads ``web.jpg`` and ``dataset.csv`` via relative paths, then shows, in
    order: the title and banner image, the raw dataframe, a description of
    each column, a pie chart of a selectable categorical column, three
    selectable histograms of numerical columns, a box plot of ``URL_LENGTH``
    by ``Type`` and a scatter plot of ``URL_LENGTH`` against
    ``CONTENT_LENGTH`` coloured by ``Type``.
    """
    # Page heading
    st.title('Malicious or Benign Website Detection')
    st.subheader('EDA for Malicious or Benign Website Detection')

    # Banner image; the context manager releases the file handle once
    # Streamlit has consumed the image.
    with Image.open('web.jpg') as image:
        st.image(image, caption='Malicious or Benign Website')

    # Horizontal rule
    st.markdown('----')

    # Load the dataset and show it as an interactive table
    df = pd.read_csv('dataset.csv')
    st.dataframe(df)

    # Column descriptions (taken from the dataset's source website)
    st.write('#### Dataset Explanation')
    st.write('''
    - **URL**: It is the anonymous identification of the URL analyzed in the study.
    - **URL_LENGTH**: It is the number of characters in the URL.
    - **NUMBER_SPECIAL_CHARACTERS**: It is the number of special characters identified in the URL, such as, "/", "%", "#", "&", ".", "=".
    - **CHARSET**: It is a categorical value and its meaning is the character encoding standard (also called character set).
    - **SERVER**: It is a categorical value and its meaning is the operative system of the server got from the packet response.
    - **CONTENT_LENGTH**: It represents the content size of the HTTP header.
    - **WHOIS_COUNTRY**: It is a categorical variable, its values are the countries we got from the server response (specifically, our script used the API of Whois).
    - **WHOIS_STATEPRO**: It is a categorical variable, its values are the states we got from the server response (specifically, our script used the API of Whois).
    - **WHOIS_REGDATE**: Whois provides the server registration date, so, this variable has date values with format DD/MM/YYYY HH:MM
    - **WHOIS_UPDATED_DATE**: Through the Whois we got the last update date from the server analyzed.
    - **TCP_CONVERSATION_EXCHANGE**: This variable is the number of TCP packets exchanged between the server and our honeypot client.
    - **DIST_REMOTE_TCP_PORT**: It is the number of the ports detected and different to TCP.
    - **REMOTE_IPS**: This variable has the total number of IPs connected to the honeypot.
    - **APP_BYTES**: This is the number of bytes transferred.
    - **SOURCE_APP_PACKETS**: Packets sent from the honeypot to the server.
    - **REMOTE_APP_PACKETS**: Packets received from the server.
    - **APP_PACKETS**: This is the total number of IP packets generated during the communication between the honeypot and the server.
    - **DNS_QUERY_TIMES**: This is the number of DNS packets generated during the communication between the honeypot and the server.
    - **TYPE**: This is a categorical variable, its values represent the type of web page analyzed, specifically, 1 is for malicious websites and 0 is for benign websites.
    ''')

    # Pie chart of the selected categorical column
    st.write('#### Plot Categorical Columns using Pie Chart')
    option_cat = st.selectbox('Select Column:', ('CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO'))
    counts = df[option_cat].value_counts()  # computed once, used for sizes and labels
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=180)
    _render_and_close(fig)

    # TODO: a line plot of WHOIS_REGDATE / WHOIS_UPDATED_DATE against Type was
    # drafted here but is currently disabled.

    # Histograms of numerical columns, one selectbox per group of columns
    st.write('#### Plot Numerical Columns')
    _histogram_picker(df, ('URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH', 'APP_PACKETS', 'DNS_QUERY_TIMES'))
    _histogram_picker(df, ('TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES'))
    _histogram_picker(df, ('SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES'))

    # Box plot of URL_LENGTH per Type, coloured by Type
    st.write('#### Plot Type Column Count with Boxplot')
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.boxplot(x='Type', y='URL_LENGTH', data=df, hue='Type', ax=ax)
    _render_and_close(fig)

    # Scatter plot on the rows sorted by Type.
    # NOTE(review): the heading says "Plotly" but the chart is drawn with
    # seaborn/matplotlib; the heading text is left unchanged here.
    sorted_df = df.sort_values('Type')
    st.write('#### Plotly Plot - URL_LENGTH vs CONTENT_LENGTH')
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.scatterplot(x='URL_LENGTH', y='CONTENT_LENGTH', data=sorted_df, hue='Type', ax=ax)
    _render_and_close(fig)
    

# Script entry point: render the page only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    app()