# Page-header residue from the web view of this file:
#   Spaces / Build error / File size: 5,209 Bytes
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
# Markdown shown under the "Dataset Explanation" heading. Defined at module
# level so the bullet lines start at column 0 inside the string literal.
_DATASET_EXPLANATION = '''
- **URL**: It is the anonymous identification of the URL analyzed in the study.
- **URL_LENGTH**: It is the number of characters in the URL.
- **NUMBER_SPECIAL_CHARACTERS**: It is the number of special characters identified in the URL, such as, "/", "%", "#", "&", ".", "=".
- **CHARSET**: It is a categorical value and its meaning is the character encoding standard (also called character set).
- **SERVER**: It is a categorical value and its meaning is the operative system of the server got from the packet response.
- **CONTENT_LENGTH**: It represents the content size of the HTTP header.
- **WHOIS_COUNTRY**: It is a categorical variable, its values are the countries we got from the server response (specifically, our script used the API of Whois).
- **WHOIS_STATEPRO**: It is a categorical variable, its values are the states we got from the server response (specifically, our script used the API of Whois).
- **WHOIS_REGDATE**: Whois provides the server registration date, so, this variable has date values with format DD/MM/YYYY HH:MM
- **WHOIS_UPDATED_DATE**: Through the Whois we got the last update date from the server analyzed.
- **TCP_CONVERSATION_EXCHANGE**: This variable is the number of TCP packets exchanged between the server and our honeypot client.
- **DIST_REMOTE_TCP_PORT**: It is the number of the ports detected and different to TCP.
- **REMOTE_IPS**: This variable has the total number of IPs connected to the honeypot.
- **APP_BYTES**: This is the number of bytes transferred.
- **SOURCE_APP_PACKETS**: Packets sent from the honeypot to the server.
- **REMOTE_APP_PACKETS**: Packets received from the server.
- **APP_PACKETS**: This is the total number of IP packets generated during the communication between the honeypot and the server.
- **DNS_QUERY_TIMES**: This is the number of DNS packets generated during the communication between the honeypot and the server.
- **TYPE**: This is a categorical variable, its values represent the type of web page analyzed, specifically, 1 is for malicious websites and 0 is for benign websites.
'''

# Size shared by every matplotlib figure on the page.
_FIGSIZE = (15, 5)


def _render(fig):
    """Send *fig* to Streamlit, then close it.

    Closing matters because Streamlit re-executes the script on every widget
    interaction; figures created through pyplot stay registered until closed.
    """
    st.pyplot(fig)
    plt.close(fig)


def _histogram_selector(df, columns, key):
    """Show a selectbox over *columns* and a 30-bin KDE histogram of the pick.

    *key* gives the selectbox an explicit identity, since every selectbox on
    the page uses the same 'Select Column:' label.
    """
    column = st.selectbox('Select Column:', columns, key=key)
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.histplot(df[column], bins=30, kde=True, ax=ax)
    _render(fig)


def app():
    """Render the EDA page for the malicious/benign website dataset.

    Reads 'web.jpg' and 'dataset.csv' via relative paths, then renders, in
    order: title and banner image, the raw dataframe, the column glossary, a
    pie chart of a selected categorical column, three histogram selectors for
    numerical columns, a box plot of URL_LENGTH per Type, and a scatter plot of
    URL_LENGTH against CONTENT_LENGTH coloured by Type.
    """
    # Title and subheader
    st.title('Malicious or Benign Website Detection')
    st.subheader('EDA for Malicious or Benign Website Detection')

    # Banner image; the context manager closes the file once it is displayed.
    with Image.open('web.jpg') as image:
        st.image(image, caption = 'Malicious or Benign Website')

    st.markdown('----')

    # Load the dataset into a pandas dataframe and show it.
    df = pd.read_csv('dataset.csv')
    st.dataframe(df)

    # Column glossary (explanation taken directly from the dataset's website).
    st.write('#### Dataset Explanation')
    st.write(_DATASET_EXPLANATION)

    # Pie chart of the value distribution of one categorical column.
    st.write('#### Plot Categorical Columns using Pie Chart')
    option_cat = st.selectbox(
        'Select Column:',
        ('CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO'),
        key='categorical_column',
    )
    counts = df[option_cat].value_counts()
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=180)
    _render(fig)

    # NOTE: a line plot over WHOIS_REGDATE / WHOIS_UPDATED_DATE split by Type
    # was sketched here in commented-out form and is currently disabled.

    # Histograms of numerical columns, offered in three groups.
    st.write('#### Plot Numerical Columns')
    _histogram_selector(
        df,
        ('URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH', 'APP_PACKETS', 'DNS_QUERY_TIMES'),
        key='numerical_column_general',
    )
    _histogram_selector(
        df,
        ('TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES'),
        key='numerical_column_traffic',
    )
    _histogram_selector(
        df,
        ('SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES'),
        key='numerical_column_packets',
    )

    # Box plot of URL_LENGTH for each Type value, coloured by Type.
    st.write('#### Plot Type Column Count with Boxplot')
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.boxplot(x='Type', y='URL_LENGTH', data=df, hue='Type', ax=ax)
    _render(fig)

    # Sort by Type before drawing the scatter plot.
    sorted_df = df.sort_values('Type')

    # NOTE: the heading says "Plotly" but the chart is drawn with
    # seaborn/matplotlib; the heading text is kept unchanged.
    st.write('#### Plotly Plot - URL_LENGTH vs CONTENT_LENGTH')
    fig, ax = plt.subplots(figsize=_FIGSIZE)
    sns.scatterplot(x='URL_LENGTH', y='CONTENT_LENGTH', data=sorted_df, hue='Type', ax=ax)
    _render(fig)
# Run the page only when this file is executed as a script.
if __name__ == '__main__':
    app()