Spaces:
Build error
Build error
Upload 10 files
Browse files- app.py +11 -0
- column_names.pkl +3 -0
- dataset.csv +0 -0
- eda.py +78 -0
- encoder.pkl +3 -0
- model.pkl +3 -0
- prediction.py +203 -0
- requirements.txt +9 -0
- scaler.pkl +3 -0
- web.jpg +0 -0
app.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st

import eda
import prediction

# Sidebar navigation for the two-page Streamlit app.
page = st.sidebar.selectbox('Select Page: ', ('EDA', 'Prediction'))

# Each page label maps to the module whose app() renders that page.
_PAGES = {'EDA': eda}

# Render the selected page; anything other than 'EDA' falls through to
# the prediction page, matching the original if/else behavior.
_PAGES.get(page, prediction).app()
|
| 11 |
+
|
column_names.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff2e2f25fd401dc4c2cdfd6a49eb8fe51a4d515fba9fe064e6b194318921fb29
|
| 3 |
+
size 1406
|
dataset.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eda.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import seaborn as sns
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import plotly.express as px
|
| 6 |
+
from PIL import Image
|
| 7 |
+
|
| 8 |
+
def app():
    """Render the EDA page for the Malicious/Benign Website dataset.

    Loads ``dataset.csv``, previews it, and draws interactive plots:
    a pie chart for a chosen categorical column, histograms for three
    groups of numerical columns, a boxplot of URL_LENGTH by Type, and
    a scatterplot of URL_LENGTH vs CONTENT_LENGTH colored by Type.
    """
    # Page header
    st.title('Malicious or Benign Website Prediction')
    st.subheader('EDA for Malicious or Benign Website Prediction')

    # Banner image shown under the header
    image = Image.open('web.jpg')
    st.image(image, caption='Malicious or Benign Website')

    st.markdown('----')

    # Load and preview the dataset
    df = pd.read_csv('dataset.csv')
    st.dataframe(df)

    # --- Categorical columns: pie chart of value counts ---
    st.write('#### Plot Categorical Columns using Pie Chart')
    option_cat = st.selectbox('Select Column:',
                              ('CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO'),
                              key='eda_cat')
    counts = df[option_cat].value_counts()  # compute once; original called it twice
    fig = plt.figure(figsize=(15, 5))
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=180)
    st.pyplot(fig)
    plt.close(fig)  # close figures so repeated Streamlit reruns don't accumulate memory

    # --- Numerical columns: histograms with KDE overlay ---
    st.write('#### Plot Numerical Columns')
    option = st.selectbox('Select Column:',
                          ('URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH',
                           'APP_PACKETS', 'DNS_QUERY_TIMES'),
                          key='eda_num_1')
    fig = plt.figure(figsize=(15, 5))
    sns.histplot(df[option], bins=30, kde=True)
    st.pyplot(fig)
    plt.close(fig)

    option_pay = st.selectbox('Select Column:',
                              ('TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT',
                               'REMOTE_IPS', 'APP_BYTES'),
                              key='eda_num_2')
    fig = plt.figure(figsize=(15, 5))
    sns.histplot(df[option_pay], bins=30, kde=True)
    st.pyplot(fig)
    plt.close(fig)

    option_bill_amt = st.selectbox('Select Column:',
                                   ('SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS',
                                    'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES'),
                                   key='eda_num_3')
    fig = plt.figure(figsize=(15, 5))
    sns.histplot(df[option_bill_amt], bins=30, kde=True)
    st.pyplot(fig)
    plt.close(fig)

    # --- URL_LENGTH distribution per target class ---
    st.write('#### Plot Type Column Count with Boxplot')
    fig = plt.figure(figsize=(15, 5))
    sns.boxplot(x='Type', y='URL_LENGTH', data=df, hue='Type')
    st.pyplot(fig)
    plt.close(fig)

    # Sort by 'Type' so hue colors/legend ordering is deterministic
    df = df.sort_values('Type')

    # --- URL_LENGTH vs CONTENT_LENGTH scatterplot colored by Type ---
    # NOTE(review): the original header said "Plotly Plot" but the chart is a
    # seaborn scatterplot; the header text was corrected to match the chart.
    st.write('#### Scatter Plot - URL_LENGTH vs CONTENT_LENGTH')
    fig = plt.figure(figsize=(15, 5))
    sns.scatterplot(x='URL_LENGTH', y='CONTENT_LENGTH', data=df, hue='Type')
    st.pyplot(fig)
    plt.close(fig)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Allow running this page standalone (e.g. `streamlit run eda.py`);
# when imported by app.py this guard does nothing.
if __name__ == '__main__':
    app()
|
encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4781c04a1bac238f9ae766bd4588248a6594d41754ce98e101281915c96de98
|
| 3 |
+
size 659
|
model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b2497b428c40346d3449e91bd4521b187a16122c5c9d8835e5dae01f785f6ca
|
| 3 |
+
size 139783
|
prediction.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pickle
|
| 5 |
+
|
| 6 |
+
# load all files
|
| 7 |
+
|
| 8 |
+
# Load the fitted artifacts produced at training time. They are read once at
# module import, so prediction.app() reuses them across Streamlit reruns.
with open("model.pkl", "rb") as f:  # trained classifier
    model = pickle.load(f)

with open("scaler.pkl", "rb") as f:  # scaler fitted on the numerical features
    scaler = pickle.load(f)

with open("encoder.pkl", "rb") as f:  # one-hot encoder for categorical features
    encoder = pickle.load(f)

with open('column_names.pkl', 'rb') as f:  # column order expected by the model
    column_names = pickle.load(f)
|
| 19 |
+
|
| 20 |
+
# 'URL_LENGTH': URL_LENGTH,
|
| 21 |
+
# 'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
|
| 22 |
+
# 'CONTENT_LENGTH': CONTENT_LENGTH,
|
| 23 |
+
# 'WHOIS_REGDATE': WHOIS_REGDATE,
|
| 24 |
+
# 'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
|
| 25 |
+
# 'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
|
| 26 |
+
# 'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
|
| 27 |
+
# 'REMOTE_IPS': REMOTE_IPS,
|
| 28 |
+
# 'APP_BYTES': APP_BYTES,
|
| 29 |
+
# 'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
|
| 30 |
+
# 'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
|
| 31 |
+
# 'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
|
| 32 |
+
# 'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
|
| 33 |
+
# 'APP_PACKETS': APP_PACKETS,
|
| 34 |
+
# 'DNS_QUERY_TIMES': DNS_QUERY_TIMES
|
| 35 |
+
|
| 36 |
+
#INT BLOCK
|
| 37 |
+
# URL_LENGTH = np.random.randint(16.000000, 159.000000, size=10)
|
| 38 |
+
# NUMBER_SPECIAL_CHARACTERS = np.random.randint(5.000000, 28.000000, size=10)
|
| 39 |
+
# CONTENT_LENGTH = np.random.randint(0, 9806.000000, size=10)
|
| 40 |
+
|
| 41 |
+
# d1 = datetime.strptime('1990-07-26', '%Y-%m-%d')
|
| 42 |
+
# d2 = datetime.strptime('2017-04-14', '%Y-%m-%d')
|
| 43 |
+
# WHOIS_UPDATED_DATE = random_date(d1, d2)
|
| 44 |
+
# WHOIS_REGDATE = random_date(d1, d2)
|
| 45 |
+
|
| 46 |
+
# TCP_CONVERSATION_EXCHANGE = np.random.randint(0, 84.000000, size=10)
|
| 47 |
+
# DIST_REMOTE_TCP_PORT = np.random.randint(0, 20.000000, size=10)
|
| 48 |
+
# REMOTE_IPS = np.random.randint(0, 16, size=10)
|
| 49 |
+
# APP_BYTES = np.random.randint(0, 9302, size=10)
|
| 50 |
+
|
| 51 |
+
# SOURCE_APP_PACKETS = np.random.randint(0, 103, size=10)
|
| 52 |
+
# REMOTE_APP_PACKETS = np.random.randint(0, 99, size=10)
|
| 53 |
+
# SOURCE_APP_BYTES = np.random.randint(0, 38681, size=10)
|
| 54 |
+
# REMOTE_APP_BYTES = np.random.randint(0, 10693, size=10)
|
| 55 |
+
# APP_PACKETS = np.random.randint(0, 103, size=10)
|
| 56 |
+
# DNS_QUERY_TIMES = np.random.randint(0, 14, size=10)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def app():
    """Render the Prediction page.

    Collects website features in a Streamlit form, applies the same
    preprocessing as training (date->year, scaling, one-hot encoding,
    column reindexing), and displays the model's Benign/Malicious class.
    """

    with st.form('from_website_data'):

        # --- Categorical inputs (choices mirror the training data's top values) ---
        charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
        CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()))

        server_choice = {1: "other", 2: "Apache", 3: "nginx", 4: "cloudflare-nginx", 5: "nginx/1.12.0", 6: "Apache/2.2.22 (Debian)", 7: "nginx/1.8.0", 8: "nginx/1.10.1", 9: "Microsoft-HTTPAPI/2.0", 10: "Microsoft-IIS/6.0", 11: "Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4"}
        SERVER = st.selectbox("Select Server", options=list(server_choice.values()))

        whois_country_choice = {1: "US", 2: "other", 3: "unknown", 4: "PA", 5: "GB", 6: "CN", 7: "KR", 8: "CA", 9: "UK", 10: "CZ", 11: "FR"}
        WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()))

        WHOIS_STATEPRO_choice = {1: "other", 2: "CA", 3: "unknown", 4: "California", 5: "PA", 6: "Washington", 7: "Arizona", 8: "ON", 9: "WA", 10: "FL"}
        WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()))

        # --- Numerical inputs (min/max follow the training data's observed ranges) ---
        URL_LENGTH = st.number_input('URL_LENGTH', min_value=16, max_value=159, value=50)
        NUMBER_SPECIAL_CHARACTERS = st.number_input('NUMBER_SPECIAL_CHARACTERS', min_value=5, max_value=28, value=5)
        CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50)
        TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50)
        DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0)
        REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0)
        APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50)
        SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50)
        REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50)
        SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=38681, value=50)
        REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=10693, value=50)
        APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50)
        DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=14, value=5)

        # --- WHOIS date inputs (e.g. 2006-03-22) ---
        WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', format="YYYY-MM-DD")
        WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', format="YYYY-MM-DD")

        # Submit button; the form defers reruns until it is pressed
        submitted = st.form_submit_button('Predict')

    # Assemble a single-row inference DataFrame from the form values
    data_inf = {
        'URL_LENGTH': URL_LENGTH,
        'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
        'CONTENT_LENGTH': CONTENT_LENGTH,
        'WHOIS_REGDATE': WHOIS_REGDATE,
        'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
        'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
        'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
        'REMOTE_IPS': REMOTE_IPS,
        'APP_BYTES': APP_BYTES,
        'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
        'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
        'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
        'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
        'APP_PACKETS': APP_PACKETS,
        'DNS_QUERY_TIMES': DNS_QUERY_TIMES,
        'CHARSET': CHARSET,
        'SERVER': SERVER,
        'WHOIS_COUNTRY': WHOIS_COUNTRY,
        'WHOIS_STATEPRO': WHOIS_STATEPRO
    }
    data_inf = pd.DataFrame([data_inf])

    def encode_and_create_dataframe_train(df, column):
        """One-hot encode one categorical column; return (encoded df, encoder).

        NOTE(review): this calls encoder.fit_transform at inference time,
        refitting the shared encoder on the single submitted row rather than
        reusing the categories learned at training. Unseen-category handling
        therefore relies entirely on the reindex/fillna(0) step below —
        confirm this matches the training pipeline before changing it.
        """
        transformed_data = encoder.fit_transform(df[[column]])

        # Column names such as 'CHARSET_UTF-8' for the encoded features
        feature_names = encoder.get_feature_names_out(input_features=[column])

        transformed_df = pd.DataFrame(transformed_data.toarray(),
                                      index=df.index,
                                      columns=feature_names)
        return transformed_df, encoder

    # Run preprocessing + prediction only after the user submits the form
    if submitted:
        # Split numerical vs categorical features. .copy() prevents the
        # date-to-year assignments below from triggering SettingWithCopyWarning
        # (and possibly writing to a view of data_inf).
        data_inf_num = data_inf[['URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH',
                                 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE', 'TCP_CONVERSATION_EXCHANGE',
                                 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES', 'SOURCE_APP_PACKETS',
                                 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES', 'APP_PACKETS',
                                 'DNS_QUERY_TIMES']].copy()
        data_inf_cat = data_inf[['CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO']]

        # Reduce the WHOIS dates to their year, mirroring training preprocessing
        data_inf_num['WHOIS_REGDATE'] = pd.to_datetime(data_inf_num['WHOIS_REGDATE']).dt.year
        data_inf_num['WHOIS_UPDATED_DATE'] = pd.to_datetime(data_inf_num['WHOIS_UPDATED_DATE']).dt.year

        # Scale numerical features with the training-time scaler
        data_inf_num_scaled = scaler.transform(data_inf_num)
        data_inf_num_scaled = pd.DataFrame(data_inf_num_scaled, columns=data_inf_num.columns)

        # One-hot encode each categorical column
        capped_CHARSET, ohe_CHARSET = encode_and_create_dataframe_train(data_inf_cat, 'CHARSET')
        capped_SERVER, ohe_SERVER = encode_and_create_dataframe_train(data_inf_cat, 'SERVER')
        capped_WHOIS_COUNTRY, ohe_WHOIS_COUNTRY = encode_and_create_dataframe_train(data_inf_cat, 'WHOIS_COUNTRY')
        capped_WHOIS_STATEPRO, ohe_WHOIS_STATEPRO = encode_and_create_dataframe_train(data_inf_cat, 'WHOIS_STATEPRO')

        # Combine scaled numerical + encoded categorical features
        data_inf_final = pd.concat([data_inf_num_scaled, capped_CHARSET, capped_SERVER, capped_WHOIS_COUNTRY, capped_WHOIS_STATEPRO], axis=1)

        # Surface duplicate-column diagnostics in the UI
        if len(column_names) != len(set(column_names)):
            st.write("column_names contains duplicates")

        if len(data_inf_final.columns) != len(set(data_inf_final.columns)):
            st.write("data_inf_final has duplicate column names")

        # Align to the training column order; one-hot columns the single row
        # didn't produce become NaN and are zero-filled below
        data_inf_final = data_inf_final.reindex(columns=column_names)
        data_inf_final = data_inf_final.fillna(0)

        # Predict with the trained classifier
        y_pred_inf = model.predict(data_inf_final)

        # Echo back the submitted row
        st.dataframe(data_inf)

        # model.predict returns an array; index the single prediction explicitly
        # instead of relying on length-1 ndarray truthiness
        if y_pred_inf[0] == 0:
            st.markdown("<h1 style='text-align: center; color: green;'>Predicted Class: Benign</h1>", unsafe_allow_html=True)
        else:
            st.markdown("<h1 style='text-align: center; color: red;'>Predicted Class: Malicious</h1>", unsafe_allow_html=True)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# Allow running this page standalone (e.g. `streamlit run prediction.py`);
# when imported by app.py this guard does nothing.
if __name__ == '__main__':
    app()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
seaborn
|
| 4 |
+
matplotlib
|
| 5 |
+
numpy
|
| 6 |
+
plotly
|
| 7 |
+
Pillow
|
| 8 |
+
xgboost
|
| 9 |
+
scikit-learn==1.2.2
|
scaler.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:861784cd69b250f457bfbf7b7ef84c51d1afa037fc952e546886c2e703464b0f
|
| 3 |
+
size 1003
|
web.jpg
ADDED
|