xyncz committed on
Commit
c2e943b
·
1 Parent(s): dd0416b

Upload 10 files

Browse files
Files changed (10) hide show
  1. app.py +11 -0
  2. column_names.pkl +3 -0
  3. dataset.csv +0 -0
  4. eda.py +78 -0
  5. encoder.pkl +3 -0
  6. model.pkl +3 -0
  7. prediction.py +203 -0
  8. requirements.txt +9 -0
  9. scaler.pkl +3 -0
  10. web.jpg +0 -0
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

import eda
import prediction

# Map each sidebar label to the render function of its page module.
_PAGES = {
    'EDA': eda.app,
    'Prediction': prediction.app,
}

# Sidebar navigation for the two-page Streamlit app.
page = st.sidebar.selectbox('Select Page: ', ('EDA', 'Prediction'))

# Any non-EDA choice falls through to the prediction page,
# exactly like the original if/else dispatch.
_PAGES.get(page, prediction.app)()
column_names.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff2e2f25fd401dc4c2cdfd6a49eb8fe51a4d515fba9fe064e6b194318921fb29
3
+ size 1406
dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
eda.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+ import plotly.express as px
6
+ from PIL import Image
7
+
8
def app():
    """Render the EDA page: dataset preview plus categorical and numerical
    distribution plots for the malicious/benign website dataset."""
    # Page header.
    st.title('Malicious or Benign Website Prediction')
    st.subheader('EDA for Malicious or Benign Website Prediction')

    # Banner image shown under the header.
    banner = Image.open('web.jpg')
    st.image(banner, caption='Malicious or Benign Website')

    st.markdown('----')

    # Load and preview the raw dataset.
    df = pd.read_csv('dataset.csv')
    st.dataframe(df)

    # Column groupings (computed for parity with the original script;
    # not referenced by the plots below).
    cat_cols = df.select_dtypes(include=['object']).columns
    num_cols = df.select_dtypes(exclude=['object']).columns

    # Categorical distribution rendered as a pie chart.
    st.write('#### Plot Categorical Columns using Pie Chart')
    chosen_cat = st.selectbox('Select Column:', ('CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO'))
    counts = df[chosen_cat].value_counts()
    pie_fig = plt.figure(figsize=(15, 5))
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=180)
    st.pyplot(pie_fig)

    # Numerical distributions: one select box + histogram per column group.
    st.write('#### Plot Numerical Columns')
    for group in (
        ('URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH', 'APP_PACKETS', 'DNS_QUERY_TIMES'),
        ('TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES'),
        ('SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES'),
    ):
        chosen_num = st.selectbox('Select Column:', group)
        hist_fig = plt.figure(figsize=(15, 5))
        sns.histplot(df[chosen_num], bins=30, kde=True)
        st.pyplot(hist_fig)

    # Spread of URL length per website type.
    st.write('#### Plot Type Column Count with Boxplot')
    box_fig = plt.figure(figsize=(15, 5))
    sns.boxplot(x='Type', y='URL_LENGTH', data=df, hue='Type')
    st.pyplot(box_fig)

    # Sort rows by 'Type' so the scatter plot's hue ordering is stable.
    df = df.sort_values('Type')

    st.write('#### Plotly Plot - URL_LENGTH vs CONTENT_LENGTH')
    scatter_fig = plt.figure(figsize=(15, 5))
    sns.scatterplot(x='URL_LENGTH', y='CONTENT_LENGTH', data=df, hue='Type')
    st.pyplot(scatter_fig)


if __name__ == '__main__':
    app()
encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4781c04a1bac238f9ae766bd4588248a6594d41754ce98e101281915c96de98
3
+ size 659
model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b2497b428c40346d3449e91bd4521b187a16122c5c9d8835e5dae01f785f6ca
3
+ size 139783
prediction.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import numpy as np
import pickle

# Load all serialized training artifacts at import time.
# NOTE(review): pickle.load runs arbitrary code from the file — these
# artifacts must come from a trusted source (here: files shipped with the app).

with open("model.pkl", "rb") as f: # trained classifier
    model = pickle.load(f)

with open("scaler.pkl", "rb") as f: # scaler fitted on the numerical columns
    scaler = pickle.load(f)

with open("encoder.pkl", "rb") as f: # one-hot encoder for the categorical columns
    encoder = pickle.load(f)

with open('column_names.pkl', 'rb') as f: # column order expected by the model
    column_names = pickle.load(f)
19
+
20
+ # 'URL_LENGTH': URL_LENGTH,
21
+ # 'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
22
+ # 'CONTENT_LENGTH': CONTENT_LENGTH,
23
+ # 'WHOIS_REGDATE': WHOIS_REGDATE,
24
+ # 'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
25
+ # 'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
26
+ # 'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
27
+ # 'REMOTE_IPS': REMOTE_IPS,
28
+ # 'APP_BYTES': APP_BYTES,
29
+ # 'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
30
+ # 'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
31
+ # 'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
32
+ # 'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
33
+ # 'APP_PACKETS': APP_PACKETS,
34
+ # 'DNS_QUERY_TIMES': DNS_QUERY_TIMES
35
+
36
+ #INT BLOCK
37
+ # URL_LENGTH = np.random.randint(16.000000, 159.000000, size=10)
38
+ # NUMBER_SPECIAL_CHARACTERS = np.random.randint(5.000000, 28.000000, size=10)
39
+ # CONTENT_LENGTH = np.random.randint(0, 9806.000000, size=10)
40
+
41
+ # d1 = datetime.strptime('1990-07-26', '%Y-%m-%d')
42
+ # d2 = datetime.strptime('2017-04-14', '%Y-%m-%d')
43
+ # WHOIS_UPDATED_DATE = random_date(d1, d2)
44
+ # WHOIS_REGDATE = random_date(d1, d2)
45
+
46
+ # TCP_CONVERSATION_EXCHANGE = np.random.randint(0, 84.000000, size=10)
47
+ # DIST_REMOTE_TCP_PORT = np.random.randint(0, 20.000000, size=10)
48
+ # REMOTE_IPS = np.random.randint(0, 16, size=10)
49
+ # APP_BYTES = np.random.randint(0, 9302, size=10)
50
+
51
+ # SOURCE_APP_PACKETS = np.random.randint(0, 103, size=10)
52
+ # REMOTE_APP_PACKETS = np.random.randint(0, 99, size=10)
53
+ # SOURCE_APP_BYTES = np.random.randint(0, 38681, size=10)
54
+ # REMOTE_APP_BYTES = np.random.randint(0, 10693, size=10)
55
+ # APP_PACKETS = np.random.randint(0, 103, size=10)
56
+ # DNS_QUERY_TIMES = np.random.randint(0, 14, size=10)
57
+
58
+
59
def app():
    """Render the prediction page.

    Collects website features through a Streamlit form, rebuilds the
    training-time feature matrix (scaled numericals + one-hot categoricals,
    reindexed to ``column_names``), and displays the model's verdict:
    class 0 is rendered as "Benign", anything else as "Malicious".
    """

    with st.form('from_website_data'):

        # Categorical inputs. The dict values enumerate the categories used
        # during training; only the values are offered as selectbox options.
        charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}

        CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()))

        server_choice = {1: "other", 2: "Apache", 3: "nginx", 4: "cloudflare-nginx", 5: "nginx/1.12.0", 6: "Apache/2.2.22 (Debian)", 7: "nginx/1.8.0", 8: "nginx/1.10.1", 9: "Microsoft-HTTPAPI/2.0", 10: "Microsoft-IIS/6.0", 11: "Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4"}

        SERVER = st.selectbox("Select Server", options=list(server_choice.values()))

        whois_country_choice = {1: "US", 2: "other", 3: "unknown", 4: "PA", 5: "GB", 6: "CN", 7: "KR", 8: "CA", 9: "UK", 10: "CZ", 11: "FR"}

        WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()))

        WHOIS_STATEPRO_choice = {1: "other", 2: "CA", 3: "unknown", 4: "California", 5: "PA", 6: "Washington", 7: "Arizona", 8: "ON", 9: "WA", 10: "FL"}

        WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()))

        # Numerical inputs; min/max bounds mirror the ranges observed in the
        # training dataset.
        URL_LENGTH = st.number_input('URL_LENGTH', min_value=16, max_value=159, value=50)
        NUMBER_SPECIAL_CHARACTERS = st.number_input('NUMBER_SPECIAL_CHARACTERS', min_value=5, max_value=28, value=5)
        CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50)
        TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50)
        DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0)
        REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0)
        APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50)
        SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50)
        REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50)
        SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=38681, value=50)
        REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=10693, value=50)
        APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50)
        DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=14, value=5)

        # WHOIS registration/update dates, e.g. 2006-03-22.
        WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', format="YYYY-MM-DD")
        WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', format="YYYY-MM-DD")

        # Submit button for the form.
        submitted = st.form_submit_button('Predict')

    # Collect every widget value into a single inference record.
    data_inf = {
        'URL_LENGTH': URL_LENGTH,
        'NUMBER_SPECIAL_CHARACTERS': NUMBER_SPECIAL_CHARACTERS,
        'CONTENT_LENGTH': CONTENT_LENGTH,
        'WHOIS_REGDATE': WHOIS_REGDATE,
        'WHOIS_UPDATED_DATE': WHOIS_UPDATED_DATE,
        'TCP_CONVERSATION_EXCHANGE': TCP_CONVERSATION_EXCHANGE,
        'DIST_REMOTE_TCP_PORT': DIST_REMOTE_TCP_PORT,
        'REMOTE_IPS': REMOTE_IPS,
        'APP_BYTES': APP_BYTES,
        'SOURCE_APP_PACKETS': SOURCE_APP_PACKETS,
        'REMOTE_APP_PACKETS': REMOTE_APP_PACKETS,
        'SOURCE_APP_BYTES': SOURCE_APP_BYTES,
        'REMOTE_APP_BYTES': REMOTE_APP_BYTES,
        'APP_PACKETS': APP_PACKETS,
        'DNS_QUERY_TIMES': DNS_QUERY_TIMES,
        'CHARSET': CHARSET,
        'SERVER': SERVER,
        'WHOIS_COUNTRY': WHOIS_COUNTRY,
        'WHOIS_STATEPRO': WHOIS_STATEPRO
    }

    # Single-row DataFrame for the preprocessing pipeline below.
    data_inf = pd.DataFrame([data_inf])
    # st.dataframe(data_inf)

    def encode_and_create_dataframe_train(df, column):
        # One-hot encode a single categorical column and return it as a
        # DataFrame plus the encoder used.
        # NOTE(review): this re-FITS the shared module-level `encoder` on the
        # one inference row, so only the selected category yields a column;
        # the later reindex(columns=column_names) + fillna(0) restores the
        # remaining training columns as zeros. Confirm this mirrors how the
        # encoder was applied per-column at training time.
        transformed_data = encoder.fit_transform(df[[column]])

        # Feature names for the encoded column (e.g. "CHARSET_UTF-8").
        feature_names = encoder.get_feature_names_out(input_features=[column])

        # Wrap the sparse result into a DataFrame aligned with df's index.
        transformed_df = pd.DataFrame(transformed_data.toarray(),
        index=df.index,
        columns=feature_names)

        return transformed_df, encoder

    # Runs only after the user submits the form.
    if submitted:
        # Split into numerical (incl. dates) and categorical columns.
        data_inf_num = data_inf[['URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CONTENT_LENGTH',
        'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE', 'TCP_CONVERSATION_EXCHANGE',
        'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES', 'SOURCE_APP_PACKETS',
        'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES', 'APP_PACKETS',
        'DNS_QUERY_TIMES']]
        data_inf_cat = data_inf[['CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO']]

        # Convert the date widgets' values to pandas datetimes.
        # NOTE(review): assigning into this slice can raise
        # SettingWithCopyWarning; works here but worth a .copy() upstream.
        data_inf_num['WHOIS_REGDATE'] = pd.to_datetime(data_inf_num['WHOIS_REGDATE'])
        data_inf_num['WHOIS_UPDATED_DATE'] = pd.to_datetime(data_inf_num['WHOIS_UPDATED_DATE'])

        # Keep only the year, matching the training-time date treatment.
        data_inf_num['WHOIS_REGDATE'] = data_inf_num['WHOIS_REGDATE'].dt.year
        data_inf_num['WHOIS_UPDATED_DATE'] = data_inf_num['WHOIS_UPDATED_DATE'].dt.year

        # Scale the numerical block with the fitted scaler.
        data_inf_num_scaled = scaler.transform(data_inf_num)

        # Back to a DataFrame so it can be concatenated with the encodings.
        data_inf_num_scaled = pd.DataFrame(data_inf_num_scaled, columns=data_inf_num.columns)

        # One-hot encode each categorical column separately.
        capped_CHARSET, ohe_CHARSET = encode_and_create_dataframe_train(data_inf_cat, 'CHARSET')
        capped_SERVER, ohe_SERVER = encode_and_create_dataframe_train(data_inf_cat, 'SERVER')
        capped_WHOIS_COUNTRY, ohe_WHOIS_COUNTRY = encode_and_create_dataframe_train(data_inf_cat, 'WHOIS_COUNTRY')
        capped_WHOIS_STATEPRO, ohe_WHOIS_STATEPRO = encode_and_create_dataframe_train(data_inf_cat, 'WHOIS_STATEPRO')

        # Assemble the full feature matrix (numerical + all encodings).
        data_inf_final = pd.concat([data_inf_num_scaled, capped_CHARSET, capped_SERVER, capped_WHOIS_COUNTRY, capped_WHOIS_STATEPRO], axis=1)

        # Sanity checks surfaced to the UI when column names collide.
        if len(column_names) != len(set(column_names)):
            st.write("column_names contains duplicates")

        if len(data_inf_final.columns) != len(set(data_inf_final.columns)):
            st.write("data_inf_final has duplicate column names")

        # Reorder/complete the columns to the exact training layout;
        # columns absent from this row come back as NaN.
        data_inf_final = data_inf_final.reindex(columns=column_names)

        # Missing-value count (result is discarded — debugging leftover).
        data_inf_final.isnull().sum()

        # Zero-fill the NaNs created by reindexing (unseen categories).
        data_inf_final = data_inf_final.fillna(0)

        # Predict with the loaded model.

        y_pred_inf = model.predict(data_inf_final)

        # Echo the raw inputs back to the user.
        st.dataframe(data_inf)

        # Class 0 = benign (green), anything else = malicious (red).
        if y_pred_inf == 0:
            st.markdown("<h1 style='text-align: center; color: green;'>Predicted Class: Benign</h1>", unsafe_allow_html=True)
        else:
            st.markdown("<h1 style='text-align: center; color: red;'>Predicted Class: Malicious</h1>", unsafe_allow_html=True)


if __name__ == '__main__':
    app()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ seaborn
4
+ matplotlib
5
+ numpy
6
+ plotly
7
+ Pillow
8
+ xgboost
9
+ scikit-learn==1.2.2
scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:861784cd69b250f457bfbf7b7ef84c51d1afa037fc952e546886c2e703464b0f
3
+ size 1003
web.jpg ADDED