xyncz committed on
Commit
6f30b20
·
verified ·
1 Parent(s): 4c97072

Upload 10 files

Browse files
Files changed (2) hide show
  1. eda.py +26 -0
  2. prediction.py +47 -27
eda.py CHANGED
@@ -25,6 +25,32 @@ def app():
25
  df = pd.read_csv('dataset.csv')
26
  st.dataframe(df)
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  object_columns = df.select_dtypes(include=['object']).columns
29
  numerical_columns = df.select_dtypes(exclude=['object']).columns
30
 
 
25
  df = pd.read_csv('dataset.csv')
26
  st.dataframe(df)
27
 
28
+ # **Explanation directly taken from the website:**
29
+
30
+ # writing dataset explanation
31
+ st.write('#### Dataset Explanation')
32
+ st.write('''
33
+ - **URL**: It is the anonymous identification of the URL analyzed in the study.
34
+ - **URL_LENGTH**: It is the number of characters in the URL.
35
+ - **NUMBER_SPECIAL_CHARACTERS**: It is the number of special characters identified in the URL, such as, “/”, “%”, “#”, “&”, “. “, “=”.
36
+ - **CHARSET**: It is a categorical value and its meaning is the character encoding standard (also called character set).
37
+ - **SERVER**: It is a categorical value and its meaning is the operative system of the server got from the packet response.
38
+ - **CONTENT_LENGTH**: It represents the content size of the HTTP header.
39
+ - **WHOIS_COUNTRY**: It is a categorical variable, its values are the countries we got from the server response (specifically, our script used the API of Whois).
40
+ - **WHOIS_STATEPRO**: It is a categorical variable, its values are the states we got from the server response (specifically, our script used the API of Whois).
41
+ - **WHOIS_REGDATE**: Whois provides the server registration date, so, this variable has date values with format DD/MM/YYYY HH:MM
42
+ - **WHOIS_UPDATED_DATE**: Through the Whois we got the last update date from the server analyzed.
43
+ - **TCP_CONVERSATION_EXCHANGE**: This variable is the number of TCP packets exchanged between the server and our honeypot client.
44
+ - **DIST_REMOTE_TCP_PORT**: It is the number of the ports detected and different to TCP.
45
+ - **REMOTE_IPS**: This variable has the total number of IPs connected to the honeypot.
46
+ - **APP_BYTES**: This is the number of bytes transferred.
47
+ - **SOURCE_APP_PACKETS**: Packets sent from the honeypot to the server.
48
+ - **REMOTE_APP_PACKETS**: Packets received from the server.
49
+ - **APP_PACKETS**: This is the total number of IP packets generated during the communication between the honeypot and the server.
50
+ - **DNS_QUERY_TIMES**: This is the number of DNS packets generated during the communication between the honeypot and the server.
51
+ - **TYPE**: This is a categorical variable, its values represent the type of web page analyzed, specifically, 1 is for malicious websites and 0 is for benign websites.
52
+ ''')
53
+
54
  object_columns = df.select_dtypes(include=['object']).columns
55
  numerical_columns = df.select_dtypes(exclude=['object']).columns
56
 
prediction.py CHANGED
@@ -20,41 +20,61 @@ with open('column_names.pkl', 'rb') as f:
20
  def app():
21
 
22
  with st.form('from_website_data'):
 
 
 
 
 
 
 
23
 
 
 
 
 
 
 
24
  charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
25
-
26
- CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()))
27
-
28
  server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server', 5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0', 9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5', 12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)', 14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0', 16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
29
-
30
- SERVER = st.selectbox("Select Server", options=list(server_choice.values()))
31
 
32
  whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other", 6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ", 11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
33
-
34
- WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()))
35
-
36
  WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV", 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT", 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California", 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah", 18: "WC1N"}
37
 
38
- WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()))
39
-
40
- URL_LENGTH = st.number_input('URL_LENGTH', min_value=16, max_value=159, value=50)
41
- NUMBER_SPECIAL_CHARACTERS = st.number_input('NUMBER_SPECIAL_CHARACTERS', min_value=5, max_value=28, value=5)
42
- CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50)
43
- TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50)
44
- DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0)
45
- REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0)
46
- APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50)
47
- SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50)
48
- REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50)
49
- SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=38681, value=50)
50
- REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=10693, value=50)
51
- APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50)
52
- DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=14, value=5)
53
 
54
- # input for date
55
- # 2006-03-22
56
- WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', format="YYYY-MM-DD")
57
- WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', format="YYYY-MM-DD")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  #submit buttion
60
  submitted = st.form_submit_button('Predict')
 
20
  def app():
21
 
22
  with st.form('from_website_data'):
23
+ # write short description about the model
24
+ st.write('''
25
+ # **Malicious or Benign Website Detection**
26
+ - The model used for this detection is `XGBoost` Classifier which has Hyperparameter Tuned.
27
+ - The model also used `SMOTENC` to handle imbalanced data during training.
28
+ - This model achieved `0.93` recall score on the test set to detect malicious website.
29
+ ''')
30
 
31
+ url = st.text_input('URL', 'https://www.google.com', help='The URL that will be analyzed')
32
+
33
+ URL_LENGTH = len(url)
34
+
35
+ NUMBER_SPECIAL_CHARACTERS = sum(not x.isalnum() for x in url)
36
+
37
  charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
38
+
 
 
39
  server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server', 5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0', 9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5', 12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)', 14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0', 16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
 
 
40
 
41
  whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other", 6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ", 11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
42
+
 
 
43
  WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV", 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT", 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California", 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah", 18: "WC1N"}
44
 
45
+ CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()), help='The character encoding standard (also called character set)')
46
+
47
+ SERVER = st.selectbox("Select Server", options=list(server_choice.values()), help='The operative system of the server got from the packet response')
48
+
49
+ CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50, help='The content size of the HTTP header')
50
+
51
+ WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()), help='The countries we got from the server response (specifically, our script used the API of Whois)')
52
+
53
+ WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()), help='The states we got from the server response (specifically, our script used the API of Whois)')
54
+
55
+ WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', help='Whois provides the server registration date')
56
+
57
+ WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', help='Through the Whois we got the last update date from the server analyzed')
 
 
58
 
59
+ TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50, help='This variable is the number of TCP packets exchanged between the server and our honeypot client')
60
+
61
+ DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0, help='It is the number of the ports detected and different to TCP')
62
+
63
+ REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0, help='This variable has the total number of IPs connected to the honeypot')
64
+
65
+ APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50, help='This is the number of bytes transferred')
66
+
67
+ REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes received from the server')
68
+
69
+ SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes sent to the server')
70
+
71
+ SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50, help='Packets sent from the honeypot to the server')
72
+
73
+ REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50, help='Packets received from the server')
74
+
75
+ APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50, help='This is the total number of IP packets generated during the communication between the honeypot and the server')
76
+
77
+ DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=20, value=0, help='This is the number of DNS packets generated during the communication between the honeypot and the server')
78
 
79
  #submit buttion
80
  submitted = st.form_submit_button('Predict')