Spaces:
Build error
Build error
Upload 10 files
Browse files
- eda.py +26 -0
- prediction.py +47 -27
eda.py
CHANGED
|
@@ -25,6 +25,32 @@ def app():
|
|
| 25 |
df = pd.read_csv('dataset.csv')
|
| 26 |
st.dataframe(df)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
object_columns = df.select_dtypes(include=['object']).columns
|
| 29 |
numerical_columns = df.select_dtypes(exclude=['object']).columns
|
| 30 |
|
|
|
|
| 25 |
df = pd.read_csv('dataset.csv')
|
| 26 |
st.dataframe(df)
|
| 27 |
|
| 28 |
+
# **Explanation directly taken from the website:**
|
| 29 |
+
|
| 30 |
+
# writing dataset explanation
|
| 31 |
+
st.write('#### Dataset Explanation')
|
| 32 |
+
st.write('''
|
| 33 |
+
- **URL**: It is the anonymous identification of the URL analyzed in the study.
|
| 34 |
+
- **URL_LENGTH**: It is the number of characters in the URL.
|
| 35 |
+
- **NUMBER_SPECIAL_CHARACTERS**: It is the number of special characters identified in the URL, such as "/", "%", "#", "&", ".", "=".
|
| 36 |
+
- **CHARSET**: It is a categorical value and its meaning is the character encoding standard (also called character set).
|
| 37 |
+
- **SERVER**: It is a categorical value and its meaning is the operative system of the server got from the packet response.
|
| 38 |
+
- **CONTENT_LENGTH**: It represents the content size of the HTTP header.
|
| 39 |
+
- **WHOIS_COUNTRY**: It is a categorical variable, its values are the countries we got from the server response (specifically, our script used the API of Whois).
|
| 40 |
+
- **WHOIS_STATEPRO**: It is a categorical variable, its values are the states we got from the server response (specifically, our script used the API of Whois).
|
| 41 |
+
- **WHOIS_REGDATE**: Whois provides the server registration date, so this variable has date values with the format DD/MM/YYYY HH:MM.
|
| 42 |
+
- **WHOIS_UPDATED_DATE**: Through the Whois we got the last update date from the server analyzed.
|
| 43 |
+
- **TCP_CONVERSATION_EXCHANGE**: This variable is the number of TCP packets exchanged between the server and our honeypot client.
|
| 44 |
+
- **DIST_REMOTE_TCP_PORT**: It is the number of the ports detected and different to TCP.
|
| 45 |
+
- **REMOTE_IPS**: This variable has the total number of IPs connected to the honeypot.
|
| 46 |
+
- **APP_BYTES**: This is the number of bytes transferred.
|
| 47 |
+
- **SOURCE_APP_PACKETS**: Packets sent from the honeypot to the server.
|
| 48 |
+
- **REMOTE_APP_PACKETS**: Packets received from the server.
|
| 49 |
+
- **APP_PACKETS**: This is the total number of IP packets generated during the communication between the honeypot and the server.
|
| 50 |
+
- **DNS_QUERY_TIMES**: This is the number of DNS packets generated during the communication between the honeypot and the server.
|
| 51 |
+
- **TYPE**: This is a categorical variable, its values represent the type of web page analyzed, specifically, 1 is for malicious websites and 0 is for benign websites.
|
| 52 |
+
''')
|
| 53 |
+
|
| 54 |
object_columns = df.select_dtypes(include=['object']).columns
|
| 55 |
numerical_columns = df.select_dtypes(exclude=['object']).columns
|
| 56 |
|
prediction.py
CHANGED
|
@@ -20,41 +20,61 @@ with open('column_names.pkl', 'rb') as f:
|
|
| 20 |
def app():
|
| 21 |
|
| 22 |
with st.form('from_website_data'):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
|
| 25 |
-
|
| 26 |
-
CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()))
|
| 27 |
-
|
| 28 |
server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server', 5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0', 9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5', 12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)', 14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0', 16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
|
| 29 |
-
|
| 30 |
-
SERVER = st.selectbox("Select Server", options=list(server_choice.values()))
|
| 31 |
|
| 32 |
whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other", 6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ", 11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
|
| 33 |
-
|
| 34 |
-
WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()))
|
| 35 |
-
|
| 36 |
WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV", 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT", 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California", 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah", 18: "WC1N"}
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50)
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50)
|
| 52 |
-
DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=14, value=5)
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
# submit button
|
| 60 |
submitted = st.form_submit_button('Predict')
|
|
|
|
| 20 |
def app():
|
| 21 |
|
| 22 |
with st.form('from_website_data'):
|
| 23 |
+
# write short description about the model
|
| 24 |
+
st.write('''
|
| 25 |
+
# **Malicious or Benign Website Detection**
|
| 26 |
+
- The model used for this detection is an `XGBoost` classifier with tuned hyperparameters.
|
| 27 |
+
- The model also used `SMOTENC` to handle imbalanced data during training.
|
| 28 |
+
- This model achieved a recall score of `0.93` on the test set for detecting malicious websites.
|
| 29 |
+
''')
|
| 30 |
|
| 31 |
+
url = st.text_input('URL', 'https://www.google.com', help='The URL that will be analyzed')
|
| 32 |
+
|
| 33 |
+
URL_LENGTH = len(url)
|
| 34 |
+
|
| 35 |
+
NUMBER_SPECIAL_CHARACTERS = sum(not x.isalnum() for x in url)
|
| 36 |
+
|
| 37 |
charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
|
| 38 |
+
|
|
|
|
|
|
|
| 39 |
server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server', 5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0', 9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5', 12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)', 14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0', 16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
|
|
|
|
|
|
|
| 40 |
|
| 41 |
whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other", 6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ", 11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
|
| 42 |
+
|
|
|
|
|
|
|
| 43 |
WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV", 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT", 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California", 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah", 18: "WC1N"}
|
| 44 |
|
| 45 |
+
CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()), help='The character encoding standard (also called character set)')
|
| 46 |
+
|
| 47 |
+
SERVER = st.selectbox("Select Server", options=list(server_choice.values()), help='The operative system of the server got from the packet response')
|
| 48 |
+
|
| 49 |
+
CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50, help='The content size of the HTTP header')
|
| 50 |
+
|
| 51 |
+
WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()), help='The countries we got from the server response (specifically, our script used the API of Whois)')
|
| 52 |
+
|
| 53 |
+
WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()), help='The states we got from the server response (specifically, our script used the API of Whois)')
|
| 54 |
+
|
| 55 |
+
WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', help='Whois provides the server registration date')
|
| 56 |
+
|
| 57 |
+
WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', help='Through the Whois we got the last update date from the server analyzed')
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50, help='This variable is the number of TCP packets exchanged between the server and our honeypot client')
|
| 60 |
+
|
| 61 |
+
DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0, help='It is the number of the ports detected and different to TCP')
|
| 62 |
+
|
| 63 |
+
REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0, help='This variable has the total number of IPs connected to the honeypot')
|
| 64 |
+
|
| 65 |
+
APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50, help='This is the number of bytes transferred')
|
| 66 |
+
|
| 67 |
+
REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes received from the server')
|
| 68 |
+
|
| 69 |
+
SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes sent to the server')
|
| 70 |
+
|
| 71 |
+
SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50, help='Packets sent from the honeypot to the server')
|
| 72 |
+
|
| 73 |
+
REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50, help='Packets received from the server')
|
| 74 |
+
|
| 75 |
+
APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50, help='This is the total number of IP packets generated during the communication between the honeypot and the server')
|
| 76 |
+
|
| 77 |
+
DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=20, value=0, help='This is the number of DNS packets generated during the communication between the honeypot and the server')
|
| 78 |
|
| 79 |
# submit button
|
| 80 |
submitted = st.form_submit_button('Predict')
|