Spaces:

xyncz
/

malicious-website-detection

Build error

App Files Files Community

xyncz committed on Jan 11, 2024

Commit

6f30b20

verified ·

1 Parent(s): 4c97072

Upload 10 files

Browse files

Files changed (2) hide show

eda.py +26 -0
prediction.py +47 -27

eda.py CHANGED Viewed

@@ -25,6 +25,32 @@ def app():
     df = pd.read_csv('dataset.csv')
     st.dataframe(df)
     object_columns = df.select_dtypes(include=['object']).columns
     numerical_columns = df.select_dtypes(exclude=['object']).columns

     df = pd.read_csv('dataset.csv')
     st.dataframe(df)
+    # **Explanation directly taken from the website:**
+    # writing dataset explanation
+    st.write('#### Dataset Explanation')
+    st.write('''
+    - **URL**: It is the anonymous identification of the URL analyzed in the study.
+    - **URL_LENGTH**: It is the number of characters in the URL.
+    - **NUMBER_SPECIAL_CHARACTERS**: It is the number of special characters identified in the URL, such as, “/”, “%”, “#”, “&”, “. “, “=”.
+    - **CHARSET**: It is a categorical value and its meaning is the character encoding standard (also called character set).
+    - **SERVER**: It is a categorical value and its meaning is the operative system of the server got from the packet response.
+    - **CONTENT_LENGTH**: It represents the content size of the HTTP header.
+    - **WHOIS_COUNTRY**: It is a categorical variable, its values are the countries we got from the server response (specifically, our script used the API of Whois).
+    - **WHOIS_STATEPRO**: It is a categorical variable, its values are the states we got from the server response (specifically, our script used the API of Whois).
+    - **WHOIS_REGDATE**: Whois provides the server registration date, so, this variable has date values with format DD/MM/YYY HH:MM
+    - **WHOIS_UPDATED_DATE**: Through the Whois we got the last update date from the server analyzed.
+    - **TCP_CONVERSATION_EXCHANGE**: This variable is the number of TCP packets exchanged between the server and our honeypot client.
+    - **DIST_REMOTE_TCP_PORT**: It is the number of the ports detected and different to TCP.
+    - **REMOTE_IPS**: This variable has the total number of IPs connected to the honeypot.
+    - **APP_BYTES**: This is the number of bytes transferred.
+    - **SOURCE_APP_PACKETS**: Packets sent from the honeypot to the server.
+    - **REMOTE_APP_PACKETS**: Packets received from the server.
+    - **APP_PACKETS**: This is the total number of IP packets generated during the communication between the honeypot and the server.
+    - **DNS_QUERY_TIMES**: This is the number of DNS packets generated during the communication between the honeypot and the server.
+    - **TYPE**: This is a categorical variable, its values represent the type of web page analyzed, specifically, 1 is for malicious websites and 0 is for benign websites.
+    ''')
     object_columns = df.select_dtypes(include=['object']).columns
     numerical_columns = df.select_dtypes(exclude=['object']).columns

prediction.py CHANGED Viewed

@@ -20,41 +20,61 @@ with open('column_names.pkl', 'rb') as f:
 def app():
     with st.form('from_website_data'):
         charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
-        CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()))
         server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server', 5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0', 9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5', 12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)', 14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0', 16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
-        SERVER = st.selectbox("Select Server", options=list(server_choice.values()))
         whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other", 6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ", 11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
-        WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()))
         WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV", 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT", 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California", 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah", 18: "WC1N"}
-        WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()))
-        URL_LENGTH = st.number_input('URL_LENGTH', min_value=16, max_value=159, value=50)
-        NUMBER_SPECIAL_CHARACTERS = st.number_input('NUMBER_SPECIAL_CHARACTERS', min_value=5, max_value=28, value=5)
-        CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50)
-        TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50)
-        DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0)
-        REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0)
-        APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50)
-        SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50)
-        REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50)
-        SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=38681, value=50)
-        REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=10693, value=50)
-        APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50)
-        DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=14, value=5)
-        # input for date
-        # 2006-03-22
-        WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', format="YYYY-MM-DD")
-        WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', format="YYYY-MM-DD")
         # submit button
         submitted = st.form_submit_button('Predict')

 def app():
     with st.form('from_website_data'):
+        # write short description about the model
+        st.write('''
+        # **Malicious or Benign Website Detection**
+        - The model used for this detection is `XGBoost` Classifier which has Hyperparameter Tuned.
+        - The model also used `SMOTENC` to handle imbalanced data during training.
+        - This model achieved `0.93` recall score on the test set to detect malicious website.
+        ''')
+        url = st.text_input('URL', 'https://www.google.com', help='The URL that will be analyzed')
+        URL_LENGTH = len(url)
+        NUMBER_SPECIAL_CHARACTERS = sum(not x.isalnum() for x in url)
         charset_choice = {1: "ISO-8859-1", 2: "UTF-8", 3: "utf-8", 4: "us-ascii", 5: "iso-8859-1", 6: "unknown", 7: "windows-1252", 8: "windows-1251"}
         server_choice = {1: 'Apache', 2: 'cloudflare-nginx', 3: 'other', 4: 'Server', 5: 'GSE', 6: 'nginx', 7: 'unknown', 8: 'Microsoft-HTTPAPI/2.0', 9: 'nginx/1.8.0', 10: 'nginx/1.10.1', 11: 'Microsoft-IIS/7.5', 12: 'YouTubeFrontEnd', 13: 'Apache/2.2.22 (Debian)', 14: 'nginx/1.12.0', 15: 'Microsoft-IIS/6.0', 16: 'Apache/2.4.23 (Unix) OpenSSL/1.0.1e-fips mod_bwlimited/1.4', 17: 'Apache/2.2.14 (FreeBSD) mod_ssl/2.2.14 OpenSSL/0.9.8y DAV/2 PHP/5.2.12 with Suhosin-Patch'}
         whois_country_choice = {1: "AU", 2: "CA", 3: "ES", 4: "US", 5: "other", 6: "unknown", 7: "PA", 8: "FR", 9: "KR", 10: "CZ", 11: "JP", 12: "ru", 13: "UK", 14: "CN", 15: "GB", 16: "UY"}
         WHOIS_STATEPRO_choice = {1: "other", 2: "Barcelona", 3: "CA", 4: "NV", 5: "Washington", 6: "unknown", 7: "Arizona", 8: "UT", 9: "NY", 10: "ON", 11: "PA", 12: "FL", 13: "California", 14: "PRAHA", 15: "WA", 16: "Krasnoyarsk", 17: "Utah", 18: "WC1N"}
+        CHARSET = st.selectbox("Select Charset", options=list(charset_choice.values()), help='The character encoding standard (also called character set)')
+        SERVER = st.selectbox("Select Server", options=list(server_choice.values()), help='The operative system of the server got from the packet response')
+        CONTENT_LENGTH = st.number_input('CONTENT_LENGTH', min_value=0, max_value=9806, value=50, help='The content size of the HTTP header')
+        WHOIS_COUNTRY = st.selectbox("Select Country", options=list(whois_country_choice.values()), help='The countries we got from the server response (specifically, our script used the API of Whois)')
+        WHOIS_STATEPRO = st.selectbox("Select States", options=list(WHOIS_STATEPRO_choice.values()), help='The states we got from the server response (specifically, our script used the API of Whois)')
+        WHOIS_REGDATE = st.date_input('WHOIS_REGDATE', help='Whois provides the server registration date')
+        WHOIS_UPDATED_DATE = st.date_input('WHOIS_UPDATED_DATE', help='Through the Whois we got the last update date from the server analyzed')
+        TCP_CONVERSATION_EXCHANGE = st.number_input('TCP_CONVERSATION_EXCHANGE', min_value=0, max_value=84, value=50, help='This variable is the number of TCP packets exchanged between the server and our honeypot client')
+        DIST_REMOTE_TCP_PORT = st.number_input('DIST_REMOTE_TCP_PORT', min_value=0, max_value=20, value=0, help='It is the number of the ports detected and different to TCP')
+        REMOTE_IPS = st.number_input('REMOTE_IPS', min_value=0, max_value=16, value=0, help='This variable has the total number of IPs connected to the honeypot')
+        APP_BYTES = st.number_input('APP_BYTES', min_value=0, max_value=9302, value=50, help='This is the number of bytes transferred')
+        REMOTE_APP_BYTES = st.number_input('REMOTE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes received from the server')
+        SOURCE_APP_BYTES = st.number_input('SOURCE_APP_BYTES', min_value=0, max_value=100000, value=0, help='This is the number of bytes sent to the server')
+        SOURCE_APP_PACKETS = st.number_input('SOURCE_APP_PACKETS', min_value=0, max_value=103, value=50, help='Packets sent from the honeypot to the server')
+        REMOTE_APP_PACKETS = st.number_input('REMOTE_APP_PACKETS', min_value=0, max_value=99, value=50, help='Packets received from the server')
+        APP_PACKETS = st.number_input('APP_PACKETS', min_value=0, max_value=103, value=50, help='This is the total number of IP packets generated during the communication between the honeypot and the server')
+        DNS_QUERY_TIMES = st.number_input('DNS_QUERY_TIMES', min_value=0, max_value=20, value=0, help='This is the number of DNS packets generated during the communication between the honeypot and the server')
         # submit button
         submitted = st.form_submit_button('Predict')