MrUtakata committed on
Commit
cb59e97
·
verified ·
1 Parent(s): 7e61a79

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import joblib
4
+ import pandas as pd
5
+ import numpy as np
6
+ import re
7
+ import string
8
+ from urllib.parse import urlparse
9
+
10
st.set_page_config(page_title="Malicious URL Detection", layout="centered")

st.title("🔗 Malicious URL Detection")
st.write("Enter a URL below and the model will predict whether it is benign or malicious.")


# 1. Load artifacts
@st.cache_resource
def _load_artifacts():
    """Load the model and its lookup tables once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects avoids re-reading the four files from disk each time.
    The cached objects are shared between reruns, so callers must treat them
    as read-only.

    NOTE: joblib files are pickle-based and can execute code when loaded;
    only ship artifacts produced by a trusted training run.

    Returns:
        A 4-tuple ``(ensemble_model, feature_columns, label_index,
        pri_domain_index)``.
    """
    return (
        joblib.load("ensemble_model.joblib"),
        joblib.load("feature_columns.joblib"),    # list of feature names
        joblib.load("label_index.joblib"),        # array of label names
        joblib.load("pri_domain_index.joblib"),   # array of allowed domains
    )


ensemble_model, feature_columns, label_index, pri_domain_index = _load_artifacts()
20
+
21
+ # 2. Feature extraction functions (same as training)
22
def get_url_length(url):
    """Return the length of `url` once scheme and "www." text are removed.

    A leading "http://" is dropped first, then a leading "https://" (each
    only if the string currently starts with it). After that every
    occurrence of "www." -- wherever it appears -- is deleted and the
    remaining characters are counted.
    """
    trimmed = url
    if trimmed.startswith("http://"):
        trimmed = trimmed[len("http://"):]
    if trimmed.startswith("https://"):
        trimmed = trimmed[len("https://"):]
    return len(trimmed.replace("www.", ""))
28
+
29
def extract_pri_domain(url):
    """Return the last two dot-separated labels of the URL's hostname.

    The hostname comes from ``urlparse(url).hostname``. When it has fewer
    than two labels it is returned unchanged; a URL with no hostname (for
    example one without a scheme) yields "".

    Args:
        url: The URL text to inspect.

    Returns:
        The joined last two hostname labels, the bare hostname, or "" when
        the URL has no hostname or cannot be parsed.
    """
    try:
        hostname = urlparse(url).hostname or ""
        labels = hostname.split(".")
        if len(labels) >= 2:
            return ".".join(labels[-2:])
        return hostname
    except (ValueError, TypeError, AttributeError):
        # Best-effort: malformed URLs (ValueError from urlparse) and
        # non-string inputs fall back to "". Unlike a bare `except:`, this
        # no longer swallows KeyboardInterrupt or SystemExit.
        return ""
38
+
39
def count_letters(url):
    """Return how many characters of `url` are alphabetic per `str.isalpha`."""
    return len([ch for ch in url if ch.isalpha()])
41
+
42
def count_digits(url):
    """Return how many characters of `url` are digits per `str.isdigit`."""
    return len([ch for ch in url if ch.isdigit()])
44
+
45
def count_special_chars(url):
    """Return how many characters of `url` appear in `string.punctuation`."""
    punctuation = set(string.punctuation)
    return sum(1 for ch in url if ch in punctuation)
47
+
48
def has_shortening_service(url):
    """Return 1 if `url` contains a known URL-shortener name, else 0.

    The pattern is searched anywhere in the string and is not anchored to
    the hostname, so a match inside a longer name (e.g. "t.co" within
    "pinterest.com") also yields 1.
    """
    shortener_pattern = r"bit\.ly|goo\.gl|shorte\.st|t\.co|tinyurl"
    found = re.search(shortener_pattern, url)
    return 1 if found is not None else 0
50
+
51
def abnormal_url(url):
    """Return 1 if the parsed network location occurs inside `url`, else 0.

    When `urlparse` finds no network location the value is the empty
    string, which is contained in every string, so the result is 1 in that
    case as well.
    """
    network_location = urlparse(url).netloc
    if network_location in url:
        return 1
    return 0
54
+
55
def secure_http(url):
    """Return 1 if the scheme parsed from `url` is "https", else 0."""
    scheme = urlparse(url).scheme
    return 1 if scheme == "https" else 0
57
+
58
def have_ip_address(url):
    """Return 1 if the URL's hostname looks like a dotted-quad IPv4 address.

    The hostname is taken from `urlparse`; a URL with no hostname (such as
    one lacking a scheme) is treated as the empty string and yields 0. The
    octets are matched by shape only (1-3 digits each), not by range.
    """
    parsed_host = urlparse(url).hostname
    if not parsed_host:
        parsed_host = ""
    dotted_quad = r"^(\d{1,3}\.){3}\d{1,3}$"
    return 1 if re.match(dotted_quad, parsed_host) else 0
61
+
62
def featurize(url: str) -> pd.DataFrame:
    """Build a single-row DataFrame of model features for `url`.

    Each feature is computed by the matching helper in this file. The
    primary domain is replaced by its categorical code relative to
    `pri_domain_index` (a domain not in that index gets code -1). All values
    are cast to float32 and the columns are returned in `feature_columns`
    order.
    """
    # Encode the primary domain against the saved index up front.
    domain_code = pd.Categorical(
        [extract_pri_domain(url)], categories=pri_domain_index
    ).codes[0]

    row = {
        "url_len": get_url_length(url),
        "pri_domain": domain_code,
        "letters_count": count_letters(url),
        "digits_count": count_digits(url),
        "special_chars_count": count_special_chars(url),
        "shortened": has_shortening_service(url),
        "abnormal_url": abnormal_url(url),
        "secure_http": secure_http(url),
        "have_ip": have_ip_address(url),
    }

    frame = pd.DataFrame([row])
    # Replace any missing values, then use a uniform float32 dtype.
    frame = frame.fillna(0)
    frame = frame.astype(np.float32)
    # Hand the columns back in the saved feature order.
    return frame[feature_columns]
84
+
85
+ # 3. Streamlit input
86
url_input = st.text_input("URL", value="https://example.com")
if st.button("Predict"):
    # Trim once and use the trimmed text for both validation and features,
    # so stray whitespace from copy/paste does not alter the feature values.
    url = url_input.strip()
    if not url:
        st.error("Please enter a URL.")
    else:
        try:
            # featurize & predict
            X_new = featurize(url)
        except ValueError as exc:
            # urlparse raises ValueError for malformed URLs (for example an
            # unbalanced "[" in the host); report it instead of a traceback.
            st.error(f"Could not process that URL: {exc}")
        else:
            pred_idx = ensemble_model.predict(X_new)[0]
            probs = ensemble_model.predict_proba(X_new)[0]

            # map back to label name
            pred_label = label_index[pred_idx]

            st.subheader("Prediction")
            st.write(f"**{pred_label.upper()}**")

            st.subheader("Class probabilities")
            # build a tiny DataFrame for display, most likely class first
            dfp = pd.DataFrame({
                "class": label_index,
                "probability": np.round(probs, 4)
            }).sort_values("probability", ascending=False)

            st.table(dfp)