|
|
|
|
|
import streamlit as st |
|
|
import joblib |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import re |
|
|
import string |
|
|
from urllib.parse import urlparse |
|
|
|
|
|
# Must stay the first Streamlit call in the script.
st.set_page_config(page_title="Malicious URL Detection", layout="centered")

# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept verbatim
# until the intended character is confirmed.
st.title("π Malicious URL Detection")
st.write("Enter a URL below and the model will predict whether it is benign or malicious.")


# Streamlit re-executes this script on every widget interaction. Caching keeps
# the loaded artifacts in memory instead of re-reading them from disk on each
# rerun. Streamlit releases that lack ``st.cache_resource`` fall back to
# loading on every run, which is what the script did before.
_cache_artifacts = getattr(st, "cache_resource", lambda func: func)


@_cache_artifacts
def _load_artifacts():
    """Load the persisted model and its lookup tables.

    The files are resolved relative to the current working directory.
    ``joblib.load`` unpickles its input, so these files must come from a
    trusted source.

    Returns:
        A 4-tuple ``(ensemble_model, feature_columns, label_index,
        pri_domain_index)`` in that order.
    """
    return (
        joblib.load("ensemble_model.joblib"),
        joblib.load("feature_columns.joblib"),
        joblib.load("label_index.joblib"),
        joblib.load("pri_domain_index.joblib"),
    )


# Module-level names are unchanged so the rest of the script keeps working.
ensemble_model, feature_columns, label_index, pri_domain_index = _load_artifacts()
|
|
|
|
|
|
|
|
def get_url_length(url):
    """Return the length of `url` without its scheme prefix and "www." parts.

    A leading "http://" is dropped first, then a leading "https://". After
    that every occurrence of "www." (not only a leading one) is removed and
    the remaining characters are counted.
    """
    remainder = url
    for scheme in ("http://", "https://"):
        has_scheme = remainder.startswith(scheme)
        remainder = remainder[len(scheme):] if has_scheme else remainder
    without_www = remainder.replace("www.", "")
    return len(without_www)
|
|
|
|
|
def extract_pri_domain(url):
    """Return the last two dot-separated labels of the URL's hostname.

    The hostname comes from ``urlparse``, so it is lower-cased and is only
    present when the URL has a ``//`` authority part. A hostname with fewer
    than two labels is returned unchanged.

    Returns:
        The trailing two labels joined by ".", the bare hostname, or "" when
        there is no hostname or the URL cannot be parsed.
    """
    try:
        hostname = urlparse(url).hostname or ""
    except (ValueError, AttributeError, TypeError):
        # Best-effort extraction: malformed URLs (e.g. unbalanced IPv6
        # brackets) and non-string input yield an empty domain rather than an
        # error, while interpreter-level exceptions such as KeyboardInterrupt
        # are no longer swallowed.
        return ""

    labels = hostname.split(".")
    if len(labels) >= 2:
        return ".".join(labels[-2:])
    return hostname
|
|
|
|
|
def count_letters(url):
    """Return how many characters of `url` satisfy ``str.isalpha``."""
    return sum(1 for ch in url if ch.isalpha())
|
|
|
|
|
def count_digits(url):
    """Return how many characters of `url` satisfy ``str.isdigit``."""
    digits = [ch for ch in url if ch.isdigit()]
    return len(digits)
|
|
|
|
|
def count_special_chars(url):
    """Return how many characters of `url` appear in ``string.punctuation``."""
    total = 0
    for ch in url:
        if ch in string.punctuation:
            total += 1
    return total
|
|
|
|
|
def has_shortening_service(url):
    """Return 1 if `url` contains a known URL-shortener token, else 0.

    The search is unanchored and runs over the whole URL string, not only
    the hostname.
    """
    # NOTE(review): because the pattern is unanchored, ``t\.co`` also matches
    # inside longer hosts such as "microsoft.com". Left as-is so the feature
    # keeps the values this code has always produced.
    found = re.search(r"bit\.ly|goo\.gl|shorte\.st|t\.co|tinyurl", url)
    return 1 if found else 0
|
|
|
|
|
def abnormal_url(url):
    """Return 1 if the parsed network location is a substring of `url`, else 0.

    A URL without a ``//`` authority part has an empty network location,
    which is trivially contained in any string, so the result is 1.
    """
    # NOTE(review): the name suggests 1 means "abnormal", but 1 is returned
    # when the netloc IS found in the URL -- confirm the intended polarity
    # against the training pipeline before changing anything.
    network_location = urlparse(url).netloc
    if network_location in url:
        return 1
    return 0
|
|
|
|
|
def secure_http(url):
    """Return 1 if the parsed scheme of `url` is "https", else 0."""
    scheme = urlparse(url).scheme
    return 1 if scheme == "https" else 0
|
|
|
|
|
def have_ip_address(url):
    """Return 1 if the URL's hostname looks like a dotted-quad address, else 0.

    The check is purely textual: four dot-separated groups of one to three
    digits. Octets are not range-checked, and a URL without a hostname
    (for example one with no scheme) yields 0.
    """
    hostname = urlparse(url).hostname
    if not hostname:
        hostname = ""
    dotted_quad = re.match(r"^(\d{1,3}\.){3}\d{1,3}$", hostname)
    return 0 if dotted_quad is None else 1
|
|
|
|
|
def featurize(url: str) -> pd.DataFrame:
    """Build the single-row feature frame for `url`.

    Each feature is produced by one of this module's helper functions. The
    primary-domain string is then replaced by its integer category code
    relative to ``pri_domain_index``; pandas assigns -1 to a value that is
    not among the categories. All values are cast to float32 and the columns
    are returned in the order given by ``feature_columns``.

    Any exception raised by a helper (e.g. ``ValueError`` from ``urlparse``
    on a malformed URL) propagates to the caller.
    """
    # (column name, extractor) pairs, in the order the columns are created.
    extractors = (
        ("url_len", get_url_length),
        ("pri_domain", extract_pri_domain),
        ("letters_count", count_letters),
        ("digits_count", count_digits),
        ("special_chars_count", count_special_chars),
        ("shortened", has_shortening_service),
        ("abnormal_url", abnormal_url),
        ("secure_http", secure_http),
        ("have_ip", have_ip_address),
    )
    row = {column: extract(url) for column, extract in extractors}
    frame = pd.DataFrame([row])

    # Swap the domain string for its category code.
    # NOTE(review): presumably pri_domain_index holds the domains seen at
    # training time -- confirm against the training pipeline.
    domain_categorical = pd.Categorical(
        frame["pri_domain"], categories=pri_domain_index
    )
    frame["pri_domain"] = domain_categorical.codes

    numeric = frame.fillna(0).astype(np.float32)
    return numeric[feature_columns]
|
|
|
|
|
|
|
|
url_input = st.text_input("URL", value="https://example.com")

if st.button("Predict"):
    # Strip once and use the stripped value everywhere. Whitespace pasted
    # around the URL would otherwise be counted by the length/character
    # features and would stop the scheme prefix from being recognised.
    cleaned_url = url_input.strip()

    if not cleaned_url:
        st.error("Please enter a URL.")
    else:
        try:
            X_new = featurize(cleaned_url)
        except ValueError as exc:
            # urlparse raises ValueError for malformed authorities such as
            # unbalanced IPv6 brackets. Report it instead of showing a
            # traceback.
            st.error(f"Could not parse that URL: {exc}")
        else:
            pred_idx = ensemble_model.predict(X_new)[0]
            probs = ensemble_model.predict_proba(X_new)[0]

            # NOTE(review): assumes the model predicts integer positions into
            # label_index and that predict_proba columns follow the same
            # order -- confirm against how the artifacts were exported.
            pred_label = label_index[pred_idx]

            st.subheader("Prediction")
            st.write(f"**{pred_label.upper()}**")

            st.subheader("Class probabilities")

            # One row per class, most likely class first.
            dfp = pd.DataFrame({
                "class": label_index,
                "probability": np.round(probs, 4),
            }).sort_values("probability", ascending=False)

            st.table(dfp)
|
|
|