# MUD_MLT / app.py
# (Hugging Face Space page residue preserved as a comment:
#  "MrUtakata's picture — Create app.py — cb59e97 verified")
# app.py
import streamlit as st
import joblib
import pandas as pd
import numpy as np
import re
import string
from urllib.parse import urlparse
# Page configuration must be the first Streamlit call in the script.
st.set_page_config(page_title="Malicious URL Detection", layout="centered")
# NOTE(review): "πŸ”—" looks like a UTF-8 link emoji decoded as Latin-1 (mojibake) —
# confirm the file is saved/served as UTF-8 before "fixing" the literal.
st.title("πŸ”— Malicious URL Detection")
st.write("Enter a URL below and the model will predict whether it is benign or malicious.")
# 1. Load artifacts
# Training-time artifacts, deserialized once at app start-up from the working directory.
ensemble_model = joblib.load("ensemble_model.joblib")  # fitted ensemble classifier
feature_columns = joblib.load("feature_columns.joblib") # list of feature names
label_index = joblib.load("label_index.joblib") # array of label names
pri_domain_index= joblib.load("pri_domain_index.joblib") # array of allowed domains
# 2. Feature extraction functions (same as training)
def get_url_length(url):
    """Length of *url* once the scheme prefix and every "www." occurrence are removed.

    Mirrors the training-time feature: strips a leading "http://" or "https://",
    then drops all "www." substrings (anywhere in the string, as in training).
    """
    trimmed = url
    for scheme in ("http://", "https://"):
        if trimmed.startswith(scheme):
            trimmed = trimmed[len(scheme):]
    trimmed = trimmed.replace("www.", "")
    return len(trimmed)
def extract_pri_domain(url):
    """Return the primary domain of *url*: the last two dot-separated hostname labels.

    Falls back to the bare hostname when it has fewer than two labels, and to
    "" when the URL has no parseable hostname or parsing fails.
    """
    try:
        hostname = urlparse(url).hostname or ""
        parts = hostname.split(".")
        if len(parts) >= 2:
            return ".".join(parts[-2:])
        return hostname
    # Narrowed from a bare `except:`, which would also swallow SystemExit and
    # KeyboardInterrupt; urlparse can raise ValueError on malformed ports/brackets.
    except Exception:
        return ""
def count_letters(url):
    """Count the alphabetic characters in *url*."""
    return sum(1 for ch in url if ch.isalpha())
def count_digits(url):
    """Count the decimal-digit characters in *url*."""
    return sum(1 for ch in url if ch.isdigit())
def count_special_chars(url):
    """Count characters in *url* that are ASCII punctuation (string.punctuation)."""
    punctuation = string.punctuation
    return sum(1 for ch in url if ch in punctuation)
def has_shortening_service(url):
    """Return 1 if *url* contains a known URL-shortener domain, else 0."""
    shortener_pattern = r"bit\.ly|goo\.gl|shorte\.st|t\.co|tinyurl"
    return 1 if re.search(shortener_pattern, url) else 0
def abnormal_url(url):
    """Return 1 when the parsed netloc appears as a substring of *url*, else 0.

    NOTE(review): since the netloc is parsed out of the URL itself (and "" is a
    substring of everything), this is almost always 1 — kept as-is because it
    must mirror the training-time featurization exactly.
    """
    host_part = urlparse(url).netloc
    return 1 if host_part in url else 0
def secure_http(url):
    """Return 1 when *url* uses the https scheme, else 0."""
    scheme = urlparse(url).scheme
    return 1 if scheme == "https" else 0
def have_ip_address(url):
    """Return 1 when the hostname of *url* looks like a dotted-quad IP, else 0.

    NOTE(review): the pattern accepts octets up to 999 (\\d{1,3}) — kept to
    match the training-time feature exactly.
    """
    host = urlparse(url).hostname or ""
    matched = re.match(r"^(\d{1,3}\.){3}\d{1,3}$", host)
    return 1 if matched else 0
def featurize(url: str) -> pd.DataFrame:
    """Build a single-row feature DataFrame for *url* matching the training schema.

    Uses the module-level `pri_domain_index` (saved category index) and
    `feature_columns` (training column order) loaded at app start-up.
    """
    features = {
        "url_len": get_url_length(url),
        "pri_domain": extract_pri_domain(url),
        "letters_count": count_letters(url),
        "digits_count": count_digits(url),
        "special_chars_count": count_special_chars(url),
        "shortened": has_shortening_service(url),
        "abnormal_url": abnormal_url(url),
        "secure_http": secure_http(url),
        "have_ip": have_ip_address(url),
    }
    frame = pd.DataFrame([features])
    # Encode the primary domain against the saved index; unseen domains code to -1.
    frame["pri_domain"] = pd.Categorical(
        frame["pri_domain"], categories=pri_domain_index
    ).codes
    # Replace any NaNs and cast to the dtype used at training time.
    frame = frame.fillna(0).astype(np.float32)
    # Reorder to the exact training column order before prediction.
    return frame[feature_columns]
# 3. Streamlit input
url_input = st.text_input("URL", value="https://example.com")
if st.button("Predict"):
    if not url_input.strip():
        # Guard against empty/whitespace-only input before featurizing.
        st.error("Please enter a URL.")
    else:
        # featurize & predict
        X_new = featurize(url_input)
        # assumes predict() yields an integer index into label_index — TODO confirm
        pred_idx = ensemble_model.predict(X_new)[0]
        probs = ensemble_model.predict_proba(X_new)[0]
        # map back to label name
        pred_label = label_index[pred_idx]
        st.subheader("Prediction")
        st.write(f"**{pred_label.upper()}**")
        st.subheader("Class probabilities")
        # build a tiny DataFrame for display
        # presumably probs aligns positionally with label_index — verify against training
        dfp = pd.DataFrame({
            "class": label_index,
            "probability": np.round(probs, 4)
        }).sort_values("probability", ascending=False)
        st.table(dfp)