# app.py
"""Streamlit app: classify a URL as benign or malicious with a pre-trained ensemble.

Loads a saved model plus the feature/label indices produced at training time,
rebuilds the exact same features for a user-supplied URL, and shows the
predicted class with per-class probabilities.
"""

import re
import string
from urllib.parse import urlparse

import joblib
import numpy as np
import pandas as pd
import streamlit as st

st.set_page_config(page_title="Malicious URL Detection", layout="centered")
st.title("🔗 Malicious URL Detection")
st.write("Enter a URL below and the model will predict whether it is benign or malicious.")


# 1. Load artifacts once per server process.
#    st.cache_resource keeps them across Streamlit reruns instead of hitting
#    disk on every widget interaction.
@st.cache_resource(show_spinner=False)
def _load_artifacts():
    """Return (model, feature_columns, label_index, pri_domain_index) from disk."""
    return (
        joblib.load("ensemble_model.joblib"),
        joblib.load("feature_columns.joblib"),   # list of feature names, in model order
        joblib.load("label_index.joblib"),       # array of label names
        joblib.load("pri_domain_index.joblib"),  # array of domains seen at training time
    )


ensemble_model, feature_columns, label_index, pri_domain_index = _load_artifacts()


# 2. Feature extraction functions.
# NOTE(review): these must stay numerically identical to the training pipeline.
# Do not "improve" a feature definition here without retraining the model.
def get_url_length(url):
    """Length of the URL after stripping the scheme and every 'www.' occurrence."""
    for prefix in ("http://", "https://"):
        if url.startswith(prefix):
            url = url[len(prefix):]
    # Note: replaces ALL 'www.' substrings, not just a leading one — matches training.
    url = url.replace("www.", "")
    return len(url)


def extract_pri_domain(url):
    """Primary domain (last two hostname labels), or '' if the URL has no hostname."""
    try:
        hostname = urlparse(url).hostname or ""
        parts = hostname.split(".")
        if len(parts) >= 2:
            return ".".join(parts[-2:])
        return hostname
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt etc.
        return ""


def count_letters(url):
    """Number of alphabetic characters in the full URL string."""
    return sum(c.isalpha() for c in url)


def count_digits(url):
    """Number of decimal digits in the full URL string."""
    return sum(c.isdigit() for c in url)


def count_special_chars(url):
    """Number of ASCII punctuation characters in the full URL string."""
    return sum(c in string.punctuation for c in url)


def has_shortening_service(url):
    """1 if the URL mentions a known shortener domain, else 0."""
    return int(bool(re.search(r"bit\.ly|goo\.gl|shorte\.st|t\.co|tinyurl", url)))


def abnormal_url(url):
    """1 if the parsed netloc appears as a substring of the URL, else 0.

    NOTE(review): the netloc of a well-formed URL is always a substring of that
    URL, so this is nearly always 1 — presumably a quirk carried over from the
    training code. Kept as-is for feature compatibility; confirm against training.
    """
    net = urlparse(url).netloc
    return int(net in url)


def secure_http(url):
    """1 if the URL scheme is exactly 'https', else 0."""
    return int(urlparse(url).scheme == "https")


def have_ip_address(url):
    """1 if the hostname is a dotted-quad (IPv4-looking) address, else 0."""
    host = urlparse(url).hostname or ""
    return int(bool(re.match(r"^(\d{1,3}\.){3}\d{1,3}$", host)))


def featurize(url: str) -> pd.DataFrame:
    """Build a single-row DataFrame of features for `url`, in training column order."""
    d = {
        "url_len": get_url_length(url),
        "pri_domain": extract_pri_domain(url),
        "letters_count": count_letters(url),
        "digits_count": count_digits(url),
        "special_chars_count": count_special_chars(url),
        "shortened": has_shortening_service(url),
        "abnormal_url": abnormal_url(url),
        "secure_http": secure_http(url),
        "have_ip": have_ip_address(url),
    }
    df = pd.DataFrame([d])
    # Map pri_domain -> integer code via the saved training-time index.
    # Domains unseen at training time get code -1 (pd.Categorical behavior).
    df["pri_domain"] = pd.Categorical(
        df["pri_domain"], categories=pri_domain_index
    ).codes
    # Fill any missing values and cast to the dtype the model expects.
    df = df.fillna(0).astype(np.float32)
    # Reorder columns to the exact order used during training.
    return df[feature_columns]


# 3. Streamlit input / prediction UI.
url_input = st.text_input("URL", value="https://example.com")
if st.button("Predict"):
    if not url_input.strip():
        st.error("Please enter a URL.")
    else:
        # Featurize & predict.
        X_new = featurize(url_input)
        pred_idx = ensemble_model.predict(X_new)[0]
        probs = ensemble_model.predict_proba(X_new)[0]
        # Map the predicted index back to its label name.
        pred_label = label_index[pred_idx]
        st.subheader("Prediction")
        st.write(f"**{pred_label.upper()}**")
        st.subheader("Class probabilities")
        # Small DataFrame for display, highest probability first.
        dfp = pd.DataFrame({
            "class": label_index,
            "probability": np.round(probs, 4),
        }).sort_values("probability", ascending=False)
        st.table(dfp)