Spaces:

MrUtakata
/

MUD_MLT

Sleeping

App Files Files Community

MrUtakata committed on Apr 17, 2025

Commit

cb59e97

verified ·

1 Parent(s): 7e61a79

Create app.py

Browse files

Files changed (1) hide show

app.py +109 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# app.py
+import streamlit as st
+import joblib
+import pandas as pd
+import numpy as np
+import re
+import string
+from urllib.parse import urlparse
st.set_page_config(page_title="Malicious URL Detection", layout="centered")
st.title("🔗 Malicious URL Detection")
st.write("Enter a URL below and the model will predict whether it is benign or malicious.")


# 1. Load artifacts
@st.cache_resource
def _load_artifacts():
    """Load the model and its lookup tables, cached across script reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, all four joblib files would be deserialized again each time.

    NOTE: joblib.load unpickles its input, which can execute arbitrary code.
    Only ship artifact files that come from a trusted source.

    Returns:
        Tuple of (ensemble_model, feature_columns, label_index,
        pri_domain_index), in that order.
    """
    return (
        joblib.load("ensemble_model.joblib"),
        joblib.load("feature_columns.joblib"),    # list of feature names
        joblib.load("label_index.joblib"),        # array of label names
        joblib.load("pri_domain_index.joblib"),   # array of allowed domains
    )


ensemble_model, feature_columns, label_index, pri_domain_index = _load_artifacts()
+# 2. Feature extraction functions (same as training)
def get_url_length(url):
    """Return the length of `url` without its scheme prefix and "www." parts.

    A leading "http://" is dropped first, then a leading "https://" (checked
    in that order, each at most once). Every occurrence of "www." in what
    remains is removed, not only a leading one, before measuring.
    """
    remainder = url
    if remainder.startswith("http://"):
        remainder = remainder[len("http://"):]
    if remainder.startswith("https://"):
        remainder = remainder[len("https://"):]
    return len(remainder.replace("www.", ""))
def extract_pri_domain(url):
    """Return the last two dot-separated labels of the URL's hostname.

    The hostname comes from `urlparse`, so it is lower-cased and excludes any
    port. A URL without a scheme has no parsed hostname and yields "". A
    single-label host (e.g. "localhost") is returned unchanged. No public
    suffix handling is done: "a.example.co.uk" yields "co.uk".

    Returns "" when the URL cannot be parsed (best-effort; never raises for
    malformed input).
    """
    try:
        hostname = urlparse(url).hostname or ""
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed netloc such as an unbalanced "[".
        # TypeError/AttributeError: non-string input. A bare `except:` would
        # also trap KeyboardInterrupt/SystemExit, which must propagate.
        return ""
    labels = hostname.split(".")
    if len(labels) >= 2:
        return ".".join(labels[-2:])
    return hostname
def count_letters(url):
    """Return how many characters of `url` are alphabetic per `str.isalpha`."""
    return len([ch for ch in url if ch.isalpha()])
def count_digits(url):
    """Return how many characters of `url` are digits per `str.isdigit`."""
    return len([ch for ch in url if ch.isdigit()])
def count_special_chars(url):
    """Return how many characters of `url` appear in `string.punctuation`."""
    punctuation = frozenset(string.punctuation)
    total = 0
    for ch in url:
        if ch in punctuation:
            total += 1
    return total
def has_shortening_service(url):
    """Return 1 if `url` contains a known URL-shortener pattern, else 0.

    The pattern is searched anywhere in the string without anchoring, so a
    fragment such as "t.co" also matches inside longer hosts (for example
    "pinterest.com").
    """
    found = re.search(r"bit\.ly|goo\.gl|shorte\.st|t\.co|tinyurl", url)
    return 1 if found is not None else 0
def abnormal_url(url):
    """Return 1 if the parsed netloc of `url` occurs within `url`, else 0.

    The netloc is parsed out of `url` itself, so ordinary inputs yield 1.
    That includes URLs with no netloc at all, because the empty string is a
    substring of every string.
    """
    return 1 if urlparse(url).netloc in url else 0
def secure_http(url):
    """Return 1 if the parsed scheme of `url` is "https", else 0."""
    scheme = urlparse(url).scheme
    return 1 if scheme == "https" else 0
def have_ip_address(url):
    """Return 1 if the URL's hostname looks like a dotted-quad IPv4 address.

    The check is purely textual: four groups of one to three digits separated
    by dots. Octet ranges are not validated. A URL without a scheme has no
    parsed hostname and therefore yields 0.
    """
    parsed_host = urlparse(url).hostname
    if not parsed_host:
        parsed_host = ""
    is_dotted_quad = re.match(r"^(\d{1,3}\.){3}\d{1,3}$", parsed_host) is not None
    return int(is_dotted_quad)
def featurize(url: str) -> pd.DataFrame:
    """Return a one-row float32 DataFrame of model features for `url`.

    Each feature is computed by the matching extractor function. The
    "pri_domain" string is then replaced by its integer category code from
    `pri_domain_index`; pandas assigns -1 to values that are not among the
    categories. Columns are returned in `feature_columns` order.
    """
    extractors = {
        "url_len": get_url_length,
        "pri_domain": extract_pri_domain,
        "letters_count": count_letters,
        "digits_count": count_digits,
        "special_chars_count": count_special_chars,
        "shortened": has_shortening_service,
        "abnormal_url": abnormal_url,
        "secure_http": secure_http,
        "have_ip": have_ip_address,
    }
    row = {name: extract(url) for name, extract in extractors.items()}
    frame = pd.DataFrame([row])

    # Encode the domain string as its position in the saved category index.
    domain_codes = pd.Categorical(
        frame["pri_domain"], categories=pri_domain_index
    ).codes
    frame["pri_domain"] = domain_codes

    # Replace any missing values, then cast everything to float32.
    frame = frame.fillna(0)
    frame = frame.astype(np.float32)

    # Put the columns in the saved feature order.
    return frame[feature_columns]
# 3. Streamlit input
url_input = st.text_input("URL", value="https://example.com")
if st.button("Predict"):
    # Strip once so the emptiness check and the featurizer see the same text;
    # stray surrounding whitespace would otherwise skew the URL features.
    url = url_input.strip()
    if not url:
        st.error("Please enter a URL.")
    else:
        try:
            # featurize
            X_new = featurize(url)
        except ValueError as exc:
            # urlparse raises ValueError for malformed input (e.g. an
            # unbalanced "[" in the host); report it instead of a traceback.
            st.error(f"Could not parse that URL: {exc}")
        else:
            # predict
            pred_idx = ensemble_model.predict(X_new)[0]
            probs = ensemble_model.predict_proba(X_new)[0]
            # map back to label name
            # NOTE(review): assumes predictions are integer positions into
            # label_index and that predict_proba columns follow the same
            # order - confirm against the training code.
            pred_label = label_index[pred_idx]
            st.subheader("Prediction")
            st.write(f"**{pred_label.upper()}**")
            st.subheader("Class probabilities")
            # build a tiny DataFrame for display, most likely class first
            dfp = pd.DataFrame({
                "class": label_index,
                "probability": np.round(probs, 4)
            }).sort_values("probability", ascending=False)
            st.table(dfp)