# app.py
"""Streamlit app: classify a URL as benign or malicious with a pre-trained ensemble.

Loads a saved model plus the feature/label indices produced at training time,
rebuilds the exact same features for a user-supplied URL, and shows the
predicted class with per-class probabilities.
"""

import re
import string
from urllib.parse import urlparse

import joblib
import numpy as np
import pandas as pd
import streamlit as st

st.set_page_config(page_title="Malicious URL Detection", layout="centered")
st.title("🔗 Malicious URL Detection")
st.write("Enter a URL below and the model will predict whether it is benign or malicious.")


# 1. Load artifacts once per server process.
#    st.cache_resource keeps them across Streamlit reruns instead of hitting
#    disk on every widget interaction.
@st.cache_resource(show_spinner=False)
def _load_artifacts():
    """Return (model, feature_columns, label_index, pri_domain_index) from disk."""
    return (
        joblib.load("ensemble_model.joblib"),
        joblib.load("feature_columns.joblib"),   # list of feature names, in model order
        joblib.load("label_index.joblib"),       # array of label names
        joblib.load("pri_domain_index.joblib"),  # array of domains seen at training time
    )


ensemble_model, feature_columns, label_index, pri_domain_index = _load_artifacts()


# 2. Feature extraction functions.
# NOTE(review): these must stay numerically identical to the training pipeline.
# Do not "improve" a feature definition here without retraining the model.
def get_url_length(url):
    """Length of the URL after stripping the scheme and every 'www.' occurrence."""
    for prefix in ("http://", "https://"):
        if url.startswith(prefix):
            url = url[len(prefix):]
    # Note: replaces ALL 'www.' substrings, not just a leading one — matches training.
    url = url.replace("www.", "")
    return len(url)


def extract_pri_domain(url):
    """Primary domain (last two hostname labels), or '' if the URL has no hostname."""
    try:
        hostname = urlparse(url).hostname or ""
        parts = hostname.split(".")
        if len(parts) >= 2:
            return ".".join(parts[-2:])
        return hostname
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt etc.
        return ""


def count_letters(url):
    """Number of alphabetic characters in the full URL string."""
    return sum(c.isalpha() for c in url)


def count_digits(url):
    """Number of decimal digits in the full URL string."""
    return sum(c.isdigit() for c in url)


def count_special_chars(url):
    """Number of ASCII punctuation characters in the full URL string."""
    return sum(c in string.punctuation for c in url)


def has_shortening_service(url):
    """1 if the URL mentions a known shortener domain, else 0."""
    return int(bool(re.search(r"bit\.ly|goo\.gl|shorte\.st|t\.co|tinyurl", url)))


def abnormal_url(url):
    """1 if the parsed netloc appears as a substring of the URL, else 0.

    NOTE(review): the netloc of a well-formed URL is always a substring of that
    URL, so this is nearly always 1 — presumably a quirk carried over from the
    training code. Kept as-is for feature compatibility; confirm against training.
    """
    net = urlparse(url).netloc
    return int(net in url)


def secure_http(url):
    """1 if the URL scheme is exactly 'https', else 0."""
    return int(urlparse(url).scheme == "https")


def have_ip_address(url):
    """1 if the hostname is a dotted-quad (IPv4-looking) address, else 0."""
    host = urlparse(url).hostname or ""
    return int(bool(re.match(r"^(\d{1,3}\.){3}\d{1,3}$", host)))


def featurize(url: str) -> pd.DataFrame:
    """Build a single-row DataFrame of features for `url`, in training column order."""
    d = {
        "url_len": get_url_length(url),
        "pri_domain": extract_pri_domain(url),
        "letters_count": count_letters(url),
        "digits_count": count_digits(url),
        "special_chars_count": count_special_chars(url),
        "shortened": has_shortening_service(url),
        "abnormal_url": abnormal_url(url),
        "secure_http": secure_http(url),
        "have_ip": have_ip_address(url),
    }
    df = pd.DataFrame([d])
    # Map pri_domain -> integer code via the saved training-time index.
    # Domains unseen at training time get code -1 (pd.Categorical behavior).
    df["pri_domain"] = pd.Categorical(
        df["pri_domain"], categories=pri_domain_index
    ).codes
    # Fill any missing values and cast to the dtype the model expects.
    df = df.fillna(0).astype(np.float32)
    # Reorder columns to the exact order used during training.
    return df[feature_columns]


# 3. Streamlit input / prediction UI.
url_input = st.text_input("URL", value="https://example.com")
if st.button("Predict"):
    if not url_input.strip():
        st.error("Please enter a URL.")
    else:
        # Featurize & predict.
        X_new = featurize(url_input)
        pred_idx = ensemble_model.predict(X_new)[0]
        probs = ensemble_model.predict_proba(X_new)[0]
        # Map the predicted index back to its label name.
        pred_label = label_index[pred_idx]
        st.subheader("Prediction")
        st.write(f"**{pred_label.upper()}**")
        st.subheader("Class probabilities")
        # Small DataFrame for display, highest probability first.
        dfp = pd.DataFrame({
            "class": label_index,
            "probability": np.round(probs, 4),
        }).sort_values("probability", ascending=False)
        st.table(dfp)