import math
import re
from collections import Counter
from itertools import groupby
from urllib.parse import urlparse

import joblib
import pandas as pd
import streamlit as st
# Feature extraction function
def extract_features(url):
features = {}
    try:
        parsed = urlparse(url)
    except ValueError:
        # If URL parsing fails (e.g., malformed IPv6 brackets),
        # fall back to raw-string features only
        parsed = None
# Basic counts
features["url_length"] = len(url)
features["num_dots"] = url.count(".")
features["num_hyphens"] = url.count("-")
features["num_at"] = url.count("@")
features["num_equal"] = url.count("=")
features["num_slash"] = url.count("/")
features["num_question"] = url.count("?")
features["num_ampersand"] = url.count("&")
features["num_percent"] = url.count("%")
features["num_underscore"] = url.count("_")
features["num_tilde"] = url.count("~")
features["num_semicolon"] = url.count(";")
# Check for www (case-insensitive, whole word)
features["has_www"] = 1 if re.search(r"\bwww\b", url.lower()) else 0
# Special characters count (excluding protocol and www prefix)
url_without_protocol = re.sub(r"^https?://(www\.)?", "", url, flags=re.IGNORECASE)
features["num_special_chars"] = len(
re.findall(r"[^a-zA-Z0-9]", url_without_protocol)
)
# TLD detection (more accurate pattern)
# Matches .com, .co.uk, .org, etc.
tld_matches = re.findall(r"\.[a-z]{2,}(?:\.[a-z]{2,})?(?=[/?#]|$)", url.lower())
features["num_tld"] = len(tld_matches)
    # Extract actual TLD (rightmost domain label)
    if parsed and parsed.netloc:
        # Strip any port so 'example.com:8080' yields 'com', not 'com:8080'
        host = parsed.netloc.lower().split(":")[0]
        domain_parts = host.split(".")
        if len(domain_parts) >= 2:
            # Length of the rightmost label (e.g., 'com' = 3, 'uk' = 2)
            features["tld_length"] = len(domain_parts[-1])
# Check for suspicious/uncommon TLDs
suspicious_tlds = [
".xyz",
".top",
".tk",
".ml",
".ga",
".cf",
".gq",
".pw",
".cc",
]
features["has_suspicious_tld"] = (
1 if any(tld in url.lower() for tld in suspicious_tlds) else 0
)
else:
features["tld_length"] = 0
features["has_suspicious_tld"] = 0
else:
features["tld_length"] = 0
features["has_suspicious_tld"] = 0
    # Subdomain counting (more accurate)
    if parsed and parsed.netloc:
        domain_parts = parsed.netloc.split(":")[0].split(".")
        # Remove www from the count if present
        if domain_parts and domain_parts[0].lower() == "www":
            domain_parts = domain_parts[1:]
        # Subdomains = labels beyond the registrable domain (domain + TLD);
        # compound TLDs like .co.uk are not special-cased here
        features["num_subdomains"] = max(0, len(domain_parts) - 2)
else:
# Fallback to original logic
features["num_subdomains"] = max(0, url.count(".") - 1)
# ========== NEW: Main Domain Structure Analysis ==========
if parsed and parsed.netloc:
netloc = parsed.netloc.lower()
# Remove port if present
netloc_no_port = netloc.split(":")[0]
domain_parts = netloc_no_port.split(".")
# Extract main domain (second-level domain)
        if len(domain_parts) >= 2:
            # Skip a leading www, then take the second-to-last label,
            # e.g. 'google' from 'www.mail.google.com' or from 'google.com'
            if domain_parts[0] == "www" and len(domain_parts) > 2:
                domain_parts = domain_parts[1:]
            main_domain = domain_parts[-2]
# Main domain features
features["domain_length"] = len(main_domain)
features["domain_has_digits"] = (
1 if any(c.isdigit() for c in main_domain) else 0
)
features["domain_digit_count"] = sum(c.isdigit() for c in main_domain)
features["domain_has_hyphen"] = 1 if "-" in main_domain else 0
features["domain_hyphen_count"] = main_domain.count("-")
            # Domain entropy (Shannon entropy in bits; higher = more random-looking)
            def calculate_entropy(text):
                if not text:
                    return 0
                counts = Counter(text)
                length = len(text)
                return -sum(
                    (count / length) * math.log2(count / length)
                    for count in counts.values()
                )
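            # Worked example: 'abcd' has four equiprobable characters, so its
            # entropy is -4 * (1/4) * log2(1/4) = 2.0 bits; 'aaaa' scores 0.0,
            # and random-looking strings trend toward log2(length) bits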
features["domain_entropy"] = calculate_entropy(main_domain)
# Vowel/consonant ratio in domain (random domains often have unusual ratios)
vowels = "aeiou"
vowel_count = sum(1 for c in main_domain if c in vowels)
consonant_count = sum(
1 for c in main_domain if c.isalpha() and c not in vowels
)
total_letters = vowel_count + consonant_count
features["domain_vowel_ratio"] = (
vowel_count / total_letters if total_letters > 0 else 0
)
            # Check for repeated characters (e.g., 'gooogle', 'payppal')
            max_consecutive = max(
                (len(list(group)) for _, group in groupby(main_domain)),
                default=0,
            )
            features["domain_max_consecutive_chars"] = max_consecutive
# Check if domain name mimics popular brands (typosquatting detection)
popular_brands = [
"google",
"facebook",
"paypal",
"amazon",
"microsoft",
"apple",
"netflix",
"instagram",
"twitter",
"linkedin",
"ebay",
"yahoo",
]
# Calculate minimum edit distance to popular brands
def levenshtein_distance(s1, s2):
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
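            # e.g. levenshtein_distance('paypa1', 'paypal') == 1 (one substitution),
            # so a domain like 'paypa1' is flagged as a likely typosquat below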
min_brand_distance = min(
(levenshtein_distance(main_domain, brand) for brand in popular_brands),
default=10,
)
features["min_brand_edit_distance"] = min_brand_distance
features["is_brand_typosquat"] = 1 if 0 < min_brand_distance <= 2 else 0
else:
features["domain_length"] = 0
features["domain_has_digits"] = 0
features["domain_digit_count"] = 0
features["domain_has_hyphen"] = 0
features["domain_hyphen_count"] = 0
features["domain_entropy"] = 0
features["domain_vowel_ratio"] = 0
features["domain_max_consecutive_chars"] = 0
features["min_brand_edit_distance"] = 10
features["is_brand_typosquat"] = 0
else:
features["domain_length"] = 0
features["domain_has_digits"] = 0
features["domain_digit_count"] = 0
features["domain_has_hyphen"] = 0
features["domain_hyphen_count"] = 0
features["domain_entropy"] = 0
features["domain_vowel_ratio"] = 0
features["domain_max_consecutive_chars"] = 0
features["min_brand_edit_distance"] = 10
features["is_brand_typosquat"] = 0
# ========== END NEW FEATURES ==========
# Character type counts
features["num_numeric"] = len(re.findall(r"\d", url))
features["num_letters"] = len(re.findall(r"[a-zA-Z]", url))
features["num_uppercase"] = len(re.findall(r"[A-Z]", url))
features["num_lowercase"] = len(re.findall(r"[a-z]", url))
# Ratios (avoid division by zero)
url_len = features["url_length"] if features["url_length"] > 0 else 1
features["digit_to_length_ratio"] = features["num_numeric"] / url_len
features["letters_to_length_ratio"] = features["num_letters"] / url_len
features["special_to_length_ratio"] = features["num_special_chars"] / url_len
features["hyphens_to_length_ratio"] = features["num_hyphens"] / url_len
features["dots_to_length_ratio"] = features["num_dots"] / url_len
# Path and query length
if parsed:
features["path_length"] = len(parsed.path) if parsed.path else 0
features["query_length"] = len(parsed.query) if parsed.query else 0
features["hostname_length"] = len(parsed.netloc) if parsed.netloc else 0
# NEW: Path depth (number of directories)
features["path_depth"] = (
len([p for p in parsed.path.split("/") if p]) if parsed.path else 0
)
# NEW: Number of parameters in query string
features["num_query_params"] = (
len(parsed.query.split("&")) if parsed.query else 0
)
# NEW: Fragment presence
features["has_fragment"] = 1 if parsed.fragment else 0
        # NEW: Port presence and suspicious ports
        try:
            # urlparse evaluates .port lazily; a malformed port raises ValueError
            port = parsed.port
        except ValueError:
            port = None
        if port:
            features["has_custom_port"] = 1
            # Common suspicious ports for phishing
            features["has_suspicious_port"] = (
                1 if port in [8080, 8888, 3000, 4443, 5000] else 0
            )
        else:
            features["has_custom_port"] = 0
            features["has_suspicious_port"] = 0
else:
features["path_length"] = 0
features["query_length"] = 0
features["hostname_length"] = 0
features["path_depth"] = 0
features["num_query_params"] = 0
features["has_fragment"] = 0
features["has_custom_port"] = 0
features["has_suspicious_port"] = 0
# Suspicious keywords (case-insensitive search)
suspicious_keywords = [
"login",
"verify",
"update",
"bank",
"secure",
"account",
"free",
"bonus",
"click",
"offer",
"xml",
"php",
"exe",
"zip",
"wp",
"paypal",
"admin",
"amp",
"signin",
"password",
"confirm",
"suspended",
"billing",
"unlock",
"validate",
"authenticate",
]
url_lower = url.lower()
features["num_suspicious_words"] = sum(
1 for kw in suspicious_keywords if kw in url_lower
)
features["has_suspicious_words"] = 1 if features["num_suspicious_words"] > 0 else 0
    # TLD checks (case-insensitive); match the host suffix rather than a
    # substring anywhere in the URL, so 'paypal.com.evil.xyz' is not counted
    popular_tlds = [".com", ".org", ".net", ".edu", ".gov"]
    host_lower = (
        parsed.netloc.lower().split(":")[0] if parsed and parsed.netloc else url_lower
    )
    features["has_popular_tld"] = (
        1 if any(host_lower.endswith(tld) for tld in popular_tlds) else 0
    )
# Protocol checks
features["has_https"] = 1 if url.lower().startswith("https://") else 0
features["has_http"] = 1 if url.lower().startswith("http://") else 0
# IP address in URL (potential phishing indicator)
features["has_ip_address"] = 1 if re.search(r"(\d{1,3}\.){3}\d{1,3}", url) else 0
    # Shortened URL services (match the host suffix to avoid false hits
    # such as 't.co' inside 'nyt.com')
    shorteners = ["bit.ly", "tinyurl.com", "goo.gl", "t.co", "ow.ly", "is.gd", "buff.ly"]
    features["is_shortened"] = (
        1
        if any(host_lower == s or host_lower.endswith("." + s) for s in shorteners)
        else 0
    )
# NEW: Double slash in path (suspicious)
features["has_double_slash"] = 1 if "//" in url.replace("://", "") else 0
# NEW: Prefix/suffix separator (e.g., paypal-secure.com)
if parsed and parsed.netloc:
features["has_prefix_suffix"] = 1 if "-" in parsed.netloc else 0
else:
features["has_prefix_suffix"] = 0
return features
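# Illustrative sanity check on a hypothetical URL (not called by the app):
# 'https://www.paypal-secure.xyz/login' should trip the suspicious-TLD,
# hyphenated-host, and keyword features defined above.
def _demo_extract_features():
    f = extract_features("https://www.paypal-secure.xyz/login")
    return (
        f["has_suspicious_tld"],  # 1: the host ends with '.xyz'
        f["has_prefix_suffix"],  # 1: '-' in 'www.paypal-secure.xyz'
        f["num_suspicious_words"],  # 3: 'login', 'secure', 'paypal'
    )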
# Load the trained model (a tree-based classifier, so no feature scaling is needed)
@st.cache_resource
def load_model():
    # joblib deserializes the scikit-learn model saved at training time
    return joblib.load("rf_tuned_model.joblib")
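# Optional defensive sketch: scikit-learn estimators fitted on a DataFrame
# (scikit-learn >= 1.0) expose feature_names_in_, and reordering incoming
# columns against it guards check_url below against drift in feature order.
# The helper name align_features is illustrative and is not wired in anywhere.
def align_features(df, model):
    # Fall back to the DataFrame as-is when the attribute is unavailable
    names = getattr(model, "feature_names_in_", None)
    return df[list(names)] if names is not None else df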
def check_url(url, model):
    # Extract features and wrap them in a one-row DataFrame; the column order
    # follows the dict insertion order in extract_features, which must match
    # the order used at training time
    features_df = pd.DataFrame([extract_features(url)])
    # Class prediction and class probabilities
    prediction = model.predict(features_df)
    prediction_proba = model.predict_proba(features_df)
    return prediction[0], prediction_proba[0]
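# Note: predict_proba columns follow model.classes_; the UI below assumes
# classes_ == [0, 1], i.e. column 0 = legitimate and column 1 = phishing.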
def main():
st.set_page_config(
page_title="Phishing URL Detector", page_icon="πŸ”’", layout="centered"
)
st.title("πŸ”’ Phishing URL Detector")
st.write(
"Enter a URL below to check if it's legitimate or potentially a phishing attempt."
)
    # Load the trained model
    try:
        model = load_model()
except Exception as e:
st.error(f"Error loading model: {e}")
return
    # URL input (keyed so the value is also available via st.session_state.url)
    url = st.text_input(
        "Enter a URL to check:", placeholder="https://example.com", key="url"
    )
# Check button
if st.button("Check URL", type="primary"):
if url:
with st.spinner("Analyzing URL..."):
prediction, prediction_proba = check_url(url, model)
st.divider()
# Display result
if prediction == 1:
st.error("🚨 **Warning! This looks like a Phishing URL.**")
st.write(
"This URL exhibits characteristics commonly found in phishing attempts. Please be cautious!"
)
# Display confidence for phishing
phishing_confidence = prediction_proba[1] * 100
st.metric("Phishing Confidence", f"{phishing_confidence:.2f}%")
else:
st.success("βœ… **Safe! This seems like a Legitimate URL.**")
st.write("This URL appears to be legitimate based on our analysis.")
# Display confidence for legitimate
legitimate_confidence = prediction_proba[0] * 100
st.metric("Legitimate Confidence", f"{legitimate_confidence:.2f}%")
# Show probability breakdown
st.divider()
st.subheader("πŸ“Š Prediction Probabilities")
col1, col2 = st.columns(2)
with col1:
st.metric("Legitimate", f"{prediction_proba[0] * 100:.2f}%")
with col2:
st.metric("Phishing", f"{prediction_proba[1] * 100:.2f}%")
# Visual progress bars for probabilities
st.write("**Probability Distribution:**")
st.progress(
prediction_proba[0],
text=f"Legitimate: {prediction_proba[0] * 100:.2f}%",
)
st.progress(
prediction_proba[1],
text=f"Phishing: {prediction_proba[1] * 100:.2f}%",
)
# Show some extracted features
with st.expander("View URL Analysis Details"):
features = extract_features(url)
# Basic Metrics
st.subheader("πŸ“Š Basic Metrics")
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("URL Length", features["url_length"])
st.metric("Dots", features["num_dots"])
with col2:
st.metric("Hyphens", features["num_hyphens"])
st.metric("Slashes", features["num_slash"])
with col3:
st.metric("@ Symbols", features["num_at"])
st.metric("Subdomains", features["num_subdomains"])
with col4:
st.metric("Special Chars", features["num_special_chars"])
st.metric("Numeric Chars", features["num_numeric"])
st.divider()
# Security Indicators
st.subheader("πŸ” Security Indicators")
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric(
"HTTPS", "βœ… Yes" if features["has_https"] else "❌ No"
)
with col2:
st.metric(
"Popular TLD",
"βœ… Yes" if features["has_popular_tld"] else "❌ No",
)
with col3:
st.metric(
"IP Address",
"⚠️ Yes" if features["has_ip_address"] else "βœ… No",
)
with col4:
st.metric(
"URL Shortener",
"⚠️ Yes" if features["is_shortened"] else "βœ… No",
)
st.divider()
# Suspicious Indicators
st.subheader("⚠️ Suspicious Indicators")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Suspicious Words", features["num_suspicious_words"])
with col2:
st.metric("Domain Entropy", f"{features['domain_entropy']:.2f}")
with col3:
st.metric("Has WWW", "Yes" if features["has_www"] else "No")
st.divider()
# Length Breakdown
st.subheader("πŸ“ Component Lengths")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Hostname Length", features["hostname_length"])
with col2:
st.metric("Path Length", features["path_length"])
with col3:
st.metric("Query Length", features["query_length"])
st.divider()
# Ratios
st.subheader("πŸ“ˆ Feature Ratios")
col1, col2, col3 = st.columns(3)
with col1:
st.metric(
"Digit Ratio", f"{features['digit_to_length_ratio']:.3f}"
)
st.metric(
"Letter Ratio", f"{features['letters_to_length_ratio']:.3f}"
)
with col2:
st.metric(
"Special Char Ratio",
f"{features['special_to_length_ratio']:.3f}",
)
st.metric(
"Hyphen Ratio", f"{features['hyphens_to_length_ratio']:.3f}"
)
with col3:
st.metric(
"Dot Ratio", f"{features['dots_to_length_ratio']:.3f}"
)
else:
st.warning("Please enter a URL to check.")
# Footer
st.divider()
st.caption(
"⚠️ This tool uses machine learning to detect potential phishing URLs. Always exercise caution when clicking on unfamiliar links."
)
if __name__ == "__main__":
main()