# Streamlit app: phishing URL detector.
# (Header cleaned up: original lines were Hugging Face Spaces page-scrape
# residue — "Spaces: / Sleeping / Sleeping" — not part of the program.)
| import streamlit as st | |
| import pandas as pd | |
| import re | |
| import joblib | |
| import re | |
| from urllib.parse import urlparse | |
| # Feature extraction function | |
| def extract_features(url): | |
| features = {} | |
| try: | |
| parsed = urlparse(url) | |
| except: | |
| # If URL parsing fails, use original url | |
| parsed = None | |
| # Basic counts | |
| features["url_length"] = len(url) | |
| features["num_dots"] = url.count(".") | |
| features["num_hyphens"] = url.count("-") | |
| features["num_at"] = url.count("@") | |
| features["num_equal"] = url.count("=") | |
| features["num_slash"] = url.count("/") | |
| features["num_question"] = url.count("?") | |
| features["num_ampersand"] = url.count("&") | |
| features["num_percent"] = url.count("%") | |
| features["num_underscore"] = url.count("_") | |
| features["num_tilde"] = url.count("~") | |
| features["num_semicolon"] = url.count(";") | |
| # Check for www (case-insensitive, whole word) | |
| features["has_www"] = 1 if re.search(r"\bwww\b", url.lower()) else 0 | |
| # Special characters count (excluding protocol and www prefix) | |
| url_without_protocol = re.sub(r"^https?://(www\.)?", "", url, flags=re.IGNORECASE) | |
| features["num_special_chars"] = len( | |
| re.findall(r"[^a-zA-Z0-9]", url_without_protocol) | |
| ) | |
| # TLD detection (more accurate pattern) | |
| # Matches .com, .co.uk, .org, etc. | |
| tld_matches = re.findall(r"\.[a-z]{2,}(?:\.[a-z]{2,})?(?=[/?#]|$)", url.lower()) | |
| features["num_tld"] = len(tld_matches) | |
| # Extract actual TLD (rightmost domain component) | |
| if parsed and parsed.netloc: | |
| domain_parts = parsed.netloc.lower().split(".") | |
| if len(domain_parts) >= 2: | |
| # Get TLD length (e.g., 'com' = 3, 'co.uk' = 5) | |
| features["tld_length"] = len(domain_parts[-1]) | |
| # Check for suspicious/uncommon TLDs | |
| suspicious_tlds = [ | |
| ".xyz", | |
| ".top", | |
| ".tk", | |
| ".ml", | |
| ".ga", | |
| ".cf", | |
| ".gq", | |
| ".pw", | |
| ".cc", | |
| ] | |
| features["has_suspicious_tld"] = ( | |
| 1 if any(tld in url.lower() for tld in suspicious_tlds) else 0 | |
| ) | |
| else: | |
| features["tld_length"] = 0 | |
| features["has_suspicious_tld"] = 0 | |
| else: | |
| features["tld_length"] = 0 | |
| features["has_suspicious_tld"] = 0 | |
| # Subdomain counting (more accurate) | |
| if parsed and parsed.netloc: | |
| domain_parts = parsed.netloc.split(".") | |
| # Remove www from count if present | |
| if domain_parts and domain_parts[0].lower() == "www": | |
| domain_parts = domain_parts[1:] | |
| # Subdomains = total parts - 2 (domain + tld) or - 3 for compound TLDs like .co.uk | |
| # Simplified: count dots in netloc minus 1 (for main domain.tld) | |
| features["num_subdomains"] = max(0, parsed.netloc.count(".") - 1) | |
| else: | |
| # Fallback to original logic | |
| features["num_subdomains"] = max(0, url.count(".") - 1) | |
| # ========== NEW: Main Domain Structure Analysis ========== | |
| if parsed and parsed.netloc: | |
| netloc = parsed.netloc.lower() | |
| # Remove port if present | |
| netloc_no_port = netloc.split(":")[0] | |
| domain_parts = netloc_no_port.split(".") | |
| # Extract main domain (second-level domain) | |
| if len(domain_parts) >= 2: | |
| # Skip www if present | |
| if domain_parts[0] == "www": | |
| main_domain = ( | |
| domain_parts[1] if len(domain_parts) > 2 else domain_parts[0] | |
| ) | |
| else: | |
| main_domain = domain_parts[-2] # e.g., 'google' from 'www.google.com' | |
| # Main domain features | |
| features["domain_length"] = len(main_domain) | |
| features["domain_has_digits"] = ( | |
| 1 if any(c.isdigit() for c in main_domain) else 0 | |
| ) | |
| features["domain_digit_count"] = sum(c.isdigit() for c in main_domain) | |
| features["domain_has_hyphen"] = 1 if "-" in main_domain else 0 | |
| features["domain_hyphen_count"] = main_domain.count("-") | |
| # Domain entropy (randomness measure) | |
| def calculate_entropy(text): | |
| from collections import Counter | |
| import math | |
| if not text: | |
| return 0 | |
| counts = Counter(text) | |
| length = len(text) | |
| entropy = -sum( | |
| (count / length) * math.log2(count / length) | |
| for count in counts.values() | |
| ) | |
| return entropy | |
| features["domain_entropy"] = calculate_entropy(main_domain) | |
| # Vowel/consonant ratio in domain (random domains often have unusual ratios) | |
| vowels = "aeiou" | |
| vowel_count = sum(1 for c in main_domain if c in vowels) | |
| consonant_count = sum( | |
| 1 for c in main_domain if c.isalpha() and c not in vowels | |
| ) | |
| total_letters = vowel_count + consonant_count | |
| features["domain_vowel_ratio"] = ( | |
| vowel_count / total_letters if total_letters > 0 else 0 | |
| ) | |
| # Check for repeated characters (e.g., 'gooogle', 'payppal') | |
| max_consecutive = max( | |
| ( | |
| len(list(group)) | |
| for char, group in __import__("itertools").groupby(main_domain) | |
| ), | |
| default=0, | |
| ) | |
| features["domain_max_consecutive_chars"] = max_consecutive | |
| # Check if domain name mimics popular brands (typosquatting detection) | |
| popular_brands = [ | |
| "google", | |
| "facebook", | |
| "paypal", | |
| "amazon", | |
| "microsoft", | |
| "apple", | |
| "netflix", | |
| "instagram", | |
| "twitter", | |
| "linkedin", | |
| "ebay", | |
| "yahoo", | |
| ] | |
| # Calculate minimum edit distance to popular brands | |
| def levenshtein_distance(s1, s2): | |
| if len(s1) < len(s2): | |
| return levenshtein_distance(s2, s1) | |
| if len(s2) == 0: | |
| return len(s1) | |
| previous_row = range(len(s2) + 1) | |
| for i, c1 in enumerate(s1): | |
| current_row = [i + 1] | |
| for j, c2 in enumerate(s2): | |
| insertions = previous_row[j + 1] + 1 | |
| deletions = current_row[j] + 1 | |
| substitutions = previous_row[j] + (c1 != c2) | |
| current_row.append(min(insertions, deletions, substitutions)) | |
| previous_row = current_row | |
| return previous_row[-1] | |
| min_brand_distance = min( | |
| (levenshtein_distance(main_domain, brand) for brand in popular_brands), | |
| default=10, | |
| ) | |
| features["min_brand_edit_distance"] = min_brand_distance | |
| features["is_brand_typosquat"] = 1 if 0 < min_brand_distance <= 2 else 0 | |
| else: | |
| features["domain_length"] = 0 | |
| features["domain_has_digits"] = 0 | |
| features["domain_digit_count"] = 0 | |
| features["domain_has_hyphen"] = 0 | |
| features["domain_hyphen_count"] = 0 | |
| features["domain_entropy"] = 0 | |
| features["domain_vowel_ratio"] = 0 | |
| features["domain_max_consecutive_chars"] = 0 | |
| features["min_brand_edit_distance"] = 10 | |
| features["is_brand_typosquat"] = 0 | |
| else: | |
| features["domain_length"] = 0 | |
| features["domain_has_digits"] = 0 | |
| features["domain_digit_count"] = 0 | |
| features["domain_has_hyphen"] = 0 | |
| features["domain_hyphen_count"] = 0 | |
| features["domain_entropy"] = 0 | |
| features["domain_vowel_ratio"] = 0 | |
| features["domain_max_consecutive_chars"] = 0 | |
| features["min_brand_edit_distance"] = 10 | |
| features["is_brand_typosquat"] = 0 | |
| # ========== END NEW FEATURES ========== | |
| # Character type counts | |
| features["num_numeric"] = len(re.findall(r"\d", url)) | |
| features["num_letters"] = len(re.findall(r"[a-zA-Z]", url)) | |
| features["num_uppercase"] = len(re.findall(r"[A-Z]", url)) | |
| features["num_lowercase"] = len(re.findall(r"[a-z]", url)) | |
| # Ratios (avoid division by zero) | |
| url_len = features["url_length"] if features["url_length"] > 0 else 1 | |
| features["digit_to_length_ratio"] = features["num_numeric"] / url_len | |
| features["letters_to_length_ratio"] = features["num_letters"] / url_len | |
| features["special_to_length_ratio"] = features["num_special_chars"] / url_len | |
| features["hyphens_to_length_ratio"] = features["num_hyphens"] / url_len | |
| features["dots_to_length_ratio"] = features["num_dots"] / url_len | |
| # Path and query length | |
| if parsed: | |
| features["path_length"] = len(parsed.path) if parsed.path else 0 | |
| features["query_length"] = len(parsed.query) if parsed.query else 0 | |
| features["hostname_length"] = len(parsed.netloc) if parsed.netloc else 0 | |
| # NEW: Path depth (number of directories) | |
| features["path_depth"] = ( | |
| len([p for p in parsed.path.split("/") if p]) if parsed.path else 0 | |
| ) | |
| # NEW: Number of parameters in query string | |
| features["num_query_params"] = ( | |
| len(parsed.query.split("&")) if parsed.query else 0 | |
| ) | |
| # NEW: Fragment presence | |
| features["has_fragment"] = 1 if parsed.fragment else 0 | |
| # NEW: Port presence and suspicious ports | |
| if parsed.port: | |
| features["has_custom_port"] = 1 | |
| # Common suspicious ports for phishing | |
| features["has_suspicious_port"] = ( | |
| 1 if parsed.port in [8080, 8888, 3000, 4443, 5000] else 0 | |
| ) | |
| else: | |
| features["has_custom_port"] = 0 | |
| features["has_suspicious_port"] = 0 | |
| else: | |
| features["path_length"] = 0 | |
| features["query_length"] = 0 | |
| features["hostname_length"] = 0 | |
| features["path_depth"] = 0 | |
| features["num_query_params"] = 0 | |
| features["has_fragment"] = 0 | |
| features["has_custom_port"] = 0 | |
| features["has_suspicious_port"] = 0 | |
| # Suspicious keywords (case-insensitive search) | |
| suspicious_keywords = [ | |
| "login", | |
| "verify", | |
| "update", | |
| "bank", | |
| "secure", | |
| "account", | |
| "free", | |
| "bonus", | |
| "click", | |
| "offer", | |
| "xml", | |
| "php", | |
| "exe", | |
| "zip", | |
| "wp", | |
| "paypal", | |
| "admin", | |
| "amp", | |
| "signin", | |
| "password", | |
| "confirm", | |
| "suspended", | |
| "billing", | |
| "unlock", | |
| "validate", | |
| "authenticate", | |
| ] | |
| url_lower = url.lower() | |
| features["num_suspicious_words"] = sum( | |
| 1 for kw in suspicious_keywords if kw in url_lower | |
| ) | |
| features["has_suspicious_words"] = 1 if features["num_suspicious_words"] > 0 else 0 | |
| # TLD checks (case-insensitive) | |
| popular_tlds = [".com", ".org", ".net", ".edu", ".gov"] | |
| features["has_popular_tld"] = ( | |
| 1 if any(tld in url_lower for tld in popular_tlds) else 0 | |
| ) | |
| # Protocol checks | |
| features["has_https"] = 1 if url.lower().startswith("https://") else 0 | |
| features["has_http"] = 1 if url.lower().startswith("http://") else 0 | |
| # IP address in URL (potential phishing indicator) | |
| features["has_ip_address"] = 1 if re.search(r"(\d{1,3}\.){3}\d{1,3}", url) else 0 | |
| # Shortened URL services | |
| shorteners = ["bit.ly", "tinyurl", "goo.gl", "t.co", "ow.ly", "is.gd", "buff.ly"] | |
| features["is_shortened"] = ( | |
| 1 if any(short in url_lower for short in shorteners) else 0 | |
| ) | |
| # NEW: Double slash in path (suspicious) | |
| features["has_double_slash"] = 1 if "//" in url.replace("://", "") else 0 | |
| # NEW: Prefix/suffix separator (e.g., paypal-secure.com) | |
| if parsed and parsed.netloc: | |
| features["has_prefix_suffix"] = 1 if "-" in parsed.netloc else 0 | |
| else: | |
| features["has_prefix_suffix"] = 0 | |
| return features | |
| # Load model and prepare scaler | |
| def load_model_and_scaler(): | |
| # Load the trained model | |
| model = joblib.load("rf_tuned_model.joblib") | |
| return model | |
| def check_url(url, model): | |
| # Extract features | |
| predict_features = extract_features(url) | |
| predict_features_df = pd.DataFrame([predict_features]) | |
| # Transform with scaler | |
| predict_features_scaled = predict_features_df | |
| # Make prediction | |
| prediction = model.predict(predict_features_scaled) | |
| # Get prediction probabilities | |
| prediction_proba = model.predict_proba(predict_features_scaled) | |
| return prediction[0], prediction_proba[0] | |
| def main(): | |
| st.set_page_config( | |
| page_title="Phishing URL Detector", page_icon="π", layout="centered" | |
| ) | |
| st.title("π Phishing URL Detector") | |
| st.write( | |
| "Enter a URL below to check if it's legitimate or potentially a phishing attempt." | |
| ) | |
| # Load model and scaler | |
| try: | |
| model = load_model_and_scaler() | |
| except Exception as e: | |
| st.error(f"Error loading model: {e}") | |
| return | |
| # URL input | |
| url = st.text_input("Enter a URL to check:", placeholder="https://example.com") | |
| # Use session state URL if set | |
| if "url" in st.session_state and st.session_state.url: | |
| url = st.session_state.url | |
| # Check button | |
| if st.button("Check URL", type="primary"): | |
| if url: | |
| with st.spinner("Analyzing URL..."): | |
| prediction, prediction_proba = check_url(url, model) | |
| st.divider() | |
| # Display result | |
| if prediction == 1: | |
| st.error("π¨ **Warning! This looks like a Phishing URL.**") | |
| st.write( | |
| "This URL exhibits characteristics commonly found in phishing attempts. Please be cautious!" | |
| ) | |
| # Display confidence for phishing | |
| phishing_confidence = prediction_proba[1] * 100 | |
| st.metric("Phishing Confidence", f"{phishing_confidence:.2f}%") | |
| else: | |
| st.success("β **Safe! This seems like a Legitimate URL.**") | |
| st.write("This URL appears to be legitimate based on our analysis.") | |
| # Display confidence for legitimate | |
| legitimate_confidence = prediction_proba[0] * 100 | |
| st.metric("Legitimate Confidence", f"{legitimate_confidence:.2f}%") | |
| # Show probability breakdown | |
| st.divider() | |
| st.subheader("π Prediction Probabilities") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.metric("Legitimate", f"{prediction_proba[0] * 100:.2f}%") | |
| with col2: | |
| st.metric("Phishing", f"{prediction_proba[1] * 100:.2f}%") | |
| # Visual progress bars for probabilities | |
| st.write("**Probability Distribution:**") | |
| st.progress( | |
| prediction_proba[0], | |
| text=f"Legitimate: {prediction_proba[0] * 100:.2f}%", | |
| ) | |
| st.progress( | |
| prediction_proba[1], | |
| text=f"Phishing: {prediction_proba[1] * 100:.2f}%", | |
| ) | |
| # Show some extracted features | |
| with st.expander("View URL Analysis Details"): | |
| features = extract_features(url) | |
| # Basic Metrics | |
| st.subheader("π Basic Metrics") | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("URL Length", features["url_length"]) | |
| st.metric("Dots", features["num_dots"]) | |
| with col2: | |
| st.metric("Hyphens", features["num_hyphens"]) | |
| st.metric("Slashes", features["num_slash"]) | |
| with col3: | |
| st.metric("@ Symbols", features["num_at"]) | |
| st.metric("Subdomains", features["num_subdomains"]) | |
| with col4: | |
| st.metric("Special Chars", features["num_special_chars"]) | |
| st.metric("Numeric Chars", features["num_numeric"]) | |
| st.divider() | |
| # Security Indicators | |
| st.subheader("π Security Indicators") | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric( | |
| "HTTPS", "β Yes" if features["has_https"] else "β No" | |
| ) | |
| with col2: | |
| st.metric( | |
| "Popular TLD", | |
| "β Yes" if features["has_popular_tld"] else "β No", | |
| ) | |
| with col3: | |
| st.metric( | |
| "IP Address", | |
| "β οΈ Yes" if features["has_ip_address"] else "β No", | |
| ) | |
| with col4: | |
| st.metric( | |
| "URL Shortener", | |
| "β οΈ Yes" if features["is_shortened"] else "β No", | |
| ) | |
| st.divider() | |
| # Suspicious Indicators | |
| st.subheader("β οΈ Suspicious Indicators") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Suspicious Words", features["num_suspicious_words"]) | |
| with col2: | |
| st.metric("Domain Entropy", f"{features['domain_entropy']:.2f}") | |
| with col3: | |
| st.metric("Has WWW", "Yes" if features["has_www"] else "No") | |
| st.divider() | |
| # Length Breakdown | |
| st.subheader("π Component Lengths") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Hostname Length", features["hostname_length"]) | |
| with col2: | |
| st.metric("Path Length", features["path_length"]) | |
| with col3: | |
| st.metric("Query Length", features["query_length"]) | |
| st.divider() | |
| # Ratios | |
| st.subheader("π Feature Ratios") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric( | |
| "Digit Ratio", f"{features['digit_to_length_ratio']:.3f}" | |
| ) | |
| st.metric( | |
| "Letter Ratio", f"{features['letters_to_length_ratio']:.3f}" | |
| ) | |
| with col2: | |
| st.metric( | |
| "Special Char Ratio", | |
| f"{features['special_to_length_ratio']:.3f}", | |
| ) | |
| st.metric( | |
| "Hyphen Ratio", f"{features['hyphens_to_length_ratio']:.3f}" | |
| ) | |
| with col3: | |
| st.metric( | |
| "Dot Ratio", f"{features['dots_to_length_ratio']:.3f}" | |
| ) | |
| else: | |
| st.warning("Please enter a URL to check.") | |
| # Footer | |
| st.divider() | |
| st.caption( | |
| "β οΈ This tool uses machine learning to detect potential phishing URLs. Always exercise caution when clicking on unfamiliar links." | |
| ) | |
| if __name__ == "__main__": | |
| main() | |