# Streamlit app: phishing URL detector.
# (Header cleaned up: original lines were Hugging Face Spaces page-scrape
# residue — "Spaces: / Sleeping / Sleeping" — not part of the program.)
| import streamlit as st | |
| import pandas as pd | |
| import re | |
| import joblib | |
| import re | |
| from urllib.parse import urlparse | |
| # Feature extraction function | |
| def extract_features(url): | |
| features = {} | |
| try: | |
| parsed = urlparse(url) | |
| except: | |
| # If URL parsing fails, use original url | |
| parsed = None | |
| # Basic counts | |
| features["url_length"] = len(url) | |
| features["num_dots"] = url.count(".") | |
| features["num_hyphens"] = url.count("-") | |
| features["num_at"] = url.count("@") | |
| features["num_equal"] = url.count("=") | |
| features["num_slash"] = url.count("/") | |
| features["num_question"] = url.count("?") | |
| features["num_ampersand"] = url.count("&") | |
| features["num_percent"] = url.count("%") | |
| features["num_underscore"] = url.count("_") | |
| features["num_tilde"] = url.count("~") | |
| features["num_semicolon"] = url.count(";") | |
| # Check for www (case-insensitive, whole word) | |
| features["has_www"] = 1 if re.search(r"\bwww\b", url.lower()) else 0 | |
| # Special characters count (excluding protocol and www prefix) | |
| url_without_protocol = re.sub(r"^https?://(www\.)?", "", url, flags=re.IGNORECASE) | |
| features["num_special_chars"] = len( | |
| re.findall(r"[^a-zA-Z0-9]", url_without_protocol) | |
| ) | |
| # TLD detection (more accurate pattern) | |
| # Matches .com, .co.uk, .org, etc. | |
| tld_matches = re.findall(r"\.[a-z]{2,}(?:\.[a-z]{2,})?(?=[/?#]|$)", url.lower()) | |
| features["num_tld"] = len(tld_matches) | |
| # Extract actual TLD (rightmost domain component) | |
| if parsed and parsed.netloc: | |
| domain_parts = parsed.netloc.lower().split(".") | |
| if len(domain_parts) >= 2: | |
| # Get TLD length (e.g., 'com' = 3, 'co.uk' = 5) | |
| features["tld_length"] = len(domain_parts[-1]) | |
| # Check for suspicious/uncommon TLDs | |
| suspicious_tlds = [ | |
| ".xyz", | |
| ".top", | |
| ".tk", | |
| ".ml", | |
| ".ga", | |
| ".cf", | |
| ".gq", | |
| ".pw", | |
| ".cc", | |
| ] | |
| features["has_suspicious_tld"] = ( | |
| 1 if any(tld in url.lower() for tld in suspicious_tlds) else 0 | |
| ) | |
| else: | |
| features["tld_length"] = 0 | |
| features["has_suspicious_tld"] = 0 | |
| else: | |
| features["tld_length"] = 0 | |
| features["has_suspicious_tld"] = 0 | |
| # Subdomain counting (more accurate) | |
| if parsed and parsed.netloc: | |
| domain_parts = parsed.netloc.split(".") | |
| # Remove www from count if present | |
| if domain_parts and domain_parts[0].lower() == "www": | |
| domain_parts = domain_parts[1:] | |
| # Subdomains = total parts - 2 (domain + tld) or - 3 for compound TLDs like .co.uk | |
| # Simplified: count dots in netloc minus 1 (for main domain.tld) | |
| features["num_subdomains"] = max(0, parsed.netloc.count(".") - 1) | |
| else: | |
| # Fallback to original logic | |
| features["num_subdomains"] = max(0, url.count(".") - 1) | |
| # ========== NEW: Main Domain Structure Analysis ========== | |
| if parsed and parsed.netloc: | |
| netloc = parsed.netloc.lower() | |
| # Remove port if present | |
| netloc_no_port = netloc.split(":")[0] | |
| domain_parts = netloc_no_port.split(".") | |
| # Extract main domain (second-level domain) | |
| if len(domain_parts) >= 2: | |
| # Skip www if present | |
| if domain_parts[0] == "www": | |
| main_domain = ( | |
| domain_parts[1] if len(domain_parts) > 2 else domain_parts[0] | |
| ) | |
| else: | |
| main_domain = domain_parts[-2] # e.g., 'google' from 'www.google.com' | |
| # Main domain features | |
| features["domain_length"] = len(main_domain) | |
| features["domain_has_digits"] = ( | |
| 1 if any(c.isdigit() for c in main_domain) else 0 | |
| ) | |
| features["domain_digit_count"] = sum(c.isdigit() for c in main_domain) | |
| features["domain_has_hyphen"] = 1 if "-" in main_domain else 0 | |
| features["domain_hyphen_count"] = main_domain.count("-") | |
| # Domain entropy (randomness measure) | |
| def calculate_entropy(text): | |
| from collections import Counter | |
| import math | |
| if not text: | |
| return 0 | |
| counts = Counter(text) | |
| length = len(text) | |
| entropy = -sum( | |
| (count / length) * math.log2(count / length) | |
| for count in counts.values() | |
| ) | |
| return entropy | |
| features["domain_entropy"] = calculate_entropy(main_domain) | |
| # Vowel/consonant ratio in domain (random domains often have unusual ratios) | |
| vowels = "aeiou" | |
| vowel_count = sum(1 for c in main_domain if c in vowels) | |
| consonant_count = sum( | |
| 1 for c in main_domain if c.isalpha() and c not in vowels | |
| ) | |
| total_letters = vowel_count + consonant_count | |
| features["domain_vowel_ratio"] = ( | |
| vowel_count / total_letters if total_letters > 0 else 0 | |
| ) | |
| # Check for repeated characters (e.g., 'gooogle', 'payppal') | |
| max_consecutive = max( | |
| ( | |
| len(list(group)) | |
| for char, group in __import__("itertools").groupby(main_domain) | |
| ), | |
| default=0, | |
| ) | |
| features["domain_max_consecutive_chars"] = max_consecutive | |
| # Check if domain name mimics popular brands (typosquatting detection) | |
| popular_brands = [ | |
| "google", | |
| "facebook", | |
| "paypal", | |
| "amazon", | |
| "microsoft", | |
| "apple", | |
| "netflix", | |
| "instagram", | |
| "twitter", | |
| "linkedin", | |
| "ebay", | |
| "yahoo", | |
| ] | |
| # Calculate minimum edit distance to popular brands | |
| def levenshtein_distance(s1, s2): | |
| if len(s1) < len(s2): | |
| return levenshtein_distance(s2, s1) | |
| if len(s2) == 0: | |
| return len(s1) | |
| previous_row = range(len(s2) + 1) | |
| for i, c1 in enumerate(s1): | |
| current_row = [i + 1] | |
| for j, c2 in enumerate(s2): | |
| insertions = previous_row[j + 1] + 1 | |
| deletions = current_row[j] + 1 | |
| substitutions = previous_row[j] + (c1 != c2) | |
| current_row.append(min(insertions, deletions, substitutions)) | |
| previous_row = current_row | |
| return previous_row[-1] | |
| min_brand_distance = min( | |
| (levenshtein_distance(main_domain, brand) for brand in popular_brands), | |
| default=10, | |
| ) | |
| features["min_brand_edit_distance"] = min_brand_distance | |
| features["is_brand_typosquat"] = 1 if 0 < min_brand_distance <= 2 else 0 | |
| else: | |
| features["domain_length"] = 0 | |
| features["domain_has_digits"] = 0 | |
| features["domain_digit_count"] = 0 | |
| features["domain_has_hyphen"] = 0 | |
| features["domain_hyphen_count"] = 0 | |
| features["domain_entropy"] = 0 | |
| features["domain_vowel_ratio"] = 0 | |
| features["domain_max_consecutive_chars"] = 0 | |
| features["min_brand_edit_distance"] = 10 | |
| features["is_brand_typosquat"] = 0 | |
| else: | |
| features["domain_length"] = 0 | |
| features["domain_has_digits"] = 0 | |
| features["domain_digit_count"] = 0 | |
| features["domain_has_hyphen"] = 0 | |
| features["domain_hyphen_count"] = 0 | |
| features["domain_entropy"] = 0 | |
| features["domain_vowel_ratio"] = 0 | |
| features["domain_max_consecutive_chars"] = 0 | |
| features["min_brand_edit_distance"] = 10 | |
| features["is_brand_typosquat"] = 0 | |
| # ========== END NEW FEATURES ========== | |
| # Character type counts | |
| features["num_numeric"] = len(re.findall(r"\d", url)) | |
| features["num_letters"] = len(re.findall(r"[a-zA-Z]", url)) | |
| features["num_uppercase"] = len(re.findall(r"[A-Z]", url)) | |
| features["num_lowercase"] = len(re.findall(r"[a-z]", url)) | |
| # Ratios (avoid division by zero) | |
| url_len = features["url_length"] if features["url_length"] > 0 else 1 | |
| features["digit_to_length_ratio"] = features["num_numeric"] / url_len | |
| features["letters_to_length_ratio"] = features["num_letters"] / url_len | |
| features["special_to_length_ratio"] = features["num_special_chars"] / url_len | |
| features["hyphens_to_length_ratio"] = features["num_hyphens"] / url_len | |
| features["dots_to_length_ratio"] = features["num_dots"] / url_len | |
| # Path and query length | |
| if parsed: | |
| features["path_length"] = len(parsed.path) if parsed.path else 0 | |
| features["query_length"] = len(parsed.query) if parsed.query else 0 | |
| features["hostname_length"] = len(parsed.netloc) if parsed.netloc else 0 | |
| # NEW: Path depth (number of directories) | |
| features["path_depth"] = ( | |
| len([p for p in parsed.path.split("/") if p]) if parsed.path else 0 | |
| ) | |
| # NEW: Number of parameters in query string | |
| features["num_query_params"] = ( | |
| len(parsed.query.split("&")) if parsed.query else 0 | |
| ) | |
| # NEW: Fragment presence | |
| features["has_fragment"] = 1 if parsed.fragment else 0 | |
| # NEW: Port presence and suspicious ports | |
| if parsed.port: | |
| features["has_custom_port"] = 1 | |
| # Common suspicious ports for phishing | |
| features["has_suspicious_port"] = ( | |
| 1 if parsed.port in [8080, 8888, 3000, 4443, 5000] else 0 | |
| ) | |
| else: | |
| features["has_custom_port"] = 0 | |
| features["has_suspicious_port"] = 0 | |
| else: | |
| features["path_length"] = 0 | |
| features["query_length"] = 0 | |
| features["hostname_length"] = 0 | |
| features["path_depth"] = 0 | |
| features["num_query_params"] = 0 | |
| features["has_fragment"] = 0 | |
| features["has_custom_port"] = 0 | |
| features["has_suspicious_port"] = 0 | |
| # Suspicious keywords (case-insensitive search) | |
| suspicious_keywords = [ | |
| "login", | |
| "verify", | |
| "update", | |
| "bank", | |
| "secure", | |
| "account", | |
| "free", | |
| "bonus", | |
| "click", | |
| "offer", | |
| "xml", | |
| "php", | |
| "exe", | |
| "zip", | |
| "wp", | |
| "paypal", | |
| "admin", | |
| "amp", | |
| "signin", | |
| "password", | |
| "confirm", | |
| "suspended", | |
| "billing", | |
| "unlock", | |
| "validate", | |
| "authenticate", | |
| ] | |
| url_lower = url.lower() | |
| features["num_suspicious_words"] = sum( | |
| 1 for kw in suspicious_keywords if kw in url_lower | |
| ) | |
| features["has_suspicious_words"] = 1 if features["num_suspicious_words"] > 0 else 0 | |
| # TLD checks (case-insensitive) | |
| popular_tlds = [".com", ".org", ".net", ".edu", ".gov"] | |
| features["has_popular_tld"] = ( | |
| 1 if any(tld in url_lower for tld in popular_tlds) else 0 | |
| ) | |
| # Protocol checks | |
| features["has_https"] = 1 if url.lower().startswith("https://") else 0 | |
| features["has_http"] = 1 if url.lower().startswith("http://") else 0 | |
| # IP address in URL (potential phishing indicator) | |
| features["has_ip_address"] = 1 if re.search(r"(\d{1,3}\.){3}\d{1,3}", url) else 0 | |
| # Shortened URL services | |
| shorteners = ["bit.ly", "tinyurl", "goo.gl", "t.co", "ow.ly", "is.gd", "buff.ly"] | |
| features["is_shortened"] = ( | |
| 1 if any(short in url_lower for short in shorteners) else 0 | |
| ) | |
| # NEW: Double slash in path (suspicious) | |
| features["has_double_slash"] = 1 if "//" in url.replace("://", "") else 0 | |
| # NEW: Prefix/suffix separator (e.g., paypal-secure.com) | |
| if parsed and parsed.netloc: | |
| features["has_prefix_suffix"] = 1 if "-" in parsed.netloc else 0 | |
| else: | |
| features["has_prefix_suffix"] = 0 | |
| return features | |
| # Load model and prepare scaler | |
| def load_model_and_scaler(): | |
| # Load the trained model | |
| model = joblib.load("rf_tuned_model.joblib") | |
| return model | |
| def check_url(url, model): | |
| # Extract features | |
| predict_features = extract_features(url) | |
| predict_features_df = pd.DataFrame([predict_features]) | |
| # Transform with scaler | |
| predict_features_scaled = predict_features_df | |
| # Make prediction | |
| prediction = model.predict(predict_features_scaled) | |
| # Get prediction probabilities | |
| prediction_proba = model.predict_proba(predict_features_scaled) | |
| return prediction[0], prediction_proba[0] | |
| def main(): | |
| st.set_page_config( | |
| page_title="Phishing URL Detector", page_icon="π", layout="centered" | |
| ) | |
| st.title("π Phishing URL Detector") | |
| st.write( | |
| "Enter a URL below to check if it's legitimate or potentially a phishing attempt." | |
| ) | |
| # Load model and scaler | |
| try: | |
| model = load_model_and_scaler() | |
| except Exception as e: | |
| st.error(f"Error loading model: {e}") | |
| return | |
| # URL input | |
| url = st.text_input("Enter a URL to check:", placeholder="https://example.com") | |
| # Use session state URL if set | |
| if "url" in st.session_state and st.session_state.url: | |
| url = st.session_state.url | |
| # Check button | |
| if st.button("Check URL", type="primary"): | |
| if url: | |
| with st.spinner("Analyzing URL..."): | |
| prediction, prediction_proba = check_url(url, model) | |
| st.divider() | |
| # Display result | |
| if prediction == 1: | |
| st.error("π¨ **Warning! This looks like a Phishing URL.**") | |
| st.write( | |
| "This URL exhibits characteristics commonly found in phishing attempts. Please be cautious!" | |
| ) | |
| # Display confidence for phishing | |
| phishing_confidence = prediction_proba[1] * 100 | |
| st.metric("Phishing Confidence", f"{phishing_confidence:.2f}%") | |
| else: | |
| st.success("β **Safe! This seems like a Legitimate URL.**") | |
| st.write("This URL appears to be legitimate based on our analysis.") | |
| # Display confidence for legitimate | |
| legitimate_confidence = prediction_proba[0] * 100 | |
| st.metric("Legitimate Confidence", f"{legitimate_confidence:.2f}%") | |
| # Show probability breakdown | |
| st.divider() | |
| st.subheader("π Prediction Probabilities") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.metric("Legitimate", f"{prediction_proba[0] * 100:.2f}%") | |
| with col2: | |
| st.metric("Phishing", f"{prediction_proba[1] * 100:.2f}%") | |
| # Visual progress bars for probabilities | |
| st.write("**Probability Distribution:**") | |
| st.progress( | |
| prediction_proba[0], | |
| text=f"Legitimate: {prediction_proba[0] * 100:.2f}%", | |
| ) | |
| st.progress( | |
| prediction_proba[1], | |
| text=f"Phishing: {prediction_proba[1] * 100:.2f}%", | |
| ) | |
| # Show some extracted features | |
| with st.expander("View URL Analysis Details"): | |
| features = extract_features(url) | |
| # Basic Metrics | |
| st.subheader("π Basic Metrics") | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("URL Length", features["url_length"]) | |
| st.metric("Dots", features["num_dots"]) | |
| with col2: | |
| st.metric("Hyphens", features["num_hyphens"]) | |
| st.metric("Slashes", features["num_slash"]) | |
| with col3: | |
| st.metric("@ Symbols", features["num_at"]) | |
| st.metric("Subdomains", features["num_subdomains"]) | |
| with col4: | |
| st.metric("Special Chars", features["num_special_chars"]) | |
| st.metric("Numeric Chars", features["num_numeric"]) | |
| st.divider() | |
| # Security Indicators | |
| st.subheader("π Security Indicators") | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric( | |
| "HTTPS", "β Yes" if features["has_https"] else "β No" | |
| ) | |
| with col2: | |
| st.metric( | |
| "Popular TLD", | |
| "β Yes" if features["has_popular_tld"] else "β No", | |
| ) | |
| with col3: | |
| st.metric( | |
| "IP Address", | |
| "β οΈ Yes" if features["has_ip_address"] else "β No", | |
| ) | |
| with col4: | |
| st.metric( | |
| "URL Shortener", | |
| "β οΈ Yes" if features["is_shortened"] else "β No", | |
| ) | |
| st.divider() | |
| # Suspicious Indicators | |
| st.subheader("β οΈ Suspicious Indicators") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Suspicious Words", features["num_suspicious_words"]) | |
| with col2: | |
| st.metric("Domain Entropy", f"{features['domain_entropy']:.2f}") | |
| with col3: | |
| st.metric("Has WWW", "Yes" if features["has_www"] else "No") | |
| st.divider() | |
| # Length Breakdown | |
| st.subheader("π Component Lengths") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Hostname Length", features["hostname_length"]) | |
| with col2: | |
| st.metric("Path Length", features["path_length"]) | |
| with col3: | |
| st.metric("Query Length", features["query_length"]) | |
| st.divider() | |
| # Ratios | |
| st.subheader("π Feature Ratios") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric( | |
| "Digit Ratio", f"{features['digit_to_length_ratio']:.3f}" | |
| ) | |
| st.metric( | |
| "Letter Ratio", f"{features['letters_to_length_ratio']:.3f}" | |
| ) | |
| with col2: | |
| st.metric( | |
| "Special Char Ratio", | |
| f"{features['special_to_length_ratio']:.3f}", | |
| ) | |
| st.metric( | |
| "Hyphen Ratio", f"{features['hyphens_to_length_ratio']:.3f}" | |
| ) | |
| with col3: | |
| st.metric( | |
| "Dot Ratio", f"{features['dots_to_length_ratio']:.3f}" | |
| ) | |
| else: | |
| st.warning("Please enter a URL to check.") | |
| # Footer | |
| st.divider() | |
| st.caption( | |
| "β οΈ This tool uses machine learning to detect potential phishing URLs. Always exercise caution when clicking on unfamiliar links." | |
| ) | |
| if __name__ == "__main__": | |
| main() | |