# NOTE(review): the "Spaces: / Sleeping / Sleeping" banner removed here was a
# HuggingFace Spaces page-scrape artifact, not part of the source file.
# ============================================================
# PhishGuard v5 - HuggingFace Spaces Deployment
# File : app.py
# Port : 7860 (required by HuggingFace)
#
# Endpoints:
#   GET  /              - health check
#   POST /predict       - predict if URL is phishing or legitimate
#   POST /predict/batch - predict multiple URLs at once
#
# Features: 48 hybrid features
#   42 structural + 2 knowledge-based + 4 pure structural
#
# Author : Uzman Zahid
#          Dublin Business School - 2026
# ============================================================
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import joblib
import json
import re
import math
import pandas as pd
import os

# -- 1. INITIALISE APP ----------------------------------------
# NOTE(review): separator glyphs in the original strings were mojibake
# ("β"); restored here as plain dashes -- confirm intended characters.
app = FastAPI(
    title="PhishGuard v5 API",
    description=(
        "Real-time phishing URL detection using Machine Learning - "
        "48 hybrid features - Uzman Zahid, Dublin Business School 2026"
    ),
    version="5.0.0",
)

# -- 2. CORS --------------------------------------------------
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# contradictory under the CORS spec (Starlette will not send credentials for
# a wildcard origin). Left as-is for the public demo; tighten for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# -- 3. LOAD MODEL --------------------------------------------
# Prefer the compressed artifact when present (smaller Spaces image);
# fall back to the full pickle. Failure to load either is fatal.
print("Loading PhishGuard v5 model...")
model_path = "model_compressed.pkl" if os.path.exists("model_compressed.pkl") else "model.pkl"
try:
    model = joblib.load(model_path)
    print(f"[ok] Model loaded from {model_path}")
except Exception as e:
    print(f"[error] Error loading model: {e}")
    raise

# features.json holds the ordered training-time feature names; prediction
# reindexes the extracted feature dict by this list, so it must load.
try:
    with open("features.json") as f:
        features = json.load(f)
    print(f"[ok] Features loaded: {len(features)} features")
except Exception as e:
    print(f"[error] Error loading features: {e}")
    raise
# -- 4. KNOWLEDGE LOOKUP TABLES -------------------------------
# TLDs with documented high phishing-abuse rates, per the APWG
# Phishing Activity Trends Reports (2016-2024).
SUSPICIOUS_TLDS = set(
    "tk ml ga cf gq xyz top club online site fun icu vip cyou lat "
    "space live pw cc su ws bz name mobi link click download loan win "
    "racing stream trade review accountant science work party "
    "faith date cricket ninja bid webcam rocks country".split()
)

# Second-level domains of well-established legitimate platforms.
KNOWN_SAFE_SLDS = set(
    "google microsoft apple amazon facebook youtube netflix github "
    "stackoverflow wikipedia twitter linkedin instagram spotify stripe "
    "paypal dropbox slack zoom cloudflare netlify vercel heroku "
    "reddit pinterest tiktok discord notion figma canva shopify "
    "wordpress adobe salesforce hubspot openai anthropic huggingface kaggle "
    "coursera udemy edx khanacademy duolingo bbc cnn reuters bloomberg "
    "techcrunch mit stanford harvard oxford cambridge dbs ucd tcd dcu "
    "npmjs pypi kubernetes mozilla docker spinbot ilovepdf smallpdf mp3cut "
    "convertio tinypng grammarly wise revolut coinbase binance "
    "vimeo twitch dailymotion soundcloud gov nasa digitalocean".split()
)
# -- 5. REQUEST / RESPONSE MODELS -----------------------------
class URLRequest(BaseModel):
    """Request body for /predict: a single URL string to classify."""
    url: str
class PredictionResponse(BaseModel):
    """Response body for /predict."""
    url: str           # echo of the submitted URL
    prediction: int    # 0 = legitimate, 1 = phishing
    label: str         # human-readable form of `prediction`
    confidence: float  # winning class probability, rounded to 4 d.p.
    message: str       # user-facing summary of the verdict
# -- 6. FEATURE EXTRACTION (48 HYBRID FEATURES) ---------------
def extract_features(raw_url):
    """
    Extract the 48 hybrid features from any URL format.

    42 structural + 2 knowledge-based + 4 pure structural features.
    The parsing is deliberately string-based (no urllib) and must stay
    identical to the extraction used during model training — do not
    "fix" its quirks without retraining.

    Parameters
    ----------
    raw_url : any
        URL to featurise; coerced to str and stripped.

    Returns
    -------
    dict
        Mapping of feature name -> numeric value (48 entries).

    Raises
    ------
    HTTPException
        400 if any step of the extraction fails.
    """
    try:
        url = str(raw_url).strip()

        # -- Scheme ------------------------------------------------
        has_https = 1 if url.lower().startswith('https://') else 0
        has_http = 1 if url.lower().startswith('http://') else 0

        # -- Parse components (manual splits, not urllib) ----------
        url_no_scheme = url
        if '://' in url:
            url_no_scheme = url.split('://', 1)[1]
        domain_with_port = url_no_scheme.split('/')[0].split('?')[0].split('#')[0]
        domain_clean = domain_with_port.split(':')[0]
        parts = url_no_scheme.split('/', 1)
        path = '/' + parts[1] if len(parts) > 1 else ''
        # query/fragment are taken from the full URL, not the path part
        query = url.split('?', 1)[1].split('#')[0] if '?' in url else ''
        fragment = url.split('#', 1)[1] if '#' in url else ''
        domain_no_www = domain_clean
        if domain_no_www.lower().startswith('www.'):
            domain_no_www = domain_no_www[4:]
        # Normalised form always carries a leading "www." so the subdomain
        # count below is stable regardless of how the URL was typed.
        if domain_clean.lower().startswith('www.'):
            domain_normalised = domain_clean
        else:
            domain_normalised = 'www.' + domain_clean
        domain_parts_full = domain_normalised.split('.')

        # -- TLD and SLD -------------------------------------------
        domain_parts = domain_no_www.split('.')
        tld = domain_parts[-1].lower() if domain_parts else ''
        sld = domain_parts[-2].lower() if len(domain_parts) >= 2 else ''
        # Two-level country TLDs (e.g. "co.uk"): shift the SLD one label
        # left. NOTE: the country-code comparison is case-sensitive here.
        if len(domain_parts) >= 3 and domain_parts[-1] in [
            'uk', 'au', 'in', 'jp', 'nz', 'za', 'br', 'sg', 'ie', 'pk'
        ]:
            tld = f"{domain_parts[-2]}.{domain_parts[-1]}"
            sld = domain_parts[-3].lower() if len(domain_parts) >= 3 else ''

        # -- Whole-URL lexical features ----------------------------
        url_length = len(url)
        number_of_dots_in_url = url.count('.')
        digits_in_url = re.findall(r'\d', url)
        # "repeated" = some digit value occurs more than once anywhere
        having_repeated_digits_in_url = 1 if len(digits_in_url) != len(
            set(digits_in_url)) and len(digits_in_url) > 0 else 0
        number_of_digits_in_url = sum(c.isdigit() for c in url)
        number_of_special_char_in_url = sum(not c.isalnum() for c in url)
        number_of_hyphens_in_url = url.count('-')
        number_of_underline_in_url = url.count('_')
        number_of_slash_in_url = url.count('/')
        number_of_questionmark_in_url = url.count('?')
        number_of_equal_in_url = url.count('=')
        number_of_at_in_url = url.count('@')
        number_of_dollar_in_url = url.count('$')
        number_of_exclamation_in_url = url.count('!')
        number_of_hashtag_in_url = url.count('#')
        number_of_percent_in_url = url.count('%')

        # -- Domain features (on the www-stripped domain) ----------
        domain_length = len(domain_no_www)
        number_of_dots_in_domain = domain_no_www.count('.')
        number_of_hyphens_in_domain = domain_no_www.count('-')
        having_special_characters_in_domain = 1 if re.search(
            r'[^a-zA-Z0-9\.\-]', domain_no_www) else 0
        number_of_special_characters_in_domain = sum(
            not c.isalnum() and c not in '.-' for c in domain_no_www)
        having_digits_in_domain = 1 if any(
            c.isdigit() for c in domain_no_www) else 0
        number_of_digits_in_domain = sum(
            c.isdigit() for c in domain_no_www)
        digits_in_domain = re.findall(r'\d', domain_no_www)
        having_repeated_digits_in_domain = 1 if len(
            digits_in_domain) != len(set(digits_in_domain)) and \
            len(digits_in_domain) > 0 else 0

        # -- Subdomain features (on the www-normalised domain) -----
        # Labels left of the registrable domain; note the prepended
        # "www." counts as one subdomain label by construction.
        number_of_subdomains = max(0, len(domain_parts_full) - 2)
        subdomains = domain_parts_full[:-2] if len(
            domain_parts_full) > 2 else []
        subdomain_depth = len(subdomains)
        having_hyphen_in_subdomain = 1 if any(
            '-' in s for s in subdomains) else 0
        average_subdomain_length = sum(
            len(s) for s in subdomains) / len(subdomains) \
            if subdomains else 0.0
        average_number_of_hyphens_in_subdomain = sum(
            s.count('-') for s in subdomains) / len(subdomains) \
            if subdomains else 0.0
        having_special_characters_in_subdomain = 1 if any(
            re.search(r'[^a-zA-Z0-9\-]', s)
            for s in subdomains) else 0
        number_of_special_characters_in_subdomain = sum(
            sum(not c.isalnum() and c != '-' for c in s)
            for s in subdomains)
        having_digits_in_subdomain = 1 if any(
            any(c.isdigit() for c in s) for s in subdomains) else 0
        number_of_digits_in_subdomain = sum(
            sum(c.isdigit() for c in s) for s in subdomains)
        all_sub_digits = re.findall(
            r'\d', ''.join(subdomains))
        having_repeated_digits_in_subdomain = 1 if len(
            all_sub_digits) != len(set(all_sub_digits)) and \
            len(all_sub_digits) > 0 else 0

        # -- Path/query features -----------------------------------
        having_path = 1 if len(path) > 1 else 0
        path_segments = [p for p in path.split('/') if p]
        path_length = len(path_segments)   # segment count, not chars
        having_query = 1 if len(query) > 0 else 0
        having_fragment = 1 if len(fragment) > 0 else 0
        having_anchor = 1 if '#' in url else 0

        # -- Shannon entropy (bits/char) of URL and domain ---------
        if len(url) > 0:
            prob_url = [url.count(c)/len(url) for c in set(url)]
            entropy_of_url = -sum(p*math.log2(p) for p in prob_url if p > 0)
        else:
            entropy_of_url = 0.0
        if len(domain_no_www) > 0:
            prob_dom = [domain_no_www.count(c)/len(domain_no_www)
                        for c in set(domain_no_www)]
            entropy_of_domain = -sum(p*math.log2(p) for p in prob_dom if p > 0)
        else:
            entropy_of_domain = 0.0

        # -- Feature 43: has_suspicious_tld ------------------------
        # Knowledge-based: APWG-documented high-abuse TLDs.
        has_suspicious_tld = 1 if tld.lower() in SUSPICIOUS_TLDS else 0

        # -- Feature 44: is_known_safe_sld -------------------------
        # Knowledge-based: established legitimate platforms.
        is_known_safe_sld = 1 if sld.lower() in KNOWN_SAFE_SLDS else 0

        # -- Feature 45: consonant_vowel_ratio ---------------------
        # Pure structural: unnatural letter mix is a phishing signal.
        # +1 in the denominator avoids division by zero.
        vowels = set('aeiouAEIOU')
        letters = [c for c in domain_no_www if c.isalpha()]
        vowel_count = sum(1 for c in letters if c in vowels)
        consonant_count = sum(1 for c in letters if c not in vowels)
        consonant_vowel_ratio = round(
            consonant_count / (vowel_count + 1), 4)

        # -- Feature 46: longest_digit_sequence --------------------
        # Pure structural: long digit runs suggest generated domains.
        digit_sequences = re.findall(r'\d+', domain_no_www)
        longest_digit_seq = max(
            (len(s) for s in digit_sequences), default=0)

        # -- Feature 47: digit_letter_ratio ------------------------
        # Pure structural: digit-heavy domains lean phishing.
        alpha_count = sum(c.isalpha() for c in domain_no_www)
        digit_count = sum(c.isdigit() for c in domain_no_www)
        digit_letter_ratio = round(
            digit_count / (alpha_count + 1), 4)

        # -- Feature 48: path_to_url_ratio -------------------------
        # Pure structural: bare phishing domains have ratio == 0.
        path_to_url_ratio = round(
            len(path) / len(url), 4) if len(url) > 0 else 0.0

        return {
            'has_https': has_https,
            'has_http': has_http,
            'url_length': url_length,
            'number_of_dots_in_url': number_of_dots_in_url,
            'having_repeated_digits_in_url': having_repeated_digits_in_url,
            'number_of_digits_in_url': number_of_digits_in_url,
            'number_of_special_char_in_url': number_of_special_char_in_url,
            'number_of_hyphens_in_url': number_of_hyphens_in_url,
            'number_of_underline_in_url': number_of_underline_in_url,
            'number_of_slash_in_url': number_of_slash_in_url,
            'number_of_questionmark_in_url': number_of_questionmark_in_url,
            'number_of_equal_in_url': number_of_equal_in_url,
            'number_of_at_in_url': number_of_at_in_url,
            'number_of_dollar_in_url': number_of_dollar_in_url,
            'number_of_exclamation_in_url': number_of_exclamation_in_url,
            'number_of_hashtag_in_url': number_of_hashtag_in_url,
            'number_of_percent_in_url': number_of_percent_in_url,
            'domain_length': domain_length,
            'number_of_dots_in_domain': number_of_dots_in_domain,
            'number_of_hyphens_in_domain': number_of_hyphens_in_domain,
            'having_special_characters_in_domain': having_special_characters_in_domain,
            'number_of_special_characters_in_domain': number_of_special_characters_in_domain,
            'having_digits_in_domain': having_digits_in_domain,
            'number_of_digits_in_domain': number_of_digits_in_domain,
            'having_repeated_digits_in_domain': having_repeated_digits_in_domain,
            'number_of_subdomains': number_of_subdomains,
            'subdomain_depth': subdomain_depth,
            'having_hyphen_in_subdomain': having_hyphen_in_subdomain,
            'average_subdomain_length': average_subdomain_length,
            'average_number_of_hyphens_in_subdomain': average_number_of_hyphens_in_subdomain,
            'having_special_characters_in_subdomain': having_special_characters_in_subdomain,
            'number_of_special_characters_in_subdomain': number_of_special_characters_in_subdomain,
            'having_digits_in_subdomain': having_digits_in_subdomain,
            'number_of_digits_in_subdomain': number_of_digits_in_subdomain,
            'having_repeated_digits_in_subdomain': having_repeated_digits_in_subdomain,
            'having_path': having_path,
            'path_length': path_length,
            'having_query': having_query,
            'having_fragment': having_fragment,
            'having_anchor': having_anchor,
            'entropy_of_url': entropy_of_url,
            'entropy_of_domain': entropy_of_domain,
            # New 6 hybrid features
            'has_suspicious_tld': has_suspicious_tld,
            'is_known_safe_sld': is_known_safe_sld,
            'consonant_vowel_ratio': consonant_vowel_ratio,
            'longest_digit_sequence': longest_digit_seq,
            'digit_letter_ratio': digit_letter_ratio,
            'path_to_url_ratio': path_to_url_ratio,
        }
    except Exception as e:
        # Surface any parsing failure as a client error (bad URL input).
        raise HTTPException(
            status_code=400,
            detail=f"Feature extraction failed: {str(e)}"
        )
# -- 7. ENDPOINTS ---------------------------------------------
# Fix: the route decorator was missing, so the endpoint documented in the
# file header (GET /) was never registered with the app.
@app.get("/")
def health_check():
    """Health check: confirms the API is up and reports model metadata."""
    # NOTE(review): separator glyphs in the original strings were mojibake
    # ("β"); restored as plain dashes -- confirm intended characters.
    return {
        "status": "running",
        "model": "PhishGuard v5",
        "features": len(features),   # size of the training feature list
        "version": "5.0.0",
        "message": "PhishGuard v5 API is live - 48 hybrid features!",
        "author": "Uzman Zahid - Dublin Business School 2026",
        "docs": "/docs",
    }
# Fix: the route decorator was missing, so POST /predict (documented in the
# file header) was never registered with the app.
@app.post("/predict", response_model=PredictionResponse)
def predict(request: URLRequest):
    """
    Predict whether a URL is phishing or legitimate.

    Returns prediction (0=legitimate, 1=phishing), label,
    confidence score, and a user-facing message.

    Raises HTTPException 400 on empty URL or feature-extraction failure.
    """
    url = request.url.strip()
    if not url:
        raise HTTPException(
            status_code=400,
            detail="URL cannot be empty"
        )
    feat_dict = extract_features(url)
    # Reindex by the training feature order before predicting.
    X = pd.DataFrame([feat_dict])[features]
    prediction = int(model.predict(X)[0])
    probability = model.predict_proba(X)[0]
    # Confidence = probability of the winning class.
    confidence = float(round(max(probability), 4))
    label = "phishing" if prediction == 1 else "legitimate"
    # NOTE(review): leading symbols were mojibake ("β οΈ" / "β");
    # restored as warning/check emoji -- confirm intended glyphs.
    message = (
        f"⚠️ WARNING: This URL appears to be PHISHING! "
        f"({confidence*100:.1f}% confidence)"
        if prediction == 1 else
        f"✅ This URL appears to be LEGITIMATE. "
        f"({confidence*100:.1f}% confidence)"
    )
    return PredictionResponse(
        url=url,
        prediction=prediction,
        label=label,
        confidence=confidence,
        message=message,
    )
# Fix: the route decorator was missing, so POST /predict/batch (documented
# in the file header) was never registered with the app.
@app.post("/predict/batch")
def predict_batch(urls: list[str]):
    """
    Predict multiple URLs at once.

    Accepts a JSON array of URL strings. Returns per-URL predictions with
    label and confidence; a URL that fails extraction yields an "error"
    entry instead of aborting the whole batch.
    """
    results = []
    for url in urls:
        try:
            feat_dict = extract_features(url.strip())
            X = pd.DataFrame([feat_dict])[features]
            prediction = int(model.predict(X)[0])
            probability = model.predict_proba(X)[0]
            confidence = float(round(max(probability), 4))
            results.append({
                "url": url,
                "prediction": prediction,
                "label": "phishing" if prediction == 1 else "legitimate",
                "confidence": confidence,
            })
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            results.append({
                "url": url,
                "error": str(e),
            })
    return {
        "results": results,
        "total": len(results),
        "model": "PhishGuard v5",
    }