# ============================================================
# PhishGuard v5 — HuggingFace Spaces Deployment
# File   : app.py
# Port   : 7860 (required by HuggingFace)
#
# Endpoints:
#   GET  /              → health check
#   POST /predict       → predict if URL is phishing or legitimate
#   POST /predict/batch → predict multiple URLs at once
#
# Features: 48 hybrid features
#   42 structural + 2 knowledge-based + 4 pure structural
#
# Author : Uzman Zahid
#          Dublin Business School — 2026
# ============================================================
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import joblib
import json
import re
import math
import pandas as pd
import os

# ── 1. INITIALISE APP ────────────────────────────────────────
app = FastAPI(
    title="PhishGuard v5 API",
    description=(
        "Real-time phishing URL detection using Machine Learning — "
        "48 hybrid features — Uzman Zahid, Dublin Business School 2026"
    ),
    version="5.0.0",
)

# ── 2. CORS ──────────────────────────────────────────────────
# Wide open on purpose: demo frontends / browser extensions on any
# origin must be able to call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ── 3. LOAD MODEL ────────────────────────────────────────────
# Prefer the compressed artefact when present (smaller Space image);
# fall back to the plain pickle otherwise.
print("Loading PhishGuard v5 model...")
model_path = (
    "model_compressed.pkl"
    if os.path.exists("model_compressed.pkl")
    else "model.pkl"
)
try:
    model = joblib.load(model_path)
    print(f"✅ Model loaded from {model_path}")
except Exception as e:
    # Fail fast: the API is useless without a model, so re-raise
    # after logging — the Space will show the error and restart.
    print(f"❌ Error loading model: {e}")
    raise

try:
    # features.json holds the exact, ordered feature-column list the
    # model was trained on. Explicit UTF-8 so behaviour does not
    # depend on the container's locale (was previously unspecified).
    with open("features.json", encoding="utf-8") as f:
        features = json.load(f)
    print(f"✅ Features loaded: {len(features)} features")
except Exception as e:
    print(f"❌ Error loading features: {e}")
    raise
# ── 4. KNOWLEDGE LOOKUP TABLES ───────────────────────────────
# Based on APWG Phishing Activity Trends Reports (2016-2024):
# TLDs with documented high abuse rates.
SUSPICIOUS_TLDS = {
    'tk', 'ml', 'ga', 'cf', 'gq', 'xyz', 'top', 'club', 'online',
    'site', 'fun', 'icu', 'vip', 'cyou', 'lat', 'space', 'live',
    'pw', 'cc', 'su', 'ws', 'bz', 'name', 'mobi', 'link', 'click',
    'download', 'loan', 'win', 'racing', 'stream', 'trade', 'review',
    'accountant', 'science', 'work', 'party', 'faith', 'date',
    'cricket', 'ninja', 'bid', 'webcam', 'rocks', 'country',
}

# Well-established legitimate second-level domains.
KNOWN_SAFE_SLDS = {
    'google', 'microsoft', 'apple', 'amazon', 'facebook', 'youtube',
    'netflix', 'github', 'stackoverflow', 'wikipedia', 'twitter',
    'linkedin', 'instagram', 'spotify', 'stripe', 'paypal', 'dropbox',
    'slack', 'zoom', 'cloudflare', 'netlify', 'vercel', 'heroku',
    'reddit', 'pinterest', 'tiktok', 'discord', 'notion', 'figma',
    'canva', 'shopify', 'wordpress', 'adobe', 'salesforce', 'hubspot',
    'openai', 'anthropic', 'huggingface', 'kaggle', 'coursera',
    'udemy', 'edx', 'khanacademy', 'duolingo', 'bbc', 'cnn',
    'reuters', 'bloomberg', 'techcrunch', 'mit', 'stanford',
    'harvard', 'oxford', 'cambridge', 'dbs', 'ucd', 'tcd', 'dcu',
    'npmjs', 'pypi', 'kubernetes', 'mozilla', 'docker', 'spinbot',
    'ilovepdf', 'smallpdf', 'mp3cut', 'convertio', 'tinypng',
    'grammarly', 'wise', 'revolut', 'coinbase', 'binance', 'vimeo',
    'twitch', 'dailymotion', 'soundcloud', 'gov', 'nasa',
    'digitalocean',
}

# Country codes whose registries use compound second-level TLDs
# (e.g. "co.uk"). Hoisted to module level so the set is built once,
# not on every call to extract_features().
_CC_SECOND_LEVEL = {'uk', 'au', 'in', 'jp', 'nz', 'za', 'br', 'sg', 'ie', 'pk'}


# ── 5. REQUEST / RESPONSE MODELS ─────────────────────────────
class URLRequest(BaseModel):
    # Raw URL to classify; any format accepted (scheme optional).
    url: str


class PredictionResponse(BaseModel):
    url: str
    prediction: int   # 0 = legitimate, 1 = phishing
    label: str        # "legitimate" | "phishing"
    confidence: float  # winning-class probability, rounded to 4 dp
    message: str      # human-readable verdict


# ── 6. FEATURE EXTRACTION (48 HYBRID FEATURES) ───────────────
def _shannon_entropy(text):
    """Shannon entropy of *text* in bits per character (0.0 if empty)."""
    if not text:
        return 0.0
    probs = [text.count(c) / len(text) for c in set(text)]
    return -sum(p * math.log2(p) for p in probs if p > 0)


def extract_features(raw_url):
    """
    Extract 48 hybrid features from any URL format.

    42 structural + 2 knowledge-based + 4 pure structural.
    Deliberately string-based (no urllib) so the output stays
    identical to the feature extraction used during training —
    do not "fix" parsing quirks without retraining the model.

    Parameters
    ----------
    raw_url : str
        URL in any format (scheme and "www." optional).

    Returns
    -------
    dict
        48 entries, feature name -> numeric value.

    Raises
    ------
    HTTPException
        400 if feature extraction fails for any reason.
    """
    try:
        url = str(raw_url).strip()

        # ── Scheme ────────────────────────────────────────────
        lowered = url.lower()
        has_https = 1 if lowered.startswith('https://') else 0
        has_http = 1 if lowered.startswith('http://') else 0

        # ── Parse components ──────────────────────────────────
        url_no_scheme = url.split('://', 1)[1] if '://' in url else url
        domain_with_port = (
            url_no_scheme.split('/')[0].split('?')[0].split('#')[0]
        )
        domain_clean = domain_with_port.split(':')[0]
        parts = url_no_scheme.split('/', 1)
        # NOTE: path keeps any trailing query/fragment text — this
        # matches the training-time extraction, so it stays as-is.
        path = '/' + parts[1] if len(parts) > 1 else ''
        query = url.split('?', 1)[1].split('#')[0] if '?' in url else ''
        fragment = url.split('#', 1)[1] if '#' in url else ''

        # Strip a leading "www." for domain-level features...
        domain_no_www = domain_clean
        if domain_no_www.lower().startswith('www.'):
            domain_no_www = domain_no_www[4:]
        # ...but force "www." back on for subdomain counting so that
        # "example.com" and "www.example.com" featurise identically.
        if domain_clean.lower().startswith('www.'):
            domain_normalised = domain_clean
        else:
            domain_normalised = 'www.' + domain_clean
        domain_parts_full = domain_normalised.split('.')

        # ── TLD and SLD ───────────────────────────────────────
        domain_parts = domain_no_www.split('.')
        tld = domain_parts[-1].lower() if domain_parts else ''
        sld = domain_parts[-2].lower() if len(domain_parts) >= 2 else ''
        # Compound country-code TLDs such as "co.uk".
        # FIX: compare case-insensitively — previously a mixed-case
        # host like "site.CO.UK" skipped this branch entirely.
        if len(domain_parts) >= 3 and domain_parts[-1].lower() in _CC_SECOND_LEVEL:
            tld = f"{domain_parts[-2]}.{domain_parts[-1]}".lower()
            sld = domain_parts[-3].lower()

        # ── URL-level features ────────────────────────────────
        url_length = len(url)
        number_of_dots_in_url = url.count('.')
        digits_in_url = re.findall(r'\d', url)
        # "Repeated digits" = some digit character occurs twice.
        having_repeated_digits_in_url = (
            1 if digits_in_url
            and len(digits_in_url) != len(set(digits_in_url)) else 0
        )
        number_of_digits_in_url = sum(c.isdigit() for c in url)
        number_of_special_char_in_url = sum(not c.isalnum() for c in url)
        number_of_hyphens_in_url = url.count('-')
        number_of_underline_in_url = url.count('_')
        number_of_slash_in_url = url.count('/')
        number_of_questionmark_in_url = url.count('?')
        number_of_equal_in_url = url.count('=')
        number_of_at_in_url = url.count('@')
        number_of_dollar_in_url = url.count('$')
        number_of_exclamation_in_url = url.count('!')
        number_of_hashtag_in_url = url.count('#')
        number_of_percent_in_url = url.count('%')

        # ── Domain-level features ─────────────────────────────
        domain_length = len(domain_no_www)
        number_of_dots_in_domain = domain_no_www.count('.')
        number_of_hyphens_in_domain = domain_no_www.count('-')
        having_special_characters_in_domain = (
            1 if re.search(r'[^a-zA-Z0-9\.\-]', domain_no_www) else 0
        )
        number_of_special_characters_in_domain = sum(
            not c.isalnum() and c not in '.-' for c in domain_no_www
        )
        having_digits_in_domain = (
            1 if any(c.isdigit() for c in domain_no_www) else 0
        )
        number_of_digits_in_domain = sum(
            c.isdigit() for c in domain_no_www
        )
        digits_in_domain = re.findall(r'\d', domain_no_www)
        having_repeated_digits_in_domain = (
            1 if digits_in_domain
            and len(digits_in_domain) != len(set(digits_in_domain)) else 0
        )

        # ── Subdomain features (on the www-normalised host) ───
        # Because "www." is forced on above, a bare "example.com"
        # still counts one subdomain ("www") — training-time quirk.
        number_of_subdomains = max(0, len(domain_parts_full) - 2)
        subdomains = (
            domain_parts_full[:-2] if len(domain_parts_full) > 2 else []
        )
        subdomain_depth = len(subdomains)
        having_hyphen_in_subdomain = (
            1 if any('-' in s for s in subdomains) else 0
        )
        average_subdomain_length = (
            sum(len(s) for s in subdomains) / len(subdomains)
            if subdomains else 0.0
        )
        average_number_of_hyphens_in_subdomain = (
            sum(s.count('-') for s in subdomains) / len(subdomains)
            if subdomains else 0.0
        )
        having_special_characters_in_subdomain = (
            1 if any(re.search(r'[^a-zA-Z0-9\-]', s) for s in subdomains)
            else 0
        )
        number_of_special_characters_in_subdomain = sum(
            sum(not c.isalnum() and c != '-' for c in s)
            for s in subdomains
        )
        having_digits_in_subdomain = (
            1 if any(any(c.isdigit() for c in s) for s in subdomains)
            else 0
        )
        number_of_digits_in_subdomain = sum(
            sum(c.isdigit() for c in s) for s in subdomains
        )
        all_sub_digits = re.findall(r'\d', ''.join(subdomains))
        having_repeated_digits_in_subdomain = (
            1 if all_sub_digits
            and len(all_sub_digits) != len(set(all_sub_digits)) else 0
        )

        # ── Path / query features ─────────────────────────────
        having_path = 1 if len(path) > 1 else 0
        # NOTE: path_length counts path *segments*, not characters
        # (matches training).
        path_length = len([p for p in path.split('/') if p])
        having_query = 1 if len(query) > 0 else 0
        having_fragment = 1 if len(fragment) > 0 else 0
        having_anchor = 1 if '#' in url else 0

        # ── Entropy features ──────────────────────────────────
        entropy_of_url = _shannon_entropy(url)
        entropy_of_domain = _shannon_entropy(domain_no_www)

        # ── Feature 43: has_suspicious_tld ────────────────────
        # Knowledge-based: APWG documented high-abuse TLDs.
        has_suspicious_tld = 1 if tld.lower() in SUSPICIOUS_TLDS else 0

        # ── Feature 44: is_known_safe_sld ─────────────────────
        # Knowledge-based: established legitimate platforms.
        is_known_safe_sld = 1 if sld.lower() in KNOWN_SAFE_SLDS else 0

        # ── Feature 45: consonant_vowel_ratio ─────────────────
        # Pure structural: unnatural domain names = phishing signal.
        # +1 denominator avoids division by zero for vowel-free names.
        vowels = set('aeiouAEIOU')
        letters = [c for c in domain_no_www if c.isalpha()]
        vowel_count = sum(1 for c in letters if c in vowels)
        consonant_count = len(letters) - vowel_count
        consonant_vowel_ratio = round(
            consonant_count / (vowel_count + 1), 4)

        # ── Feature 46: longest_digit_sequence ────────────────
        # Pure structural: long digit runs indicate random generation.
        longest_digit_seq = max(
            (len(s) for s in re.findall(r'\d+', domain_no_www)),
            default=0)

        # ── Feature 47: digit_letter_ratio ────────────────────
        # Pure structural: digit-heavy domains = phishing signal.
        alpha_count = sum(c.isalpha() for c in domain_no_www)
        digit_count = sum(c.isdigit() for c in domain_no_www)
        digit_letter_ratio = round(
            digit_count / (alpha_count + 1), 4)

        # ── Feature 48: path_to_url_ratio ─────────────────────
        # Pure structural: bare phishing domains have ratio = 0.
        path_to_url_ratio = (
            round(len(path) / len(url), 4) if len(url) > 0 else 0.0
        )

        return {
            'has_https': has_https,
            'has_http': has_http,
            'url_length': url_length,
            'number_of_dots_in_url': number_of_dots_in_url,
            'having_repeated_digits_in_url': having_repeated_digits_in_url,
            'number_of_digits_in_url': number_of_digits_in_url,
            'number_of_special_char_in_url': number_of_special_char_in_url,
            'number_of_hyphens_in_url': number_of_hyphens_in_url,
            'number_of_underline_in_url': number_of_underline_in_url,
            'number_of_slash_in_url': number_of_slash_in_url,
            'number_of_questionmark_in_url': number_of_questionmark_in_url,
            'number_of_equal_in_url': number_of_equal_in_url,
            'number_of_at_in_url': number_of_at_in_url,
            'number_of_dollar_in_url': number_of_dollar_in_url,
            'number_of_exclamation_in_url': number_of_exclamation_in_url,
            'number_of_hashtag_in_url': number_of_hashtag_in_url,
            'number_of_percent_in_url': number_of_percent_in_url,
            'domain_length': domain_length,
            'number_of_dots_in_domain': number_of_dots_in_domain,
            'number_of_hyphens_in_domain': number_of_hyphens_in_domain,
            'having_special_characters_in_domain': having_special_characters_in_domain,
            'number_of_special_characters_in_domain': number_of_special_characters_in_domain,
            'having_digits_in_domain': having_digits_in_domain,
            'number_of_digits_in_domain': number_of_digits_in_domain,
            'having_repeated_digits_in_domain': having_repeated_digits_in_domain,
            'number_of_subdomains': number_of_subdomains,
            'subdomain_depth': subdomain_depth,
            'having_hyphen_in_subdomain': having_hyphen_in_subdomain,
            'average_subdomain_length': average_subdomain_length,
            'average_number_of_hyphens_in_subdomain': average_number_of_hyphens_in_subdomain,
            'having_special_characters_in_subdomain': having_special_characters_in_subdomain,
            'number_of_special_characters_in_subdomain': number_of_special_characters_in_subdomain,
            'having_digits_in_subdomain': having_digits_in_subdomain,
            'number_of_digits_in_subdomain': number_of_digits_in_subdomain,
            'having_repeated_digits_in_subdomain': having_repeated_digits_in_subdomain,
            'having_path': having_path,
            'path_length': path_length,
            'having_query': having_query,
            'having_fragment': having_fragment,
            'having_anchor': having_anchor,
            'entropy_of_url': entropy_of_url,
            'entropy_of_domain': entropy_of_domain,
            # New 6 hybrid features
            'has_suspicious_tld': has_suspicious_tld,
            'is_known_safe_sld': is_known_safe_sld,
            'consonant_vowel_ratio': consonant_vowel_ratio,
            'longest_digit_sequence': longest_digit_seq,
            'digit_letter_ratio': digit_letter_ratio,
            'path_to_url_ratio': path_to_url_ratio,
        }
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Feature extraction failed: {str(e)}"
        )


# ── 7. ENDPOINTS ─────────────────────────────────────────────
def _classify(url):
    """
    Score one URL with the loaded model.

    Shared by /predict and /predict/batch (the logic was previously
    duplicated in both handlers).

    Returns
    -------
    tuple[int, str, float]
        (prediction, label, confidence): prediction is 0/1, label the
        matching string, confidence the winning-class probability
        rounded to 4 decimal places.
    """
    feat_dict = extract_features(url)
    # Re-order columns to the exact feature order the model expects.
    X = pd.DataFrame([feat_dict])[features]
    prediction = int(model.predict(X)[0])
    confidence = float(round(max(model.predict_proba(X)[0]), 4))
    label = "phishing" if prediction == 1 else "legitimate"
    return prediction, label, confidence


@app.get("/")
def health_check():
    """Liveness probe plus basic model metadata."""
    return {
        "status": "running",
        "model": "PhishGuard v5",
        "features": len(features),
        "version": "5.0.0",
        "message": "PhishGuard v5 API is live — 48 hybrid features!",
        "author": "Uzman Zahid — Dublin Business School 2026",
        "docs": "/docs",
    }


@app.post("/predict", response_model=PredictionResponse)
def predict(request: URLRequest):
    """
    Predict whether a URL is phishing or legitimate.

    Returns prediction (0=legitimate, 1=phishing), label,
    confidence score, and message.

    Raises HTTPException 400 on empty input or extraction failure.
    """
    url = request.url.strip()
    if not url:
        raise HTTPException(
            status_code=400,
            detail="URL cannot be empty"
        )

    prediction, label, confidence = _classify(url)
    if prediction == 1:
        message = (
            f"⚠️ WARNING: This URL appears to be PHISHING! "
            f"({confidence*100:.1f}% confidence)"
        )
    else:
        message = (
            f"✅ This URL appears to be LEGITIMATE. "
            f"({confidence*100:.1f}% confidence)"
        )
    return PredictionResponse(
        url=url,
        prediction=prediction,
        label=label,
        confidence=confidence,
        message=message,
    )


@app.post("/predict/batch")
def predict_batch(urls: list[str]):
    """
    Predict multiple URLs at once.

    Accepts a JSON list of URL strings. Returns one entry per input
    URL, in order: either a prediction record or an "error" record.
    """
    results = []
    for url in urls:
        try:
            prediction, label, confidence = _classify(url.strip())
            results.append({
                "url": url,
                "prediction": prediction,
                "label": label,
                "confidence": confidence,
            })
        except Exception as e:
            # Best-effort: one bad URL must not fail the whole batch.
            results.append({"url": url, "error": str(e)})
    return {
        "results": results,
        "total": len(results),
        "model": "PhishGuard v5",
    }