# Phish_Guard / app.py
# (HuggingFace Hub page header captured with the file — uploader
#  "Uzmann", commit "new model trained", 3ad73e3; not program code.)
# ============================================================
# PhishGuard v5 β€” HuggingFace Spaces Deployment
# File : app.py
# Port : 7860 (required by HuggingFace)
#
# Endpoints:
# GET / β†’ health check
# POST /predict β†’ predict if URL is phishing or legitimate
# POST /predict/batch β†’ predict multiple URLs at once
#
# Features: 48 hybrid features
# 42 structural + 2 knowledge-based + 4 pure structural
#
# Author : Uzman Zahid
# Dublin Business School β€” 2026
# ============================================================
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import joblib
import json
import re
import math
import pandas as pd
import os
# ── 1. INITIALISE APP ────────────────────────────────────────
# FastAPI application object; metadata here feeds the auto-generated /docs UI.
app = FastAPI(
    title = "PhishGuard v5 API",
    description = "Real-time phishing URL detection using Machine Learning β€” 48 hybrid features β€” Uzman Zahid, Dublin Business School 2026",
    version = "5.0.0"
)
# ── 2. CORS ───────────────────────────────────────────────────
# Wide-open CORS so any frontend (extension, demo page) can call the API.
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True for credentialed requests — confirm whether
# credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins = ["*"],       # any origin
    allow_credentials = True,
    allow_methods = ["*"],       # any HTTP method
    allow_headers = ["*"],       # any request header
)
# ── 3. LOAD MODEL ────────────────────────────────────────────
# Load the trained classifier and the ordered feature-name list at import
# time so the Space fails fast if either artifact is missing.
print("Loading PhishGuard v5 model...")

# Prefer the compressed artifact when it was shipped with the Space.
model_path = "model.pkl"
if os.path.exists("model_compressed.pkl"):
    model_path = "model_compressed.pkl"

try:
    model = joblib.load(model_path)
except Exception as e:
    print(f"❌ Error loading model: {e}")
    raise
print(f"βœ… Model loaded from {model_path}")

try:
    with open("features.json") as f:
        features = json.load(f)
except Exception as e:
    print(f"❌ Error loading features: {e}")
    raise
print(f"βœ… Features loaded: {len(features)} features")
# ── 4. KNOWLEDGE LOOKUP TABLES ───────────────────────────────
# Based on APWG Phishing Activity Trends Reports (2016-2024)
# TLDs with documented high phishing-abuse rates; membership feeds the
# `has_suspicious_tld` feature (compared lowercase).
SUSPICIOUS_TLDS = {
    'tk', 'ml', 'ga', 'cf', 'gq',
    'xyz', 'top', 'club', 'online', 'site',
    'fun', 'icu', 'vip', 'cyou', 'lat',
    'space', 'live', 'pw', 'cc', 'su',
    'ws', 'bz', 'name', 'mobi', 'link',
    'click', 'download', 'loan', 'win',
    'racing', 'stream', 'trade', 'review',
    'accountant', 'science', 'work', 'party',
    'faith', 'date', 'cricket', 'ninja',
    'bid', 'webcam', 'rocks', 'country',
}
# Well-established legitimate second-level domains
# Membership feeds the `is_known_safe_sld` feature (compared lowercase).
# NOTE(review): matches on SLD only, so e.g. "paypal.evil-tld.xyz" would
# still match "paypal" here — the model weighs this alongside other features.
KNOWN_SAFE_SLDS = {
    'google', 'microsoft', 'apple', 'amazon',
    'facebook', 'youtube', 'netflix', 'github',
    'stackoverflow', 'wikipedia', 'twitter',
    'linkedin', 'instagram', 'spotify', 'stripe',
    'paypal', 'dropbox', 'slack', 'zoom',
    'cloudflare', 'netlify', 'vercel', 'heroku',
    'reddit', 'pinterest', 'tiktok', 'discord',
    'notion', 'figma', 'canva', 'shopify',
    'wordpress', 'adobe', 'salesforce', 'hubspot',
    'openai', 'anthropic', 'huggingface', 'kaggle',
    'coursera', 'udemy', 'edx', 'khanacademy',
    'duolingo', 'bbc', 'cnn', 'reuters', 'bloomberg',
    'techcrunch', 'mit', 'stanford', 'harvard',
    'oxford', 'cambridge', 'dbs', 'ucd', 'tcd', 'dcu',
    'npmjs', 'pypi', 'kubernetes', 'mozilla', 'docker',
    'spinbot', 'ilovepdf', 'smallpdf', 'mp3cut',
    'convertio', 'tinypng', 'grammarly',
    'wise', 'revolut', 'coinbase', 'binance',
    'vimeo', 'twitch', 'dailymotion', 'soundcloud',
    'gov', 'nasa', 'digitalocean',
}
# ── 5. REQUEST / RESPONSE MODELS ─────────────────────────────
class URLRequest(BaseModel):
    """Request body for POST /predict: the single URL to classify."""
    url: str  # raw URL; scheme optional, stripped of whitespace before use
class PredictionResponse(BaseModel):
    """Response body for POST /predict."""
    url : str           # echo of the (stripped) input URL
    prediction : int    # 0 = legitimate, 1 = phishing
    label : str         # "legitimate" or "phishing"
    confidence : float  # winning-class probability, rounded to 4 decimals
    message : str       # human-readable verdict with confidence percentage
# ── 6. FEATURE EXTRACTION (48 HYBRID FEATURES) ───────────────
def extract_features(raw_url):
    """
    Extract the 48 hybrid features used by the PhishGuard v5 model.

    Breakdown: 42 structural + 2 knowledge-based (suspicious-TLD and
    known-safe-SLD lookups) + 4 pure structural ratios.  The logic must
    mirror the training-time extraction, otherwise the model receives a
    skewed feature space.

    Parameters
    ----------
    raw_url : str
        URL in any format; scheme and "www." prefix are optional.

    Returns
    -------
    dict
        Mapping of feature name -> numeric value (48 entries), in the
        same namespace as features.json.

    Raises
    ------
    HTTPException
        400 when the URL cannot be parsed into features.
    """
    try:
        url = str(raw_url).strip()

        # ── Scheme ────────────────────────────────────────────
        has_https = 1 if url.lower().startswith('https://') else 0
        has_http = 1 if url.lower().startswith('http://') else 0

        # ── Parse components ──────────────────────────────────
        url_no_scheme = url
        if '://' in url:
            url_no_scheme = url.split('://', 1)[1]
        domain_with_port = url_no_scheme.split('/')[0].split('?')[0].split('#')[0]
        domain_clean = domain_with_port.split(':')[0]
        parts = url_no_scheme.split('/', 1)
        # NOTE: "path" intentionally keeps any query/fragment text after the
        # first slash — this matches the training extraction.
        path = '/' + parts[1] if len(parts) > 1 else ''
        query = url.split('?', 1)[1].split('#')[0] if '?' in url else ''
        fragment = url.split('#', 1)[1] if '#' in url else ''

        domain_no_www = domain_clean
        if domain_no_www.lower().startswith('www.'):
            domain_no_www = domain_no_www[4:]
        # Normalised form always carries a "www." prefix so subdomain
        # counting is consistent however the URL was typed.
        if domain_clean.lower().startswith('www.'):
            domain_normalised = domain_clean
        else:
            domain_normalised = 'www.' + domain_clean
        domain_parts_full = domain_normalised.split('.')

        # ── TLD and SLD ───────────────────────────────────────
        domain_parts = domain_no_www.split('.')
        tld = domain_parts[-1].lower() if domain_parts else ''
        sld = domain_parts[-2].lower() if len(domain_parts) >= 2 else ''
        # Two-level country-code TLDs (e.g. "co.uk") shift the SLD one
        # label left.  BUGFIX: compare case-insensitively — previously
        # "EXAMPLE.CO.UK" missed this branch because the raw label was
        # checked against lowercase codes.
        if len(domain_parts) >= 3 and domain_parts[-1].lower() in [
            'uk', 'au', 'in', 'jp', 'nz', 'za', 'br', 'sg', 'ie', 'pk'
        ]:
            tld = f"{domain_parts[-2]}.{domain_parts[-1]}".lower()
            sld = domain_parts[-3].lower() if len(domain_parts) >= 3 else ''

        # ── URL features ──────────────────────────────────────
        url_length = len(url)
        number_of_dots_in_url = url.count('.')
        digits_in_url = re.findall(r'\d', url)
        # "Repeated" = at least one digit character occurs more than once.
        having_repeated_digits_in_url = 1 if len(digits_in_url) != len(
            set(digits_in_url)) and len(digits_in_url) > 0 else 0
        number_of_digits_in_url = sum(c.isdigit() for c in url)
        number_of_special_char_in_url = sum(not c.isalnum() for c in url)
        number_of_hyphens_in_url = url.count('-')
        number_of_underline_in_url = url.count('_')
        number_of_slash_in_url = url.count('/')
        number_of_questionmark_in_url = url.count('?')
        number_of_equal_in_url = url.count('=')
        number_of_at_in_url = url.count('@')
        number_of_dollar_in_url = url.count('$')
        number_of_exclamation_in_url = url.count('!')
        number_of_hashtag_in_url = url.count('#')
        number_of_percent_in_url = url.count('%')

        # ── Domain features ───────────────────────────────────
        domain_length = len(domain_no_www)
        number_of_dots_in_domain = domain_no_www.count('.')
        number_of_hyphens_in_domain = domain_no_www.count('-')
        # Dots and hyphens are legal in hostnames; anything else is "special".
        having_special_characters_in_domain = 1 if re.search(
            r'[^a-zA-Z0-9\.\-]', domain_no_www) else 0
        number_of_special_characters_in_domain = sum(
            not c.isalnum() and c not in '.-' for c in domain_no_www)
        having_digits_in_domain = 1 if any(
            c.isdigit() for c in domain_no_www) else 0
        number_of_digits_in_domain = sum(
            c.isdigit() for c in domain_no_www)
        digits_in_domain = re.findall(r'\d', domain_no_www)
        having_repeated_digits_in_domain = 1 if len(
            digits_in_domain) != len(set(digits_in_domain)) and \
            len(digits_in_domain) > 0 else 0

        # ── Subdomain features ────────────────────────────────
        # Counted against the www-normalised form, so "example.com" and
        # "www.example.com" both yield one subdomain ("www").
        number_of_subdomains = max(0, len(domain_parts_full) - 2)
        subdomains = domain_parts_full[:-2] if len(
            domain_parts_full) > 2 else []
        subdomain_depth = len(subdomains)
        having_hyphen_in_subdomain = 1 if any(
            '-' in s for s in subdomains) else 0
        average_subdomain_length = sum(
            len(s) for s in subdomains) / len(subdomains) \
            if subdomains else 0.0
        average_number_of_hyphens_in_subdomain = sum(
            s.count('-') for s in subdomains) / len(subdomains) \
            if subdomains else 0.0
        having_special_characters_in_subdomain = 1 if any(
            re.search(r'[^a-zA-Z0-9\-]', s)
            for s in subdomains) else 0
        number_of_special_characters_in_subdomain = sum(
            sum(not c.isalnum() and c != '-' for c in s)
            for s in subdomains)
        having_digits_in_subdomain = 1 if any(
            any(c.isdigit() for c in s) for s in subdomains) else 0
        number_of_digits_in_subdomain = sum(
            sum(c.isdigit() for c in s) for s in subdomains)
        all_sub_digits = re.findall(
            r'\d', ''.join(subdomains))
        having_repeated_digits_in_subdomain = 1 if len(
            all_sub_digits) != len(set(all_sub_digits)) and \
            len(all_sub_digits) > 0 else 0

        # ── Path/Query features ───────────────────────────────
        having_path = 1 if len(path) > 1 else 0
        path_segments = [p for p in path.split('/') if p]
        path_length = len(path_segments)  # segment count, not characters
        having_query = 1 if len(query) > 0 else 0
        having_fragment = 1 if len(fragment) > 0 else 0
        having_anchor = 1 if '#' in url else 0

        # ── Entropy features ──────────────────────────────────
        # Shannon entropy over character frequencies (bits/char).
        if len(url) > 0:
            prob_url = [url.count(c)/len(url) for c in set(url)]
            entropy_of_url = -sum(p*math.log2(p) for p in prob_url if p > 0)
        else:
            entropy_of_url = 0.0
        if len(domain_no_www) > 0:
            prob_dom = [domain_no_www.count(c)/len(domain_no_www)
                        for c in set(domain_no_www)]
            entropy_of_domain = -sum(p*math.log2(p) for p in prob_dom if p > 0)
        else:
            entropy_of_domain = 0.0

        # ── NEW FEATURE 43: has_suspicious_tld ────────────────
        # Knowledge-based: APWG documented high-abuse TLDs
        has_suspicious_tld = 1 if tld.lower() in SUSPICIOUS_TLDS else 0
        # ── NEW FEATURE 44: is_known_safe_sld ─────────────────
        # Knowledge-based: established legitimate platforms
        is_known_safe_sld = 1 if sld.lower() in KNOWN_SAFE_SLDS else 0
        # ── NEW FEATURE 45: consonant_vowel_ratio ─────────────
        # Pure structural: unnatural domains = phishing signal
        vowels = set('aeiouAEIOU')
        letters = [c for c in domain_no_www if c.isalpha()]
        vowel_count = sum(1 for c in letters if c in vowels)
        consonant_count = sum(1 for c in letters if c not in vowels)
        consonant_vowel_ratio = round(
            consonant_count / (vowel_count + 1), 4)  # +1 avoids div-by-zero
        # ── NEW FEATURE 46: longest_digit_sequence ────────────
        # Pure structural: digit runs indicate random generation
        digit_sequences = re.findall(r'\d+', domain_no_www)
        longest_digit_seq = max(
            (len(s) for s in digit_sequences), default=0)
        # ── NEW FEATURE 47: digit_letter_ratio ────────────────
        # Pure structural: digit-heavy domains = phishing
        alpha_count = sum(c.isalpha() for c in domain_no_www)
        digit_count = sum(c.isdigit() for c in domain_no_www)
        digit_letter_ratio = round(
            digit_count / (alpha_count + 1), 4)
        # ── NEW FEATURE 48: path_to_url_ratio ─────────────────
        # Pure structural: bare phishing domains have ratio = 0
        path_to_url_ratio = round(
            len(path) / len(url), 4) if len(url) > 0 else 0.0

        return {
            'has_https': has_https,
            'has_http': has_http,
            'url_length': url_length,
            'number_of_dots_in_url': number_of_dots_in_url,
            'having_repeated_digits_in_url': having_repeated_digits_in_url,
            'number_of_digits_in_url': number_of_digits_in_url,
            'number_of_special_char_in_url': number_of_special_char_in_url,
            'number_of_hyphens_in_url': number_of_hyphens_in_url,
            'number_of_underline_in_url': number_of_underline_in_url,
            'number_of_slash_in_url': number_of_slash_in_url,
            'number_of_questionmark_in_url': number_of_questionmark_in_url,
            'number_of_equal_in_url': number_of_equal_in_url,
            'number_of_at_in_url': number_of_at_in_url,
            'number_of_dollar_in_url': number_of_dollar_in_url,
            'number_of_exclamation_in_url': number_of_exclamation_in_url,
            'number_of_hashtag_in_url': number_of_hashtag_in_url,
            'number_of_percent_in_url': number_of_percent_in_url,
            'domain_length': domain_length,
            'number_of_dots_in_domain': number_of_dots_in_domain,
            'number_of_hyphens_in_domain': number_of_hyphens_in_domain,
            'having_special_characters_in_domain': having_special_characters_in_domain,
            'number_of_special_characters_in_domain': number_of_special_characters_in_domain,
            'having_digits_in_domain': having_digits_in_domain,
            'number_of_digits_in_domain': number_of_digits_in_domain,
            'having_repeated_digits_in_domain': having_repeated_digits_in_domain,
            'number_of_subdomains': number_of_subdomains,
            'subdomain_depth': subdomain_depth,
            'having_hyphen_in_subdomain': having_hyphen_in_subdomain,
            'average_subdomain_length': average_subdomain_length,
            'average_number_of_hyphens_in_subdomain': average_number_of_hyphens_in_subdomain,
            'having_special_characters_in_subdomain': having_special_characters_in_subdomain,
            'number_of_special_characters_in_subdomain': number_of_special_characters_in_subdomain,
            'having_digits_in_subdomain': having_digits_in_subdomain,
            'number_of_digits_in_subdomain': number_of_digits_in_subdomain,
            'having_repeated_digits_in_subdomain': having_repeated_digits_in_subdomain,
            'having_path': having_path,
            'path_length': path_length,
            'having_query': having_query,
            'having_fragment': having_fragment,
            'having_anchor': having_anchor,
            'entropy_of_url': entropy_of_url,
            'entropy_of_domain': entropy_of_domain,
            # New 6 hybrid features
            'has_suspicious_tld': has_suspicious_tld,
            'is_known_safe_sld': is_known_safe_sld,
            'consonant_vowel_ratio': consonant_vowel_ratio,
            'longest_digit_sequence': longest_digit_seq,
            'digit_letter_ratio': digit_letter_ratio,
            'path_to_url_ratio': path_to_url_ratio,
        }
    except Exception as e:
        # Surface parsing problems as a client error rather than a 500.
        raise HTTPException(
            status_code=400,
            detail=f"Feature extraction failed: {str(e)}"
        )
# ── 7. ENDPOINTS ─────────────────────────────────────────────
@app.get("/")
def health_check():
    """Liveness probe: report static service metadata and the docs path."""
    payload = {}
    payload["status"] = "running"
    payload["model"] = "PhishGuard v5"
    payload["features"] = len(features)
    payload["version"] = "5.0.0"
    payload["message"] = "PhishGuard v5 API is live β€” 48 hybrid features!"
    payload["author"] = "Uzman Zahid β€” Dublin Business School 2026"
    payload["docs"] = "/docs"
    return payload
@app.post("/predict", response_model=PredictionResponse)
def predict(request: URLRequest):
    """
    Classify a single URL as phishing or legitimate.

    Returns prediction (0=legitimate, 1=phishing), a text label, the
    winning-class confidence, and a human-readable message.
    """
    url = request.url.strip()
    if not url:
        raise HTTPException(
            status_code=400,
            detail="URL cannot be empty"
        )

    # Single-row frame, columns ordered exactly as during training.
    row = pd.DataFrame([extract_features(url)])[features]
    prediction = int(model.predict(row)[0])
    confidence = float(round(max(model.predict_proba(row)[0]), 4))

    if prediction == 1:
        label = "phishing"
        message = (
            f"⚠️ WARNING: This URL appears to be PHISHING! "
            f"({confidence*100:.1f}% confidence)"
        )
    else:
        label = "legitimate"
        message = (
            f"βœ… This URL appears to be LEGITIMATE. "
            f"({confidence*100:.1f}% confidence)"
        )

    return PredictionResponse(
        url=url,
        prediction=prediction,
        label=label,
        confidence=confidence,
        message=message,
    )
@app.post("/predict/batch")
def predict_batch(urls: list[str]):
    """
    Classify several URLs in one request.

    Per-URL failures are reported inline (as an "error" entry) instead of
    aborting the whole batch.
    """
    results = []
    for raw in urls:
        try:
            row = pd.DataFrame([extract_features(raw.strip())])[features]
            pred = int(model.predict(row)[0])
            conf = float(round(max(model.predict_proba(row)[0]), 4))
            verdict = "phishing" if pred == 1 else "legitimate"
            results.append({
                "url": raw,
                "prediction": pred,
                "label": verdict,
                "confidence": conf,
            })
        except Exception as exc:
            # Best-effort batch: record the failure and keep going.
            results.append({
                "url": raw,
                "error": str(exc),
            })
    return {
        "results": results,
        "total": len(results),
        "model": "PhishGuard v5",
    }