# Repository page header (appears to be web-page residue, not program text):
#   phishguard / app.py
#   Uzmann's picture
#   features added
#   704236b
# ============================================================
# PhishGuard — HuggingFace Spaces Deployment
# File : app.py
# Port : 7860 (required by HuggingFace)
#
# Endpoints:
#   GET  /         → health check
#   POST /predict  → predict if URL is phishing
#
# Author : Uzman Zahid
# ============================================================
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import joblib
import json
import re
import math
import pandas as pd
import os
# ── 1. INITIALISE APP ────────────────────────────────────────
# The application object; title, description and version are handed to FastAPI
# as API metadata. The description uses a real em dash (the previous text
# contained a mis-encoded byte sequence in its place).
app = FastAPI(
    title="PhishGuard API",
    description=(
        "Real-time phishing URL detection using Machine Learning — "
        "Uzman Zahid, Dublin Business School 2026"
    ),
    version="3.0.0",
)
# ── 2. CORS — Allow Chrome Extension to call API ─────────────
# Every origin, method and header is accepted so the browser extension can
# reach the API from any page. Credentials are disabled on purpose: a wildcard
# origin must not be combined with credentialed requests, and the endpoints
# visible in this file read no cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ── 3. LOAD MODEL ────────────────────────────────────────────
print("Loading PhishGuard model...")
# Prefer the compressed artefact when it is present; otherwise fall back to
# the original model file.
# NOTE: joblib.load unpickles the file, so only trusted model files may be
# placed at these paths.
model_path = "model_compressed.pkl" if os.path.exists("model_compressed.pkl") else "model.pkl"
try:
    model = joblib.load(model_path)
    print(f"✅ Model loaded from {model_path}")
except Exception as e:
    # Report the failure, then re-raise so the module import fails instead of
    # continuing without a model.
    print(f"❌ Error loading model: {e}")
    raise
# Load the feature list from features.json. It is later used to select the
# DataFrame columns handed to the model. The encoding is given explicitly so
# the file is not decoded with whatever the platform's locale default is.
try:
    with open("features.json", encoding="utf-8") as f:
        features = json.load(f)
    print(f"✅ Features loaded: {len(features)} features")
except Exception as e:
    # Report the failure, then re-raise so the module import fails instead of
    # continuing without the feature list.
    print(f"❌ Error loading features: {e}")
    raise
# ── 4. REQUEST / RESPONSE MODELS ────────────────────────────
# Request body for POST /predict.
class URLRequest(BaseModel):
    # URL to classify; the endpoint strips surrounding whitespace and rejects
    # the request when nothing is left.
    url: str
# Response body returned by POST /predict.
class PredictionResponse(BaseModel):
    # The submitted URL after whitespace stripping.
    url : str
    # Model output cast to int; the endpoint treats 1 as phishing.
    prediction : int
    # "phishing" when prediction == 1, otherwise "legitimate".
    label : str
    # Largest class probability reported by the model, rounded to 4 decimals.
    confidence : float
    # Human-readable summary of the verdict, including the confidence percentage.
    message : str
# ── 5. FEATURE EXTRACTION ────────────────────────────────────
def _has_repeated_digits(text):
    """Return 1 if any regex-digit character occurs more than once in *text*, else 0."""
    found = re.findall(r'\d', text)
    return 1 if len(set(found)) < len(found) else 0


def _shannon_entropy(text):
    """Return the character-level Shannon entropy of *text* in bits (0.0 when empty)."""
    if not text:
        return 0.0
    size = len(text)
    shares = [text.count(ch) / size for ch in set(text)]
    return -sum(share * math.log2(share) for share in shares if share > 0)


def extract_features(raw_url):
    """
    Build the 42-entry feature dictionary for one URL.

    The URL is taken apart with plain string operations (no URL parser), and
    the result maps feature names to ints/floats describing the whole URL,
    its domain (leading "www." removed), its subdomain labels, and its
    path/query/fragment, plus character entropies.

    The original author notes that this mirrors the extraction used when the
    model was trained, so the feature definitions are kept exactly as they were.

    Args:
        raw_url: Any value; it is converted with str() and stripped.

    Returns:
        dict: feature name -> numeric value, in a fixed key order.

    Raises:
        HTTPException: status 400 if anything fails during extraction.
    """
    try:
        url = str(raw_url).strip()
        lowered = url.lower()

        # Everything after the first "://" (or the whole URL when absent).
        _, scheme_sep, after_scheme = url.partition('://')
        url_no_scheme = after_scheme if scheme_sep else url

        # Host is whatever precedes the first "/", "?" or "#", minus any ":port".
        host_with_port = re.split(r'[/?#]', url_no_scheme, maxsplit=1)[0]
        host = host_with_port.partition(':')[0]

        # Path is everything from the first "/" onwards (query/fragment included).
        _, first_slash, after_slash = url_no_scheme.partition('/')
        path = '/' + after_slash if first_slash else ''
        query = url.partition('?')[2].partition('#')[0]
        fragment = url.partition('#')[2]

        # Two views of the host: without a leading "www." for the domain
        # features, and with a guaranteed "www." for counting labels.
        starts_with_www = host.lower().startswith('www.')
        bare_host = host[4:] if starts_with_www else host
        labels = (host if starts_with_www else 'www.' + host).split('.')

        # All labels except the last two count as subdomains.
        subdomains = labels[:-2]
        sub_text = ''.join(subdomains)
        if subdomains:
            mean_sub_len = sum(map(len, subdomains)) / len(subdomains)
            mean_sub_hyphens = sum(s.count('-') for s in subdomains) / len(subdomains)
        else:
            mean_sub_len = 0.0
            mean_sub_hyphens = 0.0

        return {
            # -- whole URL
            'has_https': int(lowered.startswith('https://')),
            'has_http': int(lowered.startswith('http://')),
            'url_length': len(url),
            'number_of_dots_in_url': url.count('.'),
            'having_repeated_digits_in_url': _has_repeated_digits(url),
            'number_of_digits_in_url': sum(ch.isdigit() for ch in url),
            'number_of_special_char_in_url': sum(not ch.isalnum() for ch in url),
            'number_of_hyphens_in_url': url.count('-'),
            'number_of_underline_in_url': url.count('_'),
            'number_of_slash_in_url': url.count('/'),
            'number_of_questionmark_in_url': url.count('?'),
            'number_of_equal_in_url': url.count('='),
            'number_of_at_in_url': url.count('@'),
            'number_of_dollar_in_url': url.count('$'),
            'number_of_exclamation_in_url': url.count('!'),
            'number_of_hashtag_in_url': url.count('#'),
            'number_of_percent_in_url': url.count('%'),
            # -- domain (leading "www." removed)
            'domain_length': len(bare_host),
            'number_of_dots_in_domain': bare_host.count('.'),
            'number_of_hyphens_in_domain': bare_host.count('-'),
            'having_special_characters_in_domain': (
                1 if re.search(r'[^a-zA-Z0-9\.\-]', bare_host) else 0),
            'number_of_special_characters_in_domain': sum(
                not ch.isalnum() and ch not in '.-' for ch in bare_host),
            'having_digits_in_domain': int(any(ch.isdigit() for ch in bare_host)),
            'number_of_digits_in_domain': sum(ch.isdigit() for ch in bare_host),
            'having_repeated_digits_in_domain': _has_repeated_digits(bare_host),
            # -- subdomain labels
            'number_of_subdomains': max(0, len(labels) - 2),
            'subdomain_depth': len(subdomains),
            'having_hyphen_in_subdomain': int('-' in sub_text),
            'average_subdomain_length': mean_sub_len,
            'average_number_of_hyphens_in_subdomain': mean_sub_hyphens,
            'having_special_characters_in_subdomain': (
                1 if re.search(r'[^a-zA-Z0-9\-]', sub_text) else 0),
            'number_of_special_characters_in_subdomain': sum(
                not ch.isalnum() and ch != '-' for ch in sub_text),
            'having_digits_in_subdomain': int(any(ch.isdigit() for ch in sub_text)),
            'number_of_digits_in_subdomain': sum(ch.isdigit() for ch in sub_text),
            'having_repeated_digits_in_subdomain': _has_repeated_digits(sub_text),
            # -- path / query / fragment
            'having_path': int(len(path) > 1),
            'path_length': sum(1 for segment in path.split('/') if segment),
            'having_query': int(len(query) > 0),
            'having_fragment': int(len(fragment) > 0),
            'having_anchor': int('#' in url),
            # -- entropy
            'entropy_of_url': _shannon_entropy(url),
            'entropy_of_domain': _shannon_entropy(bare_host),
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Feature extraction failed: {str(e)}")
# ── 6. ENDPOINTS ─────────────────────────────────────────────
@app.get("/")
def health_check():
    """Report that the service is running, with the model tag and the number of loaded features."""
    status_report = dict(
        status="running",
        model="PhishGuard v3",
        features=len(features),
        message="PhishGuard API is live on HuggingFace Spaces!",
    )
    return status_report
@app.post("/predict", response_model=PredictionResponse)
def predict(request: URLRequest):
    """
    Classify one URL as phishing or legitimate.

    Args:
        request: Request body carrying the URL to check.

    Returns:
        PredictionResponse with the stripped URL, the integer prediction,
        its label, the confidence, and a human-readable message.

    Raises:
        HTTPException: status 400 when the URL is empty after stripping
            whitespace, or when feature extraction fails.
    """
    url = request.url.strip()
    if not url:
        raise HTTPException(status_code=400, detail="URL cannot be empty")

    feat_dict = extract_features(url)
    # Index by the loaded feature list so the model receives exactly those
    # columns, in that order.
    X = pd.DataFrame([feat_dict])[features]

    prediction = int(model.predict(X)[0])
    probability = model.predict_proba(X)[0]
    # Confidence is the largest class probability, rounded to four decimals.
    confidence = float(round(max(probability), 4))

    # The "legitimate" message now starts with a real check-mark glyph; the
    # previous literal held a mis-encoded byte sequence in its place.
    if prediction == 1:
        label = "phishing"
        message = f"⚠️ WARNING: This URL appears to be PHISHING! ({confidence*100:.1f}% confidence)"
    else:
        label = "legitimate"
        message = f"✅ This URL appears to be LEGITIMATE. ({confidence*100:.1f}% confidence)"

    return PredictionResponse(
        url=url,
        prediction=prediction,
        label=label,
        confidence=confidence,
        message=message,
    )