# ============================================================
# PhishGuard v5 — HuggingFace Spaces Deployment
# File   : app.py
# Port   : 7860 (required by HuggingFace)
#
# Endpoints:
#   GET  /              → health check
#   POST /predict       → predict if URL is phishing or legitimate
#   POST /predict/batch → predict multiple URLs at once
#
# Features: 48 hybrid features
#   42 structural + 2 knowledge-based + 4 pure structural
#
# Author : Uzman Zahid
#          Dublin Business School — 2026
# ============================================================
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import joblib
import json
import re
import math
import pandas as pd
import os

# ── 1. INITIALISE APP ────────────────────────────────────────
app = FastAPI(
    title="PhishGuard v5 API",
    description=(
        "Real-time phishing URL detection using Machine Learning — "
        "48 hybrid features — Uzman Zahid, Dublin Business School 2026"
    ),
    version="5.0.0",
)

# ── 2. CORS ──────────────────────────────────────────────────
# Wide open on purpose: demo frontends / browser extensions on any
# origin must be able to call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ── 3. LOAD MODEL ────────────────────────────────────────────
# Prefer the compressed artefact when present (smaller Space image);
# fall back to the plain pickle otherwise.
print("Loading PhishGuard v5 model...")
model_path = (
    "model_compressed.pkl"
    if os.path.exists("model_compressed.pkl")
    else "model.pkl"
)
try:
    model = joblib.load(model_path)
    print(f"✅ Model loaded from {model_path}")
except Exception as e:
    # Fail fast: the API is useless without a model, so re-raise
    # after logging — the Space will show the error and restart.
    print(f"❌ Error loading model: {e}")
    raise

try:
    # features.json holds the exact, ordered feature-column list the
    # model was trained on. Explicit UTF-8 so behaviour does not
    # depend on the container's locale (was previously unspecified).
    with open("features.json", encoding="utf-8") as f:
        features = json.load(f)
    print(f"✅ Features loaded: {len(features)} features")
except Exception as e:
    print(f"❌ Error loading features: {e}")
    raise
# ── 4. KNOWLEDGE LOOKUP TABLES ───────────────────────────────
# Based on APWG Phishing Activity Trends Reports (2016-2024):
# TLDs with documented high abuse rates.
SUSPICIOUS_TLDS = {
    'tk', 'ml', 'ga', 'cf', 'gq', 'xyz', 'top', 'club', 'online',
    'site', 'fun', 'icu', 'vip', 'cyou', 'lat', 'space', 'live',
    'pw', 'cc', 'su', 'ws', 'bz', 'name', 'mobi', 'link', 'click',
    'download', 'loan', 'win', 'racing', 'stream', 'trade', 'review',
    'accountant', 'science', 'work', 'party', 'faith', 'date',
    'cricket', 'ninja', 'bid', 'webcam', 'rocks', 'country',
}

# Well-established legitimate second-level domains.
KNOWN_SAFE_SLDS = {
    'google', 'microsoft', 'apple', 'amazon', 'facebook', 'youtube',
    'netflix', 'github', 'stackoverflow', 'wikipedia', 'twitter',
    'linkedin', 'instagram', 'spotify', 'stripe', 'paypal', 'dropbox',
    'slack', 'zoom', 'cloudflare', 'netlify', 'vercel', 'heroku',
    'reddit', 'pinterest', 'tiktok', 'discord', 'notion', 'figma',
    'canva', 'shopify', 'wordpress', 'adobe', 'salesforce', 'hubspot',
    'openai', 'anthropic', 'huggingface', 'kaggle', 'coursera',
    'udemy', 'edx', 'khanacademy', 'duolingo', 'bbc', 'cnn',
    'reuters', 'bloomberg', 'techcrunch', 'mit', 'stanford',
    'harvard', 'oxford', 'cambridge', 'dbs', 'ucd', 'tcd', 'dcu',
    'npmjs', 'pypi', 'kubernetes', 'mozilla', 'docker', 'spinbot',
    'ilovepdf', 'smallpdf', 'mp3cut', 'convertio', 'tinypng',
    'grammarly', 'wise', 'revolut', 'coinbase', 'binance', 'vimeo',
    'twitch', 'dailymotion', 'soundcloud', 'gov', 'nasa',
    'digitalocean',
}

# Country codes whose registries use compound second-level TLDs
# (e.g. "co.uk"). Hoisted to module level so the set is built once,
# not on every call to extract_features().
_CC_SECOND_LEVEL = {'uk', 'au', 'in', 'jp', 'nz', 'za', 'br', 'sg', 'ie', 'pk'}


# ── 5. REQUEST / RESPONSE MODELS ─────────────────────────────
class URLRequest(BaseModel):
    # Raw URL to classify; any format accepted (scheme optional).
    url: str


class PredictionResponse(BaseModel):
    url: str
    prediction: int   # 0 = legitimate, 1 = phishing
    label: str        # "legitimate" | "phishing"
    confidence: float  # winning-class probability, rounded to 4 dp
    message: str      # human-readable verdict


# ── 6. FEATURE EXTRACTION (48 HYBRID FEATURES) ───────────────
def _shannon_entropy(text):
    """Shannon entropy of *text* in bits per character (0.0 if empty)."""
    if not text:
        return 0.0
    probs = [text.count(c) / len(text) for c in set(text)]
    return -sum(p * math.log2(p) for p in probs if p > 0)


def extract_features(raw_url):
    """
    Extract 48 hybrid features from any URL format.

    42 structural + 2 knowledge-based + 4 pure structural.
    Deliberately string-based (no urllib) so the output stays
    identical to the feature extraction used during training —
    do not "fix" parsing quirks without retraining the model.

    Parameters
    ----------
    raw_url : str
        URL in any format (scheme and "www." optional).

    Returns
    -------
    dict
        48 entries, feature name -> numeric value.

    Raises
    ------
    HTTPException
        400 if feature extraction fails for any reason.
    """
    try:
        url = str(raw_url).strip()

        # ── Scheme ────────────────────────────────────────────
        lowered = url.lower()
        has_https = 1 if lowered.startswith('https://') else 0
        has_http = 1 if lowered.startswith('http://') else 0

        # ── Parse components ──────────────────────────────────
        url_no_scheme = url.split('://', 1)[1] if '://' in url else url
        domain_with_port = (
            url_no_scheme.split('/')[0].split('?')[0].split('#')[0]
        )
        domain_clean = domain_with_port.split(':')[0]
        parts = url_no_scheme.split('/', 1)
        # NOTE: path keeps any trailing query/fragment text — this
        # matches the training-time extraction, so it stays as-is.
        path = '/' + parts[1] if len(parts) > 1 else ''
        query = url.split('?', 1)[1].split('#')[0] if '?' in url else ''
        fragment = url.split('#', 1)[1] if '#' in url else ''

        # Strip a leading "www." for domain-level features...
        domain_no_www = domain_clean
        if domain_no_www.lower().startswith('www.'):
            domain_no_www = domain_no_www[4:]
        # ...but force "www." back on for subdomain counting so that
        # "example.com" and "www.example.com" featurise identically.
        if domain_clean.lower().startswith('www.'):
            domain_normalised = domain_clean
        else:
            domain_normalised = 'www.' + domain_clean
        domain_parts_full = domain_normalised.split('.')

        # ── TLD and SLD ───────────────────────────────────────
        domain_parts = domain_no_www.split('.')
        tld = domain_parts[-1].lower() if domain_parts else ''
        sld = domain_parts[-2].lower() if len(domain_parts) >= 2 else ''
        # Compound country-code TLDs such as "co.uk".
        # FIX: compare case-insensitively — previously a mixed-case
        # host like "site.CO.UK" skipped this branch entirely.
        if len(domain_parts) >= 3 and domain_parts[-1].lower() in _CC_SECOND_LEVEL:
            tld = f"{domain_parts[-2]}.{domain_parts[-1]}".lower()
            sld = domain_parts[-3].lower()

        # ── URL-level features ────────────────────────────────
        url_length = len(url)
        number_of_dots_in_url = url.count('.')
        digits_in_url = re.findall(r'\d', url)
        # "Repeated digits" = some digit character occurs twice.
        having_repeated_digits_in_url = (
            1 if digits_in_url
            and len(digits_in_url) != len(set(digits_in_url)) else 0
        )
        number_of_digits_in_url = sum(c.isdigit() for c in url)
        number_of_special_char_in_url = sum(not c.isalnum() for c in url)
        number_of_hyphens_in_url = url.count('-')
        number_of_underline_in_url = url.count('_')
        number_of_slash_in_url = url.count('/')
        number_of_questionmark_in_url = url.count('?')
        number_of_equal_in_url = url.count('=')
        number_of_at_in_url = url.count('@')
        number_of_dollar_in_url = url.count('$')
        number_of_exclamation_in_url = url.count('!')
        number_of_hashtag_in_url = url.count('#')
        number_of_percent_in_url = url.count('%')

        # ── Domain-level features ─────────────────────────────
        domain_length = len(domain_no_www)
        number_of_dots_in_domain = domain_no_www.count('.')
        number_of_hyphens_in_domain = domain_no_www.count('-')
        having_special_characters_in_domain = (
            1 if re.search(r'[^a-zA-Z0-9\.\-]', domain_no_www) else 0
        )
        number_of_special_characters_in_domain = sum(
            not c.isalnum() and c not in '.-' for c in domain_no_www
        )
        having_digits_in_domain = (
            1 if any(c.isdigit() for c in domain_no_www) else 0
        )
        number_of_digits_in_domain = sum(
            c.isdigit() for c in domain_no_www
        )
        digits_in_domain = re.findall(r'\d', domain_no_www)
        having_repeated_digits_in_domain = (
            1 if digits_in_domain
            and len(digits_in_domain) != len(set(digits_in_domain)) else 0
        )

        # ── Subdomain features (on the www-normalised host) ───
        # Because "www." is forced on above, a bare "example.com"
        # still counts one subdomain ("www") — training-time quirk.
        number_of_subdomains = max(0, len(domain_parts_full) - 2)
        subdomains = (
            domain_parts_full[:-2] if len(domain_parts_full) > 2 else []
        )
        subdomain_depth = len(subdomains)
        having_hyphen_in_subdomain = (
            1 if any('-' in s for s in subdomains) else 0
        )
        average_subdomain_length = (
            sum(len(s) for s in subdomains) / len(subdomains)
            if subdomains else 0.0
        )
        average_number_of_hyphens_in_subdomain = (
            sum(s.count('-') for s in subdomains) / len(subdomains)
            if subdomains else 0.0
        )
        having_special_characters_in_subdomain = (
            1 if any(re.search(r'[^a-zA-Z0-9\-]', s) for s in subdomains)
            else 0
        )
        number_of_special_characters_in_subdomain = sum(
            sum(not c.isalnum() and c != '-' for c in s)
            for s in subdomains
        )
        having_digits_in_subdomain = (
            1 if any(any(c.isdigit() for c in s) for s in subdomains)
            else 0
        )
        number_of_digits_in_subdomain = sum(
            sum(c.isdigit() for c in s) for s in subdomains
        )
        all_sub_digits = re.findall(r'\d', ''.join(subdomains))
        having_repeated_digits_in_subdomain = (
            1 if all_sub_digits
            and len(all_sub_digits) != len(set(all_sub_digits)) else 0
        )

        # ── Path / query features ─────────────────────────────
        having_path = 1 if len(path) > 1 else 0
        # NOTE: path_length counts path *segments*, not characters
        # (matches training).
        path_length = len([p for p in path.split('/') if p])
        having_query = 1 if len(query) > 0 else 0
        having_fragment = 1 if len(fragment) > 0 else 0
        having_anchor = 1 if '#' in url else 0

        # ── Entropy features ──────────────────────────────────
        entropy_of_url = _shannon_entropy(url)
        entropy_of_domain = _shannon_entropy(domain_no_www)

        # ── Feature 43: has_suspicious_tld ────────────────────
        # Knowledge-based: APWG documented high-abuse TLDs.
        has_suspicious_tld = 1 if tld.lower() in SUSPICIOUS_TLDS else 0

        # ── Feature 44: is_known_safe_sld ─────────────────────
        # Knowledge-based: established legitimate platforms.
        is_known_safe_sld = 1 if sld.lower() in KNOWN_SAFE_SLDS else 0

        # ── Feature 45: consonant_vowel_ratio ─────────────────
        # Pure structural: unnatural domain names = phishing signal.
        # +1 denominator avoids division by zero for vowel-free names.
        vowels = set('aeiouAEIOU')
        letters = [c for c in domain_no_www if c.isalpha()]
        vowel_count = sum(1 for c in letters if c in vowels)
        consonant_count = len(letters) - vowel_count
        consonant_vowel_ratio = round(
            consonant_count / (vowel_count + 1), 4)

        # ── Feature 46: longest_digit_sequence ────────────────
        # Pure structural: long digit runs indicate random generation.
        longest_digit_seq = max(
            (len(s) for s in re.findall(r'\d+', domain_no_www)),
            default=0)

        # ── Feature 47: digit_letter_ratio ────────────────────
        # Pure structural: digit-heavy domains = phishing signal.
        alpha_count = sum(c.isalpha() for c in domain_no_www)
        digit_count = sum(c.isdigit() for c in domain_no_www)
        digit_letter_ratio = round(
            digit_count / (alpha_count + 1), 4)

        # ── Feature 48: path_to_url_ratio ─────────────────────
        # Pure structural: bare phishing domains have ratio = 0.
        path_to_url_ratio = (
            round(len(path) / len(url), 4) if len(url) > 0 else 0.0
        )

        return {
            'has_https': has_https,
            'has_http': has_http,
            'url_length': url_length,
            'number_of_dots_in_url': number_of_dots_in_url,
            'having_repeated_digits_in_url': having_repeated_digits_in_url,
            'number_of_digits_in_url': number_of_digits_in_url,
            'number_of_special_char_in_url': number_of_special_char_in_url,
            'number_of_hyphens_in_url': number_of_hyphens_in_url,
            'number_of_underline_in_url': number_of_underline_in_url,
            'number_of_slash_in_url': number_of_slash_in_url,
            'number_of_questionmark_in_url': number_of_questionmark_in_url,
            'number_of_equal_in_url': number_of_equal_in_url,
            'number_of_at_in_url': number_of_at_in_url,
            'number_of_dollar_in_url': number_of_dollar_in_url,
            'number_of_exclamation_in_url': number_of_exclamation_in_url,
            'number_of_hashtag_in_url': number_of_hashtag_in_url,
            'number_of_percent_in_url': number_of_percent_in_url,
            'domain_length': domain_length,
            'number_of_dots_in_domain': number_of_dots_in_domain,
            'number_of_hyphens_in_domain': number_of_hyphens_in_domain,
            'having_special_characters_in_domain': having_special_characters_in_domain,
            'number_of_special_characters_in_domain': number_of_special_characters_in_domain,
            'having_digits_in_domain': having_digits_in_domain,
            'number_of_digits_in_domain': number_of_digits_in_domain,
            'having_repeated_digits_in_domain': having_repeated_digits_in_domain,
            'number_of_subdomains': number_of_subdomains,
            'subdomain_depth': subdomain_depth,
            'having_hyphen_in_subdomain': having_hyphen_in_subdomain,
            'average_subdomain_length': average_subdomain_length,
            'average_number_of_hyphens_in_subdomain': average_number_of_hyphens_in_subdomain,
            'having_special_characters_in_subdomain': having_special_characters_in_subdomain,
            'number_of_special_characters_in_subdomain': number_of_special_characters_in_subdomain,
            'having_digits_in_subdomain': having_digits_in_subdomain,
            'number_of_digits_in_subdomain': number_of_digits_in_subdomain,
            'having_repeated_digits_in_subdomain': having_repeated_digits_in_subdomain,
            'having_path': having_path,
            'path_length': path_length,
            'having_query': having_query,
            'having_fragment': having_fragment,
            'having_anchor': having_anchor,
            'entropy_of_url': entropy_of_url,
            'entropy_of_domain': entropy_of_domain,
            # New 6 hybrid features
            'has_suspicious_tld': has_suspicious_tld,
            'is_known_safe_sld': is_known_safe_sld,
            'consonant_vowel_ratio': consonant_vowel_ratio,
            'longest_digit_sequence': longest_digit_seq,
            'digit_letter_ratio': digit_letter_ratio,
            'path_to_url_ratio': path_to_url_ratio,
        }
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Feature extraction failed: {str(e)}"
        )


# ── 7. ENDPOINTS ─────────────────────────────────────────────
def _classify(url):
    """
    Score one URL with the loaded model.

    Shared by /predict and /predict/batch (the logic was previously
    duplicated in both handlers).

    Returns
    -------
    tuple[int, str, float]
        (prediction, label, confidence): prediction is 0/1, label the
        matching string, confidence the winning-class probability
        rounded to 4 decimal places.
    """
    feat_dict = extract_features(url)
    # Re-order columns to the exact feature order the model expects.
    X = pd.DataFrame([feat_dict])[features]
    prediction = int(model.predict(X)[0])
    confidence = float(round(max(model.predict_proba(X)[0]), 4))
    label = "phishing" if prediction == 1 else "legitimate"
    return prediction, label, confidence


@app.get("/")
def health_check():
    """Liveness probe plus basic model metadata."""
    return {
        "status": "running",
        "model": "PhishGuard v5",
        "features": len(features),
        "version": "5.0.0",
        "message": "PhishGuard v5 API is live — 48 hybrid features!",
        "author": "Uzman Zahid — Dublin Business School 2026",
        "docs": "/docs",
    }


@app.post("/predict", response_model=PredictionResponse)
def predict(request: URLRequest):
    """
    Predict whether a URL is phishing or legitimate.

    Returns prediction (0=legitimate, 1=phishing), label,
    confidence score, and message.

    Raises HTTPException 400 on empty input or extraction failure.
    """
    url = request.url.strip()
    if not url:
        raise HTTPException(
            status_code=400,
            detail="URL cannot be empty"
        )

    prediction, label, confidence = _classify(url)
    if prediction == 1:
        message = (
            f"⚠️ WARNING: This URL appears to be PHISHING! "
            f"({confidence*100:.1f}% confidence)"
        )
    else:
        message = (
            f"✅ This URL appears to be LEGITIMATE. "
            f"({confidence*100:.1f}% confidence)"
        )
    return PredictionResponse(
        url=url,
        prediction=prediction,
        label=label,
        confidence=confidence,
        message=message,
    )


@app.post("/predict/batch")
def predict_batch(urls: list[str]):
    """
    Predict multiple URLs at once.

    Accepts a JSON list of URL strings. Returns one entry per input
    URL, in order: either a prediction record or an "error" record.
    """
    results = []
    for url in urls:
        try:
            prediction, label, confidence = _classify(url.strip())
            results.append({
                "url": url,
                "prediction": prediction,
                "label": label,
                "confidence": confidence,
            })
        except Exception as e:
            # Best-effort: one bad URL must not fail the whole batch.
            results.append({"url": url, "error": str(e)})
    return {
        "results": results,
        "total": len(results),
        "model": "PhishGuard v5",
    }