""" PhishLens Central Configuration Module. All hyperparameter grids, feature engineering thresholds, brand lists, risk TLD lists, and API endpoint constants are centralised here. Modify this file — not scattered magic numbers — to tune PhishLens behaviour. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Dict, List, Any # --------------------------------------------------------------------------- # Dataset & Training Constants # --------------------------------------------------------------------------- RANDOM_STATE: int = 42 TEST_SIZE: float = 0.20 # 80/20 stratified split CV_FOLDS: int = 5 # Stratified k-fold cross-validation OPTUNA_TRIALS: int = 50 # Bayesian hyperparameter search trials EMBEDDING_MODEL: str = "all-MiniLM-L6-v2" # 80MB; load once via cache TFIDF_MAX_FEATURES: int = 500 TFIDF_NGRAM_RANGE: tuple = (1, 2) # --------------------------------------------------------------------------- # Feature Engineering Thresholds # --------------------------------------------------------------------------- DOMAIN_AGE_RISK_DAYS: int = 30 # Domains < 30 days old → risk score 1.0 DOMAIN_AGE_WARN_DAYS: int = 90 # 30–90 days → risk score 0.5 MIN_URL_ENTROPY: float = 3.5 # Below this = low-entropy, likely benign CERT_LETS_ENCRYPT_RISK: float = 0.6 # LE cert alone is not conclusive ANOMALY_CONTAMINATION: float = 0.05 # Isolation Forest contamination rate WHOIS_TIMEOUT: int = 2 # Seconds before WHOIS fallback to -1 NETWORK_TIMEOUT: int = 3 # Seconds for crt.sh / API calls EMBEDDING_MAX_TOKENS: int = 512 # Truncate body before embedding # --------------------------------------------------------------------------- # Top 50 Spoofed Brands (used in brand impersonation feature) # --------------------------------------------------------------------------- BRAND_LIST: List[str] = [ # Global tech & e-commerce "microsoft", "apple", "google", "amazon", "netflix", "paypal", "dropbox", "docusign", "zoom", "adobe", "spotify", 
"linkedin", "facebook", "instagram", "twitter", "whatsapp", "telegram", # Financial — global "wellsfargo", "bankofamerica", "chase", "citibank", "hsbc", "barclays", "santander", "natwest", # Financial — Ireland-specific (SOC relevance for Irish roles) "aib", "bankofi", "bankofireland", "ulsterbank", "permanenttsb", "kbc", "revenuecie", "revenue", "anpost", # Logistics & shipping "dhl", "fedex", "ups", "usps", "anpost", "royalmail", "dpd", # Healthcare & government "nhs", "hse", "hmrc", "irs", "gov", # Cloud / SaaS "salesforce", "slack", "office365", "onedrive", "sharepoint", "icloud", "outlook", "gmail", ] # --------------------------------------------------------------------------- # Risk TLD List # --------------------------------------------------------------------------- RISK_TLD_LIST: List[str] = [ ".xyz", ".top", ".click", ".tk", ".ml", ".ga", ".cf", ".gq", ".icu", ".online", ".site", ".work", ".live", ".tech", ".pw", ".cc", ".biz", ".info", ".mobi", ".name", ] SAFE_TLD_LIST: List[str] = [ ".com", ".org", ".net", ".edu", ".gov", ".ie", ".co.uk", ".co.ie", ".ac.uk", ".ac.ie", ".gov.uk", ".gov.ie", ] # --------------------------------------------------------------------------- # Known URL Shortener Domains # --------------------------------------------------------------------------- URL_SHORTENER_DOMAINS: List[str] = [ "bit.ly", "tinyurl.com", "ow.ly", "t.co", "goo.gl", "rebrand.ly", "buff.ly", "adf.ly", "short.link", "cutt.ly", "is.gd", "v.gd", "tiny.cc", "bl.ink", "soo.gd", "s2r.co", "clck.ru", "tr.im", ] # --------------------------------------------------------------------------- # Suspicious URL Keywords # --------------------------------------------------------------------------- SUSPICIOUS_URL_KEYWORDS: List[str] = [ "login", "verify", "secure", "update", "confirm", "account", "banking", "webscr", "cmd=", "token=", "password", "credential", "signin", "logon", "auth", "reset", "recover", "unlock", "suspended", "validate", "authorize", ] # 
# ---------------------------------------------------------------------------
# Urgency / Social-Engineering Phrases (body text scoring)
# ---------------------------------------------------------------------------

URGENCY_PHRASES: List[str] = [
    "urgent", "immediately", "verify now", "account suspended",
    "unusual activity", "click here", "limited time",
    "you have been selected", "congratulations", "security alert",
    "action required", "your account will be closed", "within 24 hours",
    "expires today", "final notice", "important notice",
    "immediate action", "your password has been compromised",
    "verify your identity", "update your information",
    "confirm your details",
    # Modern phishing lures:
    "sign-in attempt", "signin attempt", "unusual sign-in",
    "one-time password", "one time password", "enter your otp",
    "delivery failed", "parcel could not be delivered", "package on hold",
    "claim your prize", "you have won", "you are a winner",
    "invoice attached", "payment overdue", "payment declined",
    "your account has been locked", "we detected suspicious",
    "confirm your email", "validate your account",
    "your subscription has expired", "reactivate your account",
    "refund pending", "tax refund", "wire transfer",
]

# ---------------------------------------------------------------------------
# Suspicious X-Mailer strings (bulk-sender fingerprints)
# ---------------------------------------------------------------------------

SUSPICIOUS_XMAILER_PATTERNS: List[str] = [
    # Confirmed bulk / mass-mailing tools — safe to flag
    "phpmailer", "sendblaster", "gmass", "massmailer", "bulkmail",
    # ESP platforms sometimes abused for phishing delivery
    "mailchimp", "sendgrid", "brevo", "constantcontact",
    # Note: 'postfix' / 'exim' are MTA names, NOT suspicious on their own.
    # Note: do NOT include "" or "unknown" — empty X-Mailer is NORMAL for
    # Gmail, Outlook Web, Apple Mail, and Yahoo Mail.
]

# ---------------------------------------------------------------------------
# Known-Abuse Registrars (domain registration risk signal)
# ---------------------------------------------------------------------------

ABUSE_REGISTRARS: List[str] = [
    "namecheap", "godaddy", "tucows", "public domain registry",
    "pdricann", "internet domain service", "alibaba cloud",
    "west263", "bizcn", "hichina",
]

# ---------------------------------------------------------------------------
# Freemail Domains (reply-to freemail = social engineering signal)
# ---------------------------------------------------------------------------

FREEMAIL_DOMAINS: List[str] = [
    "gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "aol.com",
    "protonmail.com", "mail.com", "icloud.com", "yandex.com",
    "tutanota.com", "gmx.com", "live.com",
]

# ---------------------------------------------------------------------------
# MITRE ATT&CK Technique Mapping
# ---------------------------------------------------------------------------
# Every entry carries the same three keys: technique_id, technique_name, tactic.

ATTACK_TECHNIQUE_MAP: Dict[str, Dict[str, str]] = {
    # Keys = exact feature names from the ML feature vector
    "url_url_shortener_max": {
        "technique_id": "T1566.002",
        "technique_name": "Phishing: Spearphishing Link",
        "tactic": "Initial Access",
    },
    "url_domain_age_risk": {
        "technique_id": "T1583.001",
        "technique_name": "Acquire Infrastructure: Domains",
        "tactic": "Resource Development",
    },
    # Punycode / IDN homograph look-alike domains are a domain-acquisition
    # technique (T1583.001). The previous value, T1036.007, is
    # "Double File Extension", which concerns file names rather than domains.
    "url_punycode_detected_max": {
        "technique_id": "T1583.001",
        "technique_name": "Acquire Infrastructure: Domains",
        "tactic": "Resource Development",
    },
    "html_external_form_action": {
        "technique_id": "T1056.003",
        "technique_name": "Input Capture: Web Portal Capture",
        "tactic": "Collection",
    },
    "url_cert_brand_mismatch": {
        "technique_id": "T1036",
        "technique_name": "Masquerading",
        "tactic": "Defense Evasion",
    },
    "hdr_received_geo_anomaly": {
        "technique_id": "T1071.003",
        "technique_name": "Application Layer Protocol: Mail Protocols",
        "tactic": "Command and Control",
    },
    "url_is_ip_address_max": {
        "technique_id": "T1583.005",
        "technique_name": "Acquire Infrastructure: Botnet",
        "tactic": "Resource Development",
    },
    "hdr_from_reply_to_mismatch": {
        "technique_id": "T1656",
        "technique_name": "Impersonation",
        "tactic": "Defense Evasion",
    },
}

# ---------------------------------------------------------------------------
# Model Hyperparameter Grids (Optuna search space)
# ---------------------------------------------------------------------------
# Tuples are (low, high) bounds; lists are categorical choices.

XGBOOST_PARAM_GRID: Dict[str, Any] = {
    "learning_rate": (0.01, 0.3),      # log-uniform
    "max_depth": (3, 10),              # int
    "n_estimators": (100, 1000),       # int
    "subsample": (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0),
    "min_child_weight": (1, 10),       # int
    "gamma": (0.0, 0.5),
}

RF_PARAM_GRID: Dict[str, Any] = {
    "n_estimators": (100, 500),        # int
    "max_depth": (5, 30),              # int; None = unlimited → use high int
    "min_samples_split": (2, 20),      # int
    "max_features": ["sqrt", "log2"],
}

LR_PARAM_GRID: Dict[str, Any] = {
    "C": (0.001, 100.0),               # log-uniform
    "solver": ["lbfgs", "saga"],
    "max_iter": [500, 1000, 2000],
}

CATBOOST_PARAM_GRID: Dict[str, Any] = {
    "iterations": (200, 1000),
    "learning_rate": (0.01, 0.3),
    "depth": (4, 10),
    "l2_leaf_reg": (1, 10),
}

# ---------------------------------------------------------------------------
# API Endpoints
# ---------------------------------------------------------------------------
# Entries containing {placeholders} are templates to be filled with str.format.

API_ENDPOINTS: Dict[str, str] = {
    "virustotal_url": "https://www.virustotal.com/api/v3/urls/{url_id}",
    "virustotal_submit": "https://www.virustotal.com/api/v3/urls",
    "google_safe_browsing": "https://safebrowsing.googleapis.com/v4/threatMatches:find",
    "abuseipdb_check": "https://api.abuseipdb.com/api/v2/check",
    "urlscan_search": "https://urlscan.io/api/v1/search/",
    "urlscan_submit": "https://urlscan.io/api/v1/scan/",
    "urlscan_result": "https://urlscan.io/api/v1/result/{uuid}/",
    "crtsh": "https://crt.sh/?q={domain}&output=json",
    "urlhaus_lookup": "https://urlhaus-api.abuse.ch/v1/url/",
}
# ---------------------------------------------------------------------------
# Config Dataclass (passed around the codebase)
# ---------------------------------------------------------------------------


@dataclass
class PhishLensConfig:
    """Central configuration object for PhishLens.

    Instantiate once and pass to all modules that need thresholds or params.

    Every field defaults to the module-level constant of the same
    (upper-cased) name, except ``prediction_threshold``, which defaults
    to 0.5. List-valued fields receive a fresh copy of the constant for
    each instance, so mutating one config's list never changes the
    module-level constant or any other config.
    """

    # --- Dataset & training ---
    random_state: int = RANDOM_STATE
    test_size: float = TEST_SIZE
    cv_folds: int = CV_FOLDS
    optuna_trials: int = OPTUNA_TRIALS
    embedding_model: str = EMBEDDING_MODEL
    tfidf_max_features: int = TFIDF_MAX_FEATURES
    # Tuples are immutable, so the constant can be shared directly.
    tfidf_ngram_range: tuple = TFIDF_NGRAM_RANGE

    # --- Feature engineering thresholds ---
    # NOTE(review): CERT_LETS_ENCRYPT_RISK and SUSPICIOUS_XMAILER_PATTERNS
    # have no matching field here — confirm whether that is intentional.
    domain_age_risk_days: int = DOMAIN_AGE_RISK_DAYS
    domain_age_warn_days: int = DOMAIN_AGE_WARN_DAYS
    min_url_entropy: float = MIN_URL_ENTROPY
    anomaly_contamination: float = ANOMALY_CONTAMINATION
    whois_timeout: int = WHOIS_TIMEOUT
    network_timeout: int = NETWORK_TIMEOUT
    embedding_max_tokens: int = EMBEDDING_MAX_TOKENS

    # --- Lookup lists ---
    # list(...) copies the constant; returning the constant itself from the
    # factory would alias one shared list across every instance.
    brand_list: List[str] = field(default_factory=lambda: list(BRAND_LIST))
    risk_tld_list: List[str] = field(default_factory=lambda: list(RISK_TLD_LIST))
    safe_tld_list: List[str] = field(default_factory=lambda: list(SAFE_TLD_LIST))
    url_shortener_domains: List[str] = field(
        default_factory=lambda: list(URL_SHORTENER_DOMAINS)
    )
    suspicious_url_keywords: List[str] = field(
        default_factory=lambda: list(SUSPICIOUS_URL_KEYWORDS)
    )
    urgency_phrases: List[str] = field(default_factory=lambda: list(URGENCY_PHRASES))
    freemail_domains: List[str] = field(default_factory=lambda: list(FREEMAIL_DOMAINS))
    abuse_registrars: List[str] = field(default_factory=lambda: list(ABUSE_REGISTRARS))

    # --- Classification ---
    prediction_threshold: float = 0.5

    def __repr__(self) -> str:
        """Return a short summary showing only four headline settings."""
        return (
            f"PhishLensConfig("
            f"random_state={self.random_state}, "
            f"cv_folds={self.cv_folds}, "
            f"embedding_model='{self.embedding_model}', "
            f"optuna_trials={self.optuna_trials})"
        )


# Singleton default config — import this directly where no customisation needed
DEFAULT_CONFIG = PhishLensConfig()