Spaces:

SagarTony90265
/

PhishSentinel

Sleeping

github-actions[bot]

Deploy to HF Spaces (ci)

0fd143d 25 days ago

12.8 kB

	"""
	PhishLens Central Configuration Module.

	All hyperparameter grids, feature engineering thresholds, brand lists,
	risk TLD lists, and API endpoint constants are centralised here.
	Modify this file — not scattered magic numbers — to tune PhishLens behaviour.
	"""

	from __future__ import annotations

	from dataclasses import dataclass, field
	from typing import Dict, List, Any


	# ---------------------------------------------------------------------------
	# Dataset & Training Constants
	# ---------------------------------------------------------------------------

	# Random seed.
	RANDOM_STATE: int = 42
	# Hold-out fraction: 80/20 stratified split.
	TEST_SIZE: float = 0.2
	# Number of folds for stratified k-fold cross-validation.
	CV_FOLDS: int = 5
	# Number of Bayesian hyperparameter search trials.
	OPTUNA_TRIALS: int = 50
	# Roughly 80MB; load it once via a cache.
	EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
	TFIDF_MAX_FEATURES: int = 500
	TFIDF_NGRAM_RANGE: tuple = (1, 2)


	# ---------------------------------------------------------------------------
	# Feature Engineering Thresholds
	# ---------------------------------------------------------------------------

	# Domains younger than 30 days → risk score 1.0.
	DOMAIN_AGE_RISK_DAYS: int = 30
	# Domains aged 30–90 days → risk score 0.5.
	DOMAIN_AGE_WARN_DAYS: int = 90
	# URL entropy below this value = low-entropy, likely benign.
	MIN_URL_ENTROPY: float = 3.5
	# A Let's Encrypt certificate alone is not conclusive.
	CERT_LETS_ENCRYPT_RISK: float = 0.6
	# Contamination rate for the Isolation Forest.
	ANOMALY_CONTAMINATION: float = 0.05
	# Seconds to wait before the WHOIS lookup falls back to -1.
	WHOIS_TIMEOUT: int = 2
	# Seconds allowed for crt.sh / API calls.
	NETWORK_TIMEOUT: int = 3
	# Body text is truncated to this many tokens before embedding.
	EMBEDDING_MAX_TOKENS: int = 512


	# ---------------------------------------------------------------------------
	# Commonly Spoofed Brands (used in brand impersonation feature)
	# ---------------------------------------------------------------------------

	# Lower-case brand tokens for the brand-impersonation feature.
	# Each token appears exactly once, so a consumer that iterates or counts
	# matches never double-counts a brand.
	BRAND_LIST: List[str] = [
	    # Global tech & e-commerce
	    "microsoft", "apple", "google", "amazon", "netflix", "paypal",
	    "dropbox", "docusign", "zoom", "adobe", "spotify", "linkedin",
	    "facebook", "instagram", "twitter", "whatsapp", "telegram",
	    # Financial — global
	    "wellsfargo", "bankofamerica", "chase", "citibank", "hsbc",
	    "barclays", "santander", "natwest",
	    # Financial — Ireland-specific (SOC relevance for Irish roles)
	    # NOTE(review): "revenuecie" looks like a typo (perhaps "revenueie");
	    # kept unchanged until confirmed.
	    "aib", "bankofi", "bankofireland", "ulsterbank", "permanenttsb",
	    "kbc", "revenuecie", "revenue", "anpost",
	    # Logistics & shipping ("anpost" is already listed in the Irish group
	    # above, so it is not repeated here)
	    "dhl", "fedex", "ups", "usps", "royalmail", "dpd",
	    # Healthcare & government
	    "nhs", "hse", "hmrc", "irs", "gov",
	    # Cloud / SaaS
	    "salesforce", "slack", "office365", "onedrive", "sharepoint",
	    "icloud", "outlook", "gmail",
	]

	# ---------------------------------------------------------------------------
	# Risk TLD List
	# ---------------------------------------------------------------------------

	# TLD suffixes (each with its leading dot) on the risk list.
	RISK_TLD_LIST: List[str] = [
	    ".xyz", ".top", ".click", ".tk", ".ml",
	    ".ga", ".cf", ".gq", ".icu", ".online",
	    ".site", ".work", ".live", ".tech", ".pw",
	    ".cc", ".biz", ".info", ".mobi", ".name",
	]

	# TLD suffixes (each with its leading dot) on the safe list; includes
	# multi-label suffixes such as ".co.uk".
	SAFE_TLD_LIST: List[str] = [
	    ".com", ".org", ".net", ".edu",
	    ".gov", ".ie", ".co.uk", ".co.ie",
	    ".ac.uk", ".ac.ie", ".gov.uk", ".gov.ie",
	]

	# ---------------------------------------------------------------------------
	# Known URL Shortener Domains
	# ---------------------------------------------------------------------------

	# Host names of known URL-shortening services.
	URL_SHORTENER_DOMAINS: List[str] = [
	    "bit.ly",
	    "tinyurl.com",
	    "ow.ly",
	    "t.co",
	    "goo.gl",
	    "rebrand.ly",
	    "buff.ly",
	    "adf.ly",
	    "short.link",
	    "cutt.ly",
	    "is.gd",
	    "v.gd",
	    "tiny.cc",
	    "bl.ink",
	    "soo.gd",
	    "s2r.co",
	    "clck.ru",
	    "tr.im",
	]

	# ---------------------------------------------------------------------------
	# Suspicious URL Keywords
	# ---------------------------------------------------------------------------

	# Lower-case keywords regarded as suspicious when found in a URL.
	SUSPICIOUS_URL_KEYWORDS: List[str] = [
	    "login", "verify", "secure", "update",
	    "confirm", "account", "banking", "webscr",
	    "cmd=", "token=", "password", "credential",
	    "signin", "logon", "auth", "reset",
	    "recover", "unlock", "suspended", "validate",
	    "authorize",
	]

	# ---------------------------------------------------------------------------
	# Urgency / Social-Engineering Phrases (body text scoring)
	# ---------------------------------------------------------------------------

	# Lower-case urgency / social-engineering phrases for body-text scoring.
	URGENCY_PHRASES: List[str] = [
	    "urgent",
	    "immediately",
	    "verify now",
	    "account suspended",
	    "unusual activity",
	    "click here",
	    "limited time",
	    "you have been selected",
	    "congratulations",
	    "security alert",
	    "action required",
	    "your account will be closed",
	    "within 24 hours",
	    "expires today",
	    "final notice",
	    "important notice",
	    "immediate action",
	    "your password has been compromised",
	    "verify your identity",
	    "update your information",
	    "confirm your details",
	    # Modern phishing lures:
	    "sign-in attempt",
	    "signin attempt",
	    "unusual sign-in",
	    "one-time password",
	    "one time password",
	    "enter your otp",
	    "delivery failed",
	    "parcel could not be delivered",
	    "package on hold",
	    "claim your prize",
	    "you have won",
	    "you are a winner",
	    "invoice attached",
	    "payment overdue",
	    "payment declined",
	    "your account has been locked",
	    "we detected suspicious",
	    "confirm your email",
	    "validate your account",
	    "your subscription has expired",
	    "reactivate your account",
	    "refund pending",
	    "tax refund",
	    "wire transfer",
	]

	# ---------------------------------------------------------------------------
	# Suspicious X-Mailer strings (bulk-sender fingerprints)
	# ---------------------------------------------------------------------------

	# Lower-case X-Mailer fragments treated as bulk-sender fingerprints.
	SUSPICIOUS_XMAILER_PATTERNS: List[str] = [
	    # Bulk / mass-mailing tools — confirmed, safe to flag
	    "phpmailer",
	    "sendblaster",
	    "gmass",
	    "massmailer",
	    "bulkmail",
	    # ESP platforms that are sometimes abused to deliver phishing
	    "mailchimp",
	    "sendgrid",
	    "brevo",
	    "constantcontact",
	    # Deliberately NOT listed:
	    #   * 'postfix' / 'exim' — MTA names, not suspicious on their own.
	    #   * "" or "unknown" — an empty X-Mailer is NORMAL for Gmail,
	    #     Outlook Web, Apple Mail, and Yahoo Mail.
	]

	# ---------------------------------------------------------------------------
	# Known-Abuse Registrars (domain registration risk signal)
	# ---------------------------------------------------------------------------

	# Lower-case registrar name fragments used as a domain-registration
	# risk signal.
	ABUSE_REGISTRARS: List[str] = [
	    "namecheap",
	    "godaddy",
	    "tucows",
	    "public domain registry",
	    "pdricann",
	    "internet domain service",
	    "alibaba cloud",
	    "west263",
	    "bizcn",
	    "hichina",
	]

	# ---------------------------------------------------------------------------
	# Freemail Domains (reply-to freemail = social engineering signal)
	# ---------------------------------------------------------------------------

	# Free webmail domains; a reply-to on one of these is a
	# social-engineering signal.
	FREEMAIL_DOMAINS: List[str] = [
	    "gmail.com", "yahoo.com", "hotmail.com",
	    "outlook.com", "aol.com", "protonmail.com",
	    "mail.com", "icloud.com", "yandex.com",
	    "tutanota.com", "gmx.com", "live.com",
	]

	# ---------------------------------------------------------------------------
	# MITRE ATT&CK Technique Mapping
	# ---------------------------------------------------------------------------

	# Maps a feature name to the MITRE ATT&CK technique reported for it.
	# Every value holds exactly three string fields: "technique_id",
	# "technique_name" and "tactic".
	ATTACK_TECHNIQUE_MAP: Dict[str, Dict[str, str]] = {
	    # Keys = exact feature names from the ML feature vector
	    "url_url_shortener_max": {
	        "technique_id": "T1566.002",
	        "technique_name": "Phishing: Spearphishing Link",
	        "tactic": "Initial Access",
	    },
	    "url_domain_age_risk": {
	        "technique_id": "T1583.001",
	        "technique_name": "Acquire Infrastructure: Domains",
	        "tactic": "Resource Development",
	    },
	    # Punycode hosts are a look-alike (masquerading) signal. The parent
	    # technique is used because sub-technique T1036.007 is "Double File
	    # Extension", which describes file names rather than domains.
	    "url_punycode_detected_max": {
	        "technique_id": "T1036",
	        "technique_name": "Masquerading",
	        "tactic": "Defense Evasion",
	    },
	    "html_external_form_action": {
	        "technique_id": "T1056.003",
	        "technique_name": "Input Capture: Web Portal Capture",
	        "tactic": "Collection",
	    },
	    "url_cert_brand_mismatch": {
	        "technique_id": "T1036",
	        "technique_name": "Masquerading",
	        "tactic": "Defense Evasion",
	    },
	    # NOTE(review): T1071.003 is a Command and Control technique; confirm
	    # it is the intended mapping for a Received-header geo anomaly.
	    "hdr_received_geo_anomaly": {
	        "technique_id": "T1071.003",
	        "technique_name": "Application Layer Protocol: Mail Protocols",
	        "tactic": "Command and Control",
	    },
	    # NOTE(review): an IP-literal URL does not by itself imply a botnet;
	    # confirm T1583.005 is the intended mapping.
	    "url_is_ip_address_max": {
	        "technique_id": "T1583.005",
	        "technique_name": "Acquire Infrastructure: Botnet",
	        "tactic": "Resource Development",
	    },
	    "hdr_from_reply_to_mismatch": {
	        "technique_id": "T1656",
	        "technique_name": "Impersonation",
	        "tactic": "Defense Evasion",
	    },
	}

	# ---------------------------------------------------------------------------
	# Model Hyperparameter Grids (Optuna search space)
	# ---------------------------------------------------------------------------

	# XGBoost search space. 2-tuples are presumably (low, high) bounds —
	# confirm against the code that consumes this grid.
	XGBOOST_PARAM_GRID: Dict[str, Any] = {
	    # log-uniform
	    "learning_rate": (0.01, 0.3),
	    # int
	    "max_depth": (3, 10),
	    # int
	    "n_estimators": (100, 1000),
	    "subsample": (0.6, 1.0),
	    "colsample_bytree": (0.6, 1.0),
	    # int
	    "min_child_weight": (1, 10),
	    "gamma": (0.0, 0.5),
	}

	# Random-forest search space; list values are categorical choices.
	RF_PARAM_GRID: Dict[str, Any] = {
	    # int
	    "n_estimators": (100, 500),
	    # int; None = unlimited → use a high int instead
	    "max_depth": (5, 30),
	    # int
	    "min_samples_split": (2, 20),
	    "max_features": ["sqrt", "log2"],
	}

	# Logistic-regression search space; list values are categorical choices.
	LR_PARAM_GRID: Dict[str, Any] = {
	    # log-uniform
	    "C": (0.001, 100.0),
	    "solver": ["lbfgs", "saga"],
	    "max_iter": [500, 1000, 2000],
	}

	# CatBoost search space. 2-tuples are presumably (low, high) bounds —
	# confirm against the code that consumes this grid.
	CATBOOST_PARAM_GRID: Dict[str, Any] = {
	    "iterations": (200, 1000),
	    "learning_rate": (0.01, 0.3),
	    "depth": (4, 10),
	    "l2_leaf_reg": (1, 10),
	}

	# ---------------------------------------------------------------------------
	# API Endpoints
	# ---------------------------------------------------------------------------

	# External service URLs keyed by purpose. Entries containing "{...}"
	# are templates with a named placeholder to fill in before use.
	API_ENDPOINTS: Dict[str, str] = {
	    # VirusTotal
	    "virustotal_url": "https://www.virustotal.com/api/v3/urls/{url_id}",
	    "virustotal_submit": "https://www.virustotal.com/api/v3/urls",
	    # Google Safe Browsing
	    "google_safe_browsing": "https://safebrowsing.googleapis.com/v4/threatMatches:find",
	    # AbuseIPDB
	    "abuseipdb_check": "https://api.abuseipdb.com/api/v2/check",
	    # urlscan.io
	    "urlscan_search": "https://urlscan.io/api/v1/search/",
	    "urlscan_submit": "https://urlscan.io/api/v1/scan/",
	    "urlscan_result": "https://urlscan.io/api/v1/result/{uuid}/",
	    # crt.sh
	    "crtsh": "https://crt.sh/?q={domain}&output=json",
	    # URLhaus
	    "urlhaus_lookup": "https://urlhaus-api.abuse.ch/v1/url/",
	}

	# ---------------------------------------------------------------------------
	# Config Dataclass (passed around the codebase)
	# ---------------------------------------------------------------------------


	@dataclass
	class PhishLensConfig:
	    """Central configuration object for PhishLens.

	    Instantiate once and pass to all modules that need thresholds or params.

	    Every field except ``prediction_threshold`` takes its default from the
	    module-level constant with the matching upper-case name;
	    ``prediction_threshold`` defaults to 0.5.

	    Each list-valued field receives its own shallow copy of the module-level
	    list, so mutating one instance's list (for example
	    ``cfg.brand_list.append(...)``) does not change other instances or the
	    module constants.
	    """

	    # --- Dataset & training ---
	    random_state: int = RANDOM_STATE
	    test_size: float = TEST_SIZE
	    cv_folds: int = CV_FOLDS
	    optuna_trials: int = OPTUNA_TRIALS
	    embedding_model: str = EMBEDDING_MODEL
	    tfidf_max_features: int = TFIDF_MAX_FEATURES
	    tfidf_ngram_range: tuple = field(default_factory=lambda: TFIDF_NGRAM_RANGE)

	    # --- Feature-engineering thresholds ---
	    # NOTE(review): CERT_LETS_ENCRYPT_RISK and SUSPICIOUS_XMAILER_PATTERNS
	    # have no corresponding field here — confirm that is intentional.
	    domain_age_risk_days: int = DOMAIN_AGE_RISK_DAYS
	    domain_age_warn_days: int = DOMAIN_AGE_WARN_DAYS
	    min_url_entropy: float = MIN_URL_ENTROPY
	    anomaly_contamination: float = ANOMALY_CONTAMINATION
	    whois_timeout: int = WHOIS_TIMEOUT
	    network_timeout: int = NETWORK_TIMEOUT
	    embedding_max_tokens: int = EMBEDDING_MAX_TOKENS

	    # --- Lookup lists ---
	    # list(...) makes a per-instance copy; returning the module-level list
	    # itself would alias one mutable object across every instance.
	    brand_list: List[str] = field(default_factory=lambda: list(BRAND_LIST))
	    risk_tld_list: List[str] = field(default_factory=lambda: list(RISK_TLD_LIST))
	    safe_tld_list: List[str] = field(default_factory=lambda: list(SAFE_TLD_LIST))
	    url_shortener_domains: List[str] = field(
	        default_factory=lambda: list(URL_SHORTENER_DOMAINS)
	    )
	    suspicious_url_keywords: List[str] = field(
	        default_factory=lambda: list(SUSPICIOUS_URL_KEYWORDS)
	    )
	    urgency_phrases: List[str] = field(default_factory=lambda: list(URGENCY_PHRASES))
	    freemail_domains: List[str] = field(default_factory=lambda: list(FREEMAIL_DOMAINS))
	    abuse_registrars: List[str] = field(default_factory=lambda: list(ABUSE_REGISTRARS))

	    # --- Inference ---
	    prediction_threshold: float = 0.5

	    def __repr__(self) -> str:
	        """Return a compact summary of four scalar settings.

	        Only ``random_state``, ``cv_folds``, ``embedding_model`` and
	        ``optuna_trials`` are shown; all other fields are omitted.
	        """
	        return (
	            f"PhishLensConfig("
	            f"random_state={self.random_state}, "
	            f"cv_folds={self.cv_folds}, "
	            f"embedding_model='{self.embedding_model}', "
	            f"optuna_trials={self.optuna_trials})"
	        )


	# Shared default config — import this directly where no customisation is needed.
	# It is an ordinary module-level instance built with all default values when
	# this module is imported; nothing enforces it as a singleton, so further
	# PhishLensConfig() instances can still be created.
	DEFAULT_CONFIG: PhishLensConfig = PhishLensConfig()