# PhishGuard / app.py
# NOTE(review): the following Hugging Face Spaces page residue was removed
# from executable position ("elecie's picture", "Lazy load", commit 466b21e) —
# it is not Python and would raise a SyntaxError at import time.
import re
import tldextract
from rapidfuzz import fuzz
import gradio as gr
# --- Labels & Regex ---
LABELS = ["urgent", "fear", "authority", "financial scam", "safe"]
CUES = {
"urgency": [r"\burgent\b", r"\bimmediately\b", r"\bverify now\b", r"\blimited time\b"],
"fear": [r"\bsuspended\b", r"\block(ed)?\b", r"\blegal action\b", r"\bunauthorized\b"],
"authority": [r"\bCEO\b", r"\badmin\b", r"\bIT support\b", r"\bgovernment\b"],
"financial": [r"\bwin\b", r"\bprize\b", r"\blottery\b", r"\binvestment\b"]
}
TRUSTED_DOMAINS = ["google.com", "paypal.com", "microsoft.com", "amazon.com", "facebook.com", "apple.com"]
SUSPICIOUS_TLDS = ["xyz", "top", "tk", "gq", "cf", "ml"]
URL_PATTERN = re.compile(r"(https?://[^\s]+|www\.[^\s]+|\b[a-zA-Z0-9-]+\.[a-z]{2,}\b)")
# --- Lazy-load Hugging Face model ---
classifier = None
def get_classifier():
global classifier
if classifier is None:
from transformers import pipeline
classifier = pipeline(
"zero-shot-classification",
model="valhalla/distilbart-mnli-12-1",
device=-1 # CPU
)
return classifier
# --- Analysis functions ---
def regex_analysis(text):
findings = []
score = 0
for category, patterns in CUES.items():
for pat in patterns:
if re.search(pat, text, re.IGNORECASE):
findings.append(f"{category.capitalize()} cue detected: '{pat.strip(r'\\b')}'")
score += 20
return score, findings
def huggingface_analysis(text):
clf = get_classifier()
result = clf(text, LABELS)
label_scores = list(zip(result["labels"], result["scores"]))
label_scores.sort(key=lambda x: x[1], reverse=True)
top_two = label_scores[:2]
findings = [f"HuggingFace: {label} (confidence {score:.2f})" for label, score in top_two if label != "safe"]
hf_score = sum(int(score * 30) for label, score in top_two if label != "safe")
return hf_score, findings
def url_analysis(url):
findings = []
score = 0
ext = tldextract.extract(url)
domain = f"{ext.domain}.{ext.suffix}"
if not url.lower().startswith("https://"):
findings.append("Non-HTTPS connection detected")
score += 25
if ext.suffix in SUSPICIOUS_TLDS:
findings.append(f"Suspicious TLD detected: .{ext.suffix}")
score += 20
if re.match(r"^https?://\d+\.\d+\.\d+\.\d+", url):
findings.append("IP address used instead of domain")
score += 30
for trusted in TRUSTED_DOMAINS:
similarity = fuzz.ratio(domain, trusted)
if similarity > 80 and domain != trusted:
findings.append(f"Possible typosquatting: {domain} vs {trusted} (similarity {similarity}%)")
score += 30
break
return score, findings
def extract_url_from_text(text):
match = URL_PATTERN.search(text)
return match.group(0) if match else None
# --- Main analyze function for Gradio ---
def analyze(text):
regex_score, regex_findings = regex_analysis(text)
hf_score, hf_findings = huggingface_analysis(text)
text_score = regex_score + hf_score
url = extract_url_from_text(text)
url_score, url_findings = (0, [])
if url:
url_score, url_findings = url_analysis(url)
text_score = min(text_score, 100) * 0.4
url_score = min(url_score, 100) * 0.6
total_score = min(100, int(text_score + url_score))
reasons = regex_findings + hf_findings + url_findings
if total_score < 30:
risk_level = "Low"
elif total_score < 70:
risk_level = "Medium"
else:
risk_level = "High"
return {
"Score": total_score,
"Risk Level": risk_level,
"Reasons": "\n".join(reasons),
"Extracted URL": url if url else "None detected"
}
# --- Gradio Interface ---
iface = gr.Interface(
fn=analyze,
inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
outputs=[
gr.Textbox(label="Score"),
gr.Textbox(label="Risk Level"),
gr.Textbox(label="Reasons"),
gr.Textbox(label="Extracted URL")
],
title="Phishing / Scam Detector",
description="Analyzes text for urgency, fear, authority, and financial scam cues, plus suspicious URLs."
)
if __name__ == "__main__":
iface.launch()