# PhishGuard / app.py
# NOTE(review): the following Hugging Face Spaces page residue was removed
# from executable position ("elecie's picture", "Lazy load", commit 466b21e) —
# it is not Python and would raise a SyntaxError at import time.
import re
import tldextract
from rapidfuzz import fuzz
import gradio as gr
# --- Labels & Regex ---
LABELS = ["urgent", "fear", "authority", "financial scam", "safe"]
CUES = {
"urgency": [r"\burgent\b", r"\bimmediately\b", r"\bverify now\b", r"\blimited time\b"],
"fear": [r"\bsuspended\b", r"\block(ed)?\b", r"\blegal action\b", r"\bunauthorized\b"],
"authority": [r"\bCEO\b", r"\badmin\b", r"\bIT support\b", r"\bgovernment\b"],
"financial": [r"\bwin\b", r"\bprize\b", r"\blottery\b", r"\binvestment\b"]
}
TRUSTED_DOMAINS = ["google.com", "paypal.com", "microsoft.com", "amazon.com", "facebook.com", "apple.com"]
SUSPICIOUS_TLDS = ["xyz", "top", "tk", "gq", "cf", "ml"]
URL_PATTERN = re.compile(r"(https?://[^\s]+|www\.[^\s]+|\b[a-zA-Z0-9-]+\.[a-z]{2,}\b)")
# --- Lazy-load Hugging Face model ---
classifier = None
def get_classifier():
global classifier
if classifier is None:
from transformers import pipeline
classifier = pipeline(
"zero-shot-classification",
model="valhalla/distilbart-mnli-12-1",
device=-1 # CPU
)
return classifier
# --- Analysis functions ---
def regex_analysis(text):
findings = []
score = 0
for category, patterns in CUES.items():
for pat in patterns:
if re.search(pat, text, re.IGNORECASE):
findings.append(f"{category.capitalize()} cue detected: '{pat.strip(r'\\b')}'")
score += 20
return score, findings
def huggingface_analysis(text):
clf = get_classifier()
result = clf(text, LABELS)
label_scores = list(zip(result["labels"], result["scores"]))
label_scores.sort(key=lambda x: x[1], reverse=True)
top_two = label_scores[:2]
findings = [f"HuggingFace: {label} (confidence {score:.2f})" for label, score in top_two if label != "safe"]
hf_score = sum(int(score * 30) for label, score in top_two if label != "safe")
return hf_score, findings
def url_analysis(url):
findings = []
score = 0
ext = tldextract.extract(url)
domain = f"{ext.domain}.{ext.suffix}"
if not url.lower().startswith("https://"):
findings.append("Non-HTTPS connection detected")
score += 25
if ext.suffix in SUSPICIOUS_TLDS:
findings.append(f"Suspicious TLD detected: .{ext.suffix}")
score += 20
if re.match(r"^https?://\d+\.\d+\.\d+\.\d+", url):
findings.append("IP address used instead of domain")
score += 30
for trusted in TRUSTED_DOMAINS:
similarity = fuzz.ratio(domain, trusted)
if similarity > 80 and domain != trusted:
findings.append(f"Possible typosquatting: {domain} vs {trusted} (similarity {similarity}%)")
score += 30
break
return score, findings
def extract_url_from_text(text):
match = URL_PATTERN.search(text)
return match.group(0) if match else None
# --- Main analyze function for Gradio ---
def analyze(text):
regex_score, regex_findings = regex_analysis(text)
hf_score, hf_findings = huggingface_analysis(text)
text_score = regex_score + hf_score
url = extract_url_from_text(text)
url_score, url_findings = (0, [])
if url:
url_score, url_findings = url_analysis(url)
text_score = min(text_score, 100) * 0.4
url_score = min(url_score, 100) * 0.6
total_score = min(100, int(text_score + url_score))
reasons = regex_findings + hf_findings + url_findings
if total_score < 30:
risk_level = "Low"
elif total_score < 70:
risk_level = "Medium"
else:
risk_level = "High"
return {
"Score": total_score,
"Risk Level": risk_level,
"Reasons": "\n".join(reasons),
"Extracted URL": url if url else "None detected"
}
# --- Gradio Interface ---
iface = gr.Interface(
fn=analyze,
inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
outputs=[
gr.Textbox(label="Score"),
gr.Textbox(label="Risk Level"),
gr.Textbox(label="Reasons"),
gr.Textbox(label="Extracted URL")
],
title="Phishing / Scam Detector",
description="Analyzes text for urgency, fear, authority, and financial scam cues, plus suspicious URLs."
)
if __name__ == "__main__":
iface.launch()