import re
import tldextract
from rapidfuzz import fuzz
import gradio as gr
# --- Labels & Regex ---
# Candidate labels handed to the zero-shot classifier in huggingface_analysis.
LABELS = [
    "urgent",
    "fear",
    "authority",
    "financial scam",
    "safe",
]

# Regex cues grouped by manipulation tactic. regex_analysis searches each
# pattern case-insensitively.
CUES = {
    "urgency": [
        r"\burgent\b",
        r"\bimmediately\b",
        r"\bverify now\b",
        r"\blimited time\b",
    ],
    "fear": [
        r"\bsuspended\b",
        r"\block(ed)?\b",
        r"\blegal action\b",
        r"\bunauthorized\b",
    ],
    "authority": [
        r"\bCEO\b",
        r"\badmin\b",
        r"\bIT support\b",
        r"\bgovernment\b",
    ],
    "financial": [
        r"\bwin\b",
        r"\bprize\b",
        r"\blottery\b",
        r"\binvestment\b",
    ],
}

# Registered domains that url_analysis fuzzy-matches against to flag
# look-alike (typosquatted) domains.
TRUSTED_DOMAINS = [
    "google.com",
    "paypal.com",
    "microsoft.com",
    "amazon.com",
    "facebook.com",
    "apple.com",
]

# Public suffixes that url_analysis treats as suspicious.
SUSPICIOUS_TLDS = [
    "xyz",
    "top",
    "tk",
    "gq",
    "cf",
    "ml",
]

# Three alternatives, tried in order at each position: an http(s) URL, a
# "www."-prefixed host, or a bare "name.tld" token.
URL_PATTERN = re.compile(
    r"(https?://[^\s]+"
    r"|www\.[^\s]+"
    r"|\b[a-zA-Z0-9-]+\.[a-z]{2,}\b)"
)
# --- Lazy-load Hugging Face model ---
# Process-wide cache for the zero-shot pipeline; filled on first use.
classifier = None


def get_classifier():
    """Return the shared zero-shot classification pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``classifier`` global; later calls return that same object. The
    ``transformers`` import is deferred to the first call so that importing
    this module does not load the library.
    """
    global classifier
    if classifier is not None:
        return classifier

    from transformers import pipeline

    classifier = pipeline(
        "zero-shot-classification",
        model="valhalla/distilbart-mnli-12-1",
        device=-1,  # CPU
    )
    return classifier
# --- Analysis functions ---
def _strip_word_boundaries(pattern):
    """Return pattern without one leading and one trailing word-boundary anchor."""
    anchor = r"\b"
    if pattern.startswith(anchor):
        pattern = pattern[len(anchor):]
    if pattern.endswith(anchor):
        pattern = pattern[:-len(anchor)]
    return pattern


def regex_analysis(text, cues=None):
    """Scan text for social-engineering cue patterns.

    Args:
        text: The message to inspect. Empty or ``None`` yields no findings.
        cues: Optional mapping of category name to a list of regex patterns.
            Defaults to the module-level ``CUES`` table.

    Returns:
        A ``(score, findings)`` tuple. Each matching pattern adds 20 to
        ``score`` and one human-readable line to ``findings``.
    """
    if cues is None:
        cues = CUES

    findings = []
    score = 0
    if not text:
        return score, findings

    for category, patterns in cues.items():
        for pat in patterns:
            if re.search(pat, text, re.IGNORECASE):
                # Build the display form outside the f-string: a backslash
                # inside an f-string expression is a SyntaxError before
                # Python 3.12, and str.strip() with a character set would also
                # eat real leading/trailing "b" letters of the cue itself.
                cue = _strip_word_boundaries(pat)
                findings.append(f"{category.capitalize()} cue detected: '{cue}'")
                score += 20
    return score, findings
def huggingface_analysis(text):
    """Score text with the zero-shot classifier over ``LABELS``.

    Args:
        text: The message to classify.

    Returns:
        A ``(score, findings)`` tuple. Only the two highest-scoring labels are
        considered; each of them that is not ``"safe"`` contributes
        ``int(confidence * 30)`` to ``score`` and one line to ``findings``.
        Blank input returns ``(0, [])`` without touching the model.
    """
    # Nothing to classify: skip loading and running the model entirely.
    if not text or not text.strip():
        return 0, []

    clf = get_classifier()
    result = clf(text, LABELS)

    # Rank explicitly by confidence rather than relying on the order in which
    # the pipeline returns its labels.
    ranked = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    risky = [(label, conf) for label, conf in ranked[:2] if label != "safe"]

    findings = [
        f"HuggingFace: {label} (confidence {conf:.2f})" for label, conf in risky
    ]
    hf_score = sum(int(conf * 30) for _, conf in risky)
    return hf_score, findings
def url_analysis(url):
    """Score a single URL for common phishing indicators.

    Checks, in order: missing ``https://`` prefix (+25), a public suffix
    listed in ``SUSPICIOUS_TLDS`` (+20), an IPv4-style host directly after
    the scheme (+30), and a registered domain that is similar to, but not
    equal to, an entry of ``TRUSTED_DOMAINS`` (+30, first hit only).

    Returns:
        A ``(score, findings)`` tuple with the summed points and one
        human-readable line per triggered check.
    """
    findings = []
    score = 0

    def flag(message, points):
        # Record one triggered check and add its weight to the running total.
        nonlocal score
        findings.append(message)
        score += points

    parts = tldextract.extract(url)
    suffix = parts.suffix
    domain = f"{parts.domain}.{suffix}"

    if not url.lower().startswith("https://"):
        flag("Non-HTTPS connection detected", 25)

    if suffix in SUSPICIOUS_TLDS:
        flag(f"Suspicious TLD detected: .{suffix}", 20)

    if re.match(r"^https?://\d+\.\d+\.\d+\.\d+", url):
        flag("IP address used instead of domain", 30)

    for trusted in TRUSTED_DOMAINS:
        if domain == trusted:
            # An exact match is the genuine domain, not a look-alike.
            continue
        similarity = fuzz.ratio(domain, trusted)
        if similarity > 80:
            flag(
                f"Possible typosquatting: {domain} vs {trusted} (similarity {similarity}%)",
                30,
            )
            break

    return score, findings
def extract_url_from_text(text):
    """Return the first substring of text matched by ``URL_PATTERN``, or ``None``."""
    found = URL_PATTERN.search(text)
    if found is None:
        return None
    return found.group(0)
# --- Main analyze function for Gradio ---
def analyze(text):
    """Combine text and URL analysis into a single risk report.

    The text score (regex cues plus classifier) is capped at 100 and weighted
    0.4; the score of the first URL found in the text, if any, is capped at
    100 and weighted 0.6. The truncated sum, capped at 100, is mapped to
    "Low" (< 30), "Medium" (< 70) or "High".

    Returns:
        A dict with the keys "Score", "Risk Level", "Reasons" (findings
        joined by newlines) and "Extracted URL".
    """
    cue_points, cue_reasons = regex_analysis(text)
    model_points, model_reasons = huggingface_analysis(text)

    url = extract_url_from_text(text)
    if url:
        link_points, link_reasons = url_analysis(url)
    else:
        link_points, link_reasons = 0, []

    weighted_text = min(cue_points + model_points, 100) * 0.4
    weighted_url = min(link_points, 100) * 0.6
    total_score = min(100, int(weighted_text + weighted_url))

    if total_score >= 70:
        risk_level = "High"
    elif total_score >= 30:
        risk_level = "Medium"
    else:
        risk_level = "Low"

    reasons = cue_reasons + model_reasons + link_reasons
    return {
        "Score": total_score,
        "Risk Level": risk_level,
        "Reasons": "\n".join(reasons),
        "Extracted URL": url or "None detected",
    }
# --- Gradio Interface ---
def _analyze_for_ui(text):
    """Adapt analyze()'s dict report to the interface's four output boxes.

    The interface declares four output components, so the callback must
    return four values in the same order; a dict keyed by plain strings does
    not map onto them.
    """
    report = analyze(text)
    return (
        str(report["Score"]),
        report["Risk Level"],
        report["Reasons"],
        report["Extracted URL"],
    )


iface = gr.Interface(
    fn=_analyze_for_ui,
    inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
    # Order must match the tuple returned by _analyze_for_ui.
    outputs=[
        gr.Textbox(label="Score"),
        gr.Textbox(label="Risk Level"),
        gr.Textbox(label="Reasons"),
        gr.Textbox(label="Extracted URL"),
    ],
    title="Phishing / Scam Detector",
    description="Analyzes text for urgency, fear, authority, and financial scam cues, plus suspicious URLs.",
)

if __name__ == "__main__":
    iface.launch()
|