"""Phishing / scam detector.

Scores a piece of text by combining three signals:
  * regex cue matching (urgency / fear / authority / financial keywords),
  * a lazily loaded Hugging Face zero-shot classifier,
  * heuristic analysis of the first URL found in the text.

Exposes a Gradio interface with a single text input and four text outputs.
"""

import re

import gradio as gr
import tldextract
from rapidfuzz import fuzz

# --- Labels & Regex ---
LABELS = ["urgent", "fear", "authority", "financial scam", "safe"]
CUES = {
    "urgency": [r"\burgent\b", r"\bimmediately\b", r"\bverify now\b", r"\blimited time\b"],
    "fear": [r"\bsuspended\b", r"\block(ed)?\b", r"\blegal action\b", r"\bunauthorized\b"],
    "authority": [r"\bCEO\b", r"\badmin\b", r"\bIT support\b", r"\bgovernment\b"],
    "financial": [r"\bwin\b", r"\bprize\b", r"\blottery\b", r"\binvestment\b"],
}
TRUSTED_DOMAINS = ["google.com", "paypal.com", "microsoft.com", "amazon.com", "facebook.com", "apple.com"]
SUSPICIOUS_TLDS = ["xyz", "top", "tk", "gq", "cf", "ml"]
URL_PATTERN = re.compile(r"(https?://[^\s]+|www\.[^\s]+|\b[a-zA-Z0-9-]+\.[a-z]{2,}\b)")

# Matches the literal ``\b`` word-boundary marker so cue patterns can be
# reported in human-readable form (e.g. r"\burgent\b" -> "urgent").
_BOUNDARY_MARKER = re.compile(r"\\b")

# --- Lazy-load Hugging Face model ---
classifier = None


def get_classifier():
    """Return the zero-shot classification pipeline, creating it on first use.

    The transformers import is deferred so the module stays importable
    (and fast to import) when the model is never needed.
    """
    global classifier
    if classifier is None:
        from transformers import pipeline
        classifier = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",
            device=-1,  # CPU
        )
    return classifier


# --- Analysis functions ---
def regex_analysis(text):
    """Scan *text* for scam cue patterns.

    Each case-insensitive pattern match adds 20 points and one
    human-readable finding.

    Returns:
        tuple[int, list[str]]: (score, findings).
    """
    findings = []
    score = 0
    for category, patterns in CUES.items():
        for pat in patterns:
            if re.search(pat, text, re.IGNORECASE):
                # Strip the \b markers so the reported cue reads naturally.
                cue = _BOUNDARY_MARKER.sub("", pat)
                findings.append(f"{category.capitalize()} cue detected: '{cue}'")
                score += 20
    return score, findings


def huggingface_analysis(text):
    """Classify *text* against LABELS with the zero-shot model.

    The two highest-confidence labels contribute up to 30 points each
    (confidence * 30); the "safe" label contributes nothing.

    Returns:
        tuple[int, list[str]]: (score, findings).
    """
    clf = get_classifier()
    result = clf(text, LABELS)
    label_scores = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_two = label_scores[:2]
    findings = [
        f"HuggingFace: {label} (confidence {score:.2f})"
        for label, score in top_two
        if label != "safe"
    ]
    hf_score = sum(int(score * 30) for label, score in top_two if label != "safe")
    return hf_score, findings


def url_analysis(url):
    """Apply heuristic risk checks to a single URL.

    Checks: missing HTTPS (+25), suspicious TLD (+20), raw IP address
    instead of a domain (+30), and fuzzy similarity to a trusted domain
    suggesting typosquatting (+30, first match only).

    Returns:
        tuple[int, list[str]]: (score, findings).
    """
    findings = []
    score = 0
    ext = tldextract.extract(url)
    domain = f"{ext.domain}.{ext.suffix}"
    if not url.lower().startswith("https://"):
        findings.append("Non-HTTPS connection detected")
        score += 25
    if ext.suffix in SUSPICIOUS_TLDS:
        findings.append(f"Suspicious TLD detected: .{ext.suffix}")
        score += 20
    if re.match(r"^https?://\d+\.\d+\.\d+\.\d+", url):
        findings.append("IP address used instead of domain")
        score += 30
    for trusted in TRUSTED_DOMAINS:
        similarity = fuzz.ratio(domain, trusted)
        # Near-identical but not equal -> likely typosquat; report once.
        if similarity > 80 and domain != trusted:
            findings.append(
                f"Possible typosquatting: {domain} vs {trusted} (similarity {similarity}%)"
            )
            score += 30
            break
    return score, findings


def extract_url_from_text(text):
    """Return the first URL-like substring in *text*, or None."""
    match = URL_PATTERN.search(text)
    return match.group(0) if match else None


# --- Main analyze function for Gradio ---
def analyze(text):
    """Score *text* and return (score, risk level, reasons, extracted URL).

    Returns a 4-tuple in the same order as the Gradio output components.
    (Returning a string-keyed dict does not work with a list of output
    components — Gradio maps multiple outputs positionally.)
    """
    regex_score, regex_findings = regex_analysis(text)
    hf_score, hf_findings = huggingface_analysis(text)
    text_score = regex_score + hf_score
    url = extract_url_from_text(text)
    url_score, url_findings = (0, [])
    if url:
        url_score, url_findings = url_analysis(url)
        # Blend: 40% text signal, 60% URL signal, each capped at 100.
        text_score = min(text_score, 100) * 0.4
        url_score = min(url_score, 100) * 0.6
    else:
        # No URL: let the text signal carry full weight, otherwise a
        # text-only scam could never reach the "High" risk level.
        text_score = min(text_score, 100)
    total_score = min(100, int(text_score + url_score))
    reasons = regex_findings + hf_findings + url_findings
    if total_score < 30:
        risk_level = "Low"
    elif total_score < 70:
        risk_level = "Medium"
    else:
        risk_level = "High"
    return (
        total_score,
        risk_level,
        "\n".join(reasons),
        url if url else "None detected",
    )


# --- Gradio Interface ---
iface = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
    outputs=[
        gr.Textbox(label="Score"),
        gr.Textbox(label="Risk Level"),
        gr.Textbox(label="Reasons"),
        gr.Textbox(label="Extracted URL"),
    ],
    title="Phishing / Scam Detector",
    description="Analyzes text for urgency, fear, authority, and financial scam cues, plus suspicious URLs.",
)

if __name__ == "__main__":
    iface.launch()