File size: 4,302 Bytes
cfd1501
 
 
466b21e
4737fbd
466b21e
4737fbd
 
 
cfd1501
 
 
 
4737fbd
 
cfd1501
 
 
 
466b21e
 
 
 
 
 
 
 
 
 
 
 
 
 
cfd1501
466b21e
 
cfd1501
 
 
 
4737fbd
 
 
cfd1501
466b21e
 
cfd1501
 
 
 
 
 
 
 
 
 
466b21e
 
cfd1501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4737fbd
 
cfd1501
 
 
 
466b21e
cfd1501
53af713
cfd1501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7690858
cfd1501
 
466b21e
cfd1501
7690858
 
466b21e
cfd1501
 
466b21e
 
 
 
 
 
 
 
 
4737fbd
 
 
cfd1501
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import re
import tldextract
from rapidfuzz import fuzz
import gradio as gr

# --- Labels & Regex ---
# Candidate labels fed to the zero-shot classifier; "safe" is the benign class
# and is excluded from scoring downstream.
LABELS = ["urgent", "fear", "authority", "financial scam", "safe"]

# Keyword cue regexes grouped by manipulation tactic; each match contributes
# points in regex_analysis().
# NOTE(review): r"\block(ed)?\b" matches "lock"/"locked" — the \b escape
# consumes the literal 'b'. If "block(ed)" was intended, it should be
# r"\bblock(ed)?\b"; confirm intent before changing.
CUES = {
    "urgency": [r"\burgent\b", r"\bimmediately\b", r"\bverify now\b", r"\blimited time\b"],
    "fear": [r"\bsuspended\b", r"\block(ed)?\b", r"\blegal action\b", r"\bunauthorized\b"],
    "authority": [r"\bCEO\b", r"\badmin\b", r"\bIT support\b", r"\bgovernment\b"],
    "financial": [r"\bwin\b", r"\bprize\b", r"\blottery\b", r"\binvestment\b"]
}

# Well-known domains used as references for the typosquatting check.
TRUSTED_DOMAINS = ["google.com", "paypal.com", "microsoft.com", "amazon.com", "facebook.com", "apple.com"]
# Top-level domains frequently abused by throwaway phishing sites.
SUSPICIOUS_TLDS = ["xyz", "top", "tk", "gq", "cf", "ml"]
# Matches http(s) URLs, www-prefixed hosts, or bare domain-like tokens.
URL_PATTERN = re.compile(r"(https?://[^\s]+|www\.[^\s]+|\b[a-zA-Z0-9-]+\.[a-z]{2,}\b)")

# --- Lazy-load Hugging Face model ---
# Module-level cache: the pipeline is built once on first use so importing
# this file stays fast and transformers is only loaded when needed.
classifier = None
def get_classifier():
    """Return the shared zero-shot classification pipeline, creating it on first call."""
    global classifier
    if classifier is None:
        # Imported lazily so the heavy transformers dependency (and model
        # download) is deferred until the first classification request.
        from transformers import pipeline
        classifier = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",
            device=-1  # CPU
        )
    return classifier

# --- Analysis functions ---
def regex_analysis(text, cues=None):
    """Score *text* against keyword cue regexes.

    Args:
        text: The message text to scan.
        cues: Optional mapping of category name -> list of regex patterns.
            Defaults to the module-level CUES table.

    Returns:
        (score, findings): `score` is 20 points per matched pattern
        (case-insensitive), and `findings` is a list of human-readable
        messages, one per match.
    """
    if cues is None:
        cues = CUES
    findings = []
    score = 0
    for category, patterns in cues.items():
        for pat in patterns:
            if re.search(pat, text, re.IGNORECASE):
                # BUG FIX: the original used pat.strip(r'\b'), which strips
                # the *characters* '\' and 'b' from the ends and leaks regex
                # syntax for patterns like r"\block(ed)?\b" (-> "lock(ed)?").
                # Removing the literal \b escape sequences is what was meant.
                cue = re.sub(r"\\b", "", pat)
                findings.append(f"{category.capitalize()} cue detected: '{cue}'")
                score += 20
    return score, findings

def huggingface_analysis(text):
    """Run zero-shot classification over *text* and score the risky labels.

    Returns:
        (hf_score, findings): up to 30 points per risky label among the two
        highest-confidence predictions ("safe" is ignored), plus matching
        human-readable messages.
    """
    result = get_classifier()(text, LABELS)
    ranked = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Only the two most confident labels matter; "safe" contributes nothing.
    risky = [(label, conf) for label, conf in ranked[:2] if label != "safe"]
    findings = [f"HuggingFace: {label} (confidence {conf:.2f})" for label, conf in risky]
    hf_score = sum(int(conf * 30) for _, conf in risky)

    return hf_score, findings

def url_analysis(url):
    """Apply heuristic risk checks to a single URL.

    Checks: missing HTTPS, suspicious TLD, raw-IP host, and fuzzy similarity
    to a trusted domain (typosquatting). Returns (score, findings).
    """
    ext = tldextract.extract(url)
    domain = f"{ext.domain}.{ext.suffix}"

    # Collect (points, message) pairs; order matches the checks below.
    hits = []

    if not url.lower().startswith("https://"):
        hits.append((25, "Non-HTTPS connection detected"))

    if ext.suffix in SUSPICIOUS_TLDS:
        hits.append((20, f"Suspicious TLD detected: .{ext.suffix}"))

    if re.match(r"^https?://\d+\.\d+\.\d+\.\d+", url):
        hits.append((30, "IP address used instead of domain"))

    # Flag at most one near-miss against the trusted-domain list.
    for trusted in TRUSTED_DOMAINS:
        similarity = fuzz.ratio(domain, trusted)
        if similarity > 80 and domain != trusted:
            hits.append((30, f"Possible typosquatting: {domain} vs {trusted} (similarity {similarity}%)"))
            break

    score = sum(points for points, _ in hits)
    findings = [message for _, message in hits]
    return score, findings

def extract_url_from_text(text):
    """Return the first URL-like substring found in *text*, or None if absent."""
    m = URL_PATTERN.search(text)
    if m:
        return m.group(0)
    return None

# --- Main analyze function for Gradio ---
def analyze(text):
    """Analyze *text* for phishing signals and return the four UI outputs.

    Combines regex cue matching, zero-shot classification, and URL heuristics
    into a weighted 0-100 score (text signals capped at 40 points, URL
    signals at 60).

    Returns:
        A 4-tuple matching the interface's output components, in order:
        (total score, risk level, newline-joined reasons, extracted URL).
    """
    regex_score, regex_findings = regex_analysis(text)
    hf_score, hf_findings = huggingface_analysis(text)
    text_score = regex_score + hf_score

    url = extract_url_from_text(text)
    url_score, url_findings = (0, [])
    if url:
        url_score, url_findings = url_analysis(url)

    # Weighted blend: text contributes at most 40 points, URL at most 60.
    text_score = min(text_score, 100) * 0.4
    url_score = min(url_score, 100) * 0.6
    total_score = min(100, int(text_score + url_score))

    reasons = regex_findings + hf_findings + url_findings

    if total_score < 30:
        risk_level = "Low"
    elif total_score < 70:
        risk_level = "Medium"
    else:
        risk_level = "High"

    # BUG FIX: the interface declares four output Textboxes, so the handler
    # must return four positional values. The original returned a str-keyed
    # dict, which Gradio rejects at runtime (dict returns must be keyed by
    # component objects, not strings).
    return (
        total_score,
        risk_level,
        "\n".join(reasons),
        url if url else "None detected",
    )

# --- Gradio Interface ---
# Single-textbox input, four textbox outputs; `analyze` must supply one value
# per output component, in this order.
iface = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
    outputs=[
        gr.Textbox(label="Score"),
        gr.Textbox(label="Risk Level"),
        gr.Textbox(label="Reasons"),
        gr.Textbox(label="Extracted URL")
    ],
    title="Phishing / Scam Detector",
    description="Analyzes text for urgency, fear, authority, and financial scam cues, plus suspicious URLs."
)

if __name__ == "__main__":
    # Launch the Gradio web UI only when run as a script, not on import.
    iface.launch()