File size: 4,302 Bytes
cfd1501
 
 
466b21e
4737fbd
466b21e
4737fbd
 
 
cfd1501
 
 
 
4737fbd
 
cfd1501
 
 
 
466b21e
 
 
 
 
 
 
 
 
 
 
 
 
 
cfd1501
466b21e
 
cfd1501
 
 
 
4737fbd
 
 
cfd1501
466b21e
 
cfd1501
 
 
 
 
 
 
 
 
 
466b21e
 
cfd1501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4737fbd
 
cfd1501
 
 
 
466b21e
cfd1501
53af713
cfd1501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7690858
cfd1501
 
466b21e
cfd1501
7690858
 
466b21e
cfd1501
 
466b21e
 
 
 
 
 
 
 
 
4737fbd
 
 
cfd1501
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import re
import tldextract
from rapidfuzz import fuzz
import gradio as gr

# --- Labels & Regex ---
# Candidate labels fed to the zero-shot classifier; "safe" is the benign class
# and is excluded from scoring downstream.
LABELS = ["urgent", "fear", "authority", "financial scam", "safe"]

# Keyword cue regexes grouped by manipulation tactic; each match contributes
# points in regex_analysis().
# NOTE(review): r"\block(ed)?\b" matches "lock"/"locked" — the \b escape
# consumes the literal 'b'. If "block(ed)" was intended, it should be
# r"\bblock(ed)?\b"; confirm intent before changing.
CUES = {
    "urgency": [r"\burgent\b", r"\bimmediately\b", r"\bverify now\b", r"\blimited time\b"],
    "fear": [r"\bsuspended\b", r"\block(ed)?\b", r"\blegal action\b", r"\bunauthorized\b"],
    "authority": [r"\bCEO\b", r"\badmin\b", r"\bIT support\b", r"\bgovernment\b"],
    "financial": [r"\bwin\b", r"\bprize\b", r"\blottery\b", r"\binvestment\b"]
}

# Well-known domains used as references for the typosquatting check.
TRUSTED_DOMAINS = ["google.com", "paypal.com", "microsoft.com", "amazon.com", "facebook.com", "apple.com"]
# Top-level domains frequently abused by throwaway phishing sites.
SUSPICIOUS_TLDS = ["xyz", "top", "tk", "gq", "cf", "ml"]
# Matches http(s) URLs, www-prefixed hosts, or bare domain-like tokens.
URL_PATTERN = re.compile(r"(https?://[^\s]+|www\.[^\s]+|\b[a-zA-Z0-9-]+\.[a-z]{2,}\b)")

# --- Lazy-load Hugging Face model ---
# Module-level cache: the pipeline is built once on first use so importing
# this file stays fast and transformers is only loaded when needed.
classifier = None
def get_classifier():
    """Return the shared zero-shot classification pipeline, creating it on first call."""
    global classifier
    if classifier is None:
        # Imported lazily so the heavy transformers dependency (and model
        # download) is deferred until the first classification request.
        from transformers import pipeline
        classifier = pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1",
            device=-1  # CPU
        )
    return classifier

# --- Analysis functions ---
def regex_analysis(text, cues=None):
    """Score *text* against keyword cue regexes.

    Args:
        text: The message text to scan.
        cues: Optional mapping of category name -> list of regex patterns.
            Defaults to the module-level CUES table.

    Returns:
        (score, findings): `score` is 20 points per matched pattern
        (case-insensitive), and `findings` is a list of human-readable
        messages, one per match.
    """
    if cues is None:
        cues = CUES
    findings = []
    score = 0
    for category, patterns in cues.items():
        for pat in patterns:
            if re.search(pat, text, re.IGNORECASE):
                # BUG FIX: the original used pat.strip(r'\b'), which strips
                # the *characters* '\' and 'b' from the ends and leaks regex
                # syntax for patterns like r"\block(ed)?\b" (-> "lock(ed)?").
                # Removing the literal \b escape sequences is what was meant.
                cue = re.sub(r"\\b", "", pat)
                findings.append(f"{category.capitalize()} cue detected: '{cue}'")
                score += 20
    return score, findings

def huggingface_analysis(text):
    """Run zero-shot classification over *text* and score the risky labels.

    Returns:
        (hf_score, findings): up to 30 points per risky label among the two
        highest-confidence predictions ("safe" is ignored), plus matching
        human-readable messages.
    """
    result = get_classifier()(text, LABELS)
    ranked = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Only the two most confident labels matter; "safe" contributes nothing.
    risky = [(label, conf) for label, conf in ranked[:2] if label != "safe"]
    findings = [f"HuggingFace: {label} (confidence {conf:.2f})" for label, conf in risky]
    hf_score = sum(int(conf * 30) for _, conf in risky)

    return hf_score, findings

def url_analysis(url):
    """Apply heuristic risk checks to a single URL.

    Checks: missing HTTPS, suspicious TLD, raw-IP host, and fuzzy similarity
    to a trusted domain (typosquatting). Returns (score, findings).
    """
    ext = tldextract.extract(url)
    domain = f"{ext.domain}.{ext.suffix}"

    # Collect (points, message) pairs; order matches the checks below.
    hits = []

    if not url.lower().startswith("https://"):
        hits.append((25, "Non-HTTPS connection detected"))

    if ext.suffix in SUSPICIOUS_TLDS:
        hits.append((20, f"Suspicious TLD detected: .{ext.suffix}"))

    if re.match(r"^https?://\d+\.\d+\.\d+\.\d+", url):
        hits.append((30, "IP address used instead of domain"))

    # Flag at most one near-miss against the trusted-domain list.
    for trusted in TRUSTED_DOMAINS:
        similarity = fuzz.ratio(domain, trusted)
        if similarity > 80 and domain != trusted:
            hits.append((30, f"Possible typosquatting: {domain} vs {trusted} (similarity {similarity}%)"))
            break

    score = sum(points for points, _ in hits)
    findings = [message for _, message in hits]
    return score, findings

def extract_url_from_text(text):
    """Return the first URL-like substring found in *text*, or None if absent."""
    m = URL_PATTERN.search(text)
    if m:
        return m.group(0)
    return None

# --- Main analyze function for Gradio ---
def analyze(text):
    """Analyze *text* for phishing signals and return the four UI outputs.

    Combines regex cue matching, zero-shot classification, and URL heuristics
    into a weighted 0-100 score (text signals capped at 40 points, URL
    signals at 60).

    Returns:
        A 4-tuple matching the interface's output components, in order:
        (total score, risk level, newline-joined reasons, extracted URL).
    """
    regex_score, regex_findings = regex_analysis(text)
    hf_score, hf_findings = huggingface_analysis(text)
    text_score = regex_score + hf_score

    url = extract_url_from_text(text)
    url_score, url_findings = (0, [])
    if url:
        url_score, url_findings = url_analysis(url)

    # Weighted blend: text contributes at most 40 points, URL at most 60.
    text_score = min(text_score, 100) * 0.4
    url_score = min(url_score, 100) * 0.6
    total_score = min(100, int(text_score + url_score))

    reasons = regex_findings + hf_findings + url_findings

    if total_score < 30:
        risk_level = "Low"
    elif total_score < 70:
        risk_level = "Medium"
    else:
        risk_level = "High"

    # BUG FIX: the interface declares four output Textboxes, so the handler
    # must return four positional values. The original returned a str-keyed
    # dict, which Gradio rejects at runtime (dict returns must be keyed by
    # component objects, not strings).
    return (
        total_score,
        risk_level,
        "\n".join(reasons),
        url if url else "None detected",
    )

# --- Gradio Interface ---
# Single-textbox input, four textbox outputs; `analyze` must supply one value
# per output component, in this order.
iface = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
    outputs=[
        gr.Textbox(label="Score"),
        gr.Textbox(label="Risk Level"),
        gr.Textbox(label="Reasons"),
        gr.Textbox(label="Extracted URL")
    ],
    title="Phishing / Scam Detector",
    description="Analyzes text for urgency, fear, authority, and financial scam cues, plus suspicious URLs."
)

if __name__ == "__main__":
    # Launch the Gradio web UI only when run as a script, not on import.
    iface.launch()