Spaces:

ai-assist-sh
/

phishing-detector

Sleeping

App Files Files Community

ai-assist-sh committed on Aug 19

Commit

8547a14

verified ·

1 Parent(s): d4166be

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -5

app.py CHANGED Viewed

@@ -1,8 +1,68 @@
-import gradio as gr, sys
-def ping(name):
-    import gradio as g
-    return f"Hello, {name or 'world'}! (gradio {g.__version__}, py {sys.version.split()[0]})"
-demo = gr.Interface(ping, "text", "text", title="Ping")
 if __name__ == "__main__":
     demo.launch()

+import os, re
+import gradio as gr
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+URL_MODEL_ID = "CrabInHoney/urlbert-tiny-v4-malicious-url-classifier"
+URL_LABEL_MAP = {"LABEL_0":"benign","LABEL_1":"defacement","LABEL_2":"malware","LABEL_3":"phishing"}
+URL_RE = re.compile(r"""(?xi)\b(?:https?://|www\.)[a-z0-9\-._~%]+(?:/[^\s<>"']*)?""")
+_pipe = None  # created on first analyze()
+def _extract_urls(t: str):
+    return sorted(set(m.group(0) for m in URL_RE.finditer(t or "")))
+def _pretty(raw, id2label):
+    if id2label:
+        if raw in id2label: return id2label[raw]
+        k = raw.replace("LABEL_","")
+        if k in id2label: return id2label[k]
+    return URL_LABEL_MAP.get(raw, raw)
+def analyze(text: str):
+    text = (text or "").strip()
+    if not text:
+        return "Paste an email body or a URL.", "", "", []
+    urls = [text] if (text.lower().startswith(("http://","https://","www.")) and " " not in text) else _extract_urls(text)
+    if not urls:
+        return "No URLs detected in the text.", "", "", []
+    global _pipe
+    if _pipe is None:
+        from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+        tok = AutoTokenizer.from_pretrained(URL_MODEL_ID)
+        mdl = AutoModelForSequenceClassification.from_pretrained(URL_MODEL_ID)
+        _pipe = pipeline("text-classification", model=mdl, tokenizer=tok, device=-1, top_k=None)
+    id2label = getattr(_pipe.model.config, "id2label", None)
+    rows, unsafe, top_label, top_conf = [], False, "", ""
+    for i, u in enumerate(urls, 1):
+        scores = sorted(_pipe(u)[0], key=lambda s: s["score"], reverse=True)
+        top = scores[0]
+        lbl = _pretty(top["label"], id2label)
+        conf = round(100*float(top["score"]), 2)
+        rows.append([u, lbl, conf])
+        if i == 1:
+            top_label, top_conf = lbl, f"{conf:.2f}%"
+        if lbl.lower() in {"phishing","malware","defacement"}:
+            unsafe = True
+    verdict = "🔴 UNSAFE (links flagged)" if unsafe else "🟢 SAFE (all links benign)"
+    return verdict, top_label, top_conf, rows
+demo = gr.Interface(
+    fn=analyze,
+    inputs=gr.Textbox(lines=6, label="Email or URL"),
+    outputs=[
+        gr.Markdown(label="Verdict"),
+        gr.Textbox(label="Prediction", interactive=False),
+        gr.Textbox(label="Confidence", interactive=False),
+        gr.Dataframe(headers=["URL","Prediction","Confidence (%)"], datatype=["str","str","number"],
+                     row_count=(0,"dynamic"), col_count=(3,"fixed"), interactive=False, label="Per-link results")
+    ],
+    title="🛡️ Phishing Detector (via Link Analysis)",
+    description="We extract links and classify each with a compact malicious-URL model."
+)
 if __name__ == "__main__":
     demo.launch()