Spaces:
Sleeping
Sleeping
| import os, re | |
| import gradio as gr | |
# Keep Transformers quiet & CPU-only friendly
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# -------- Config --------
URL_MODEL_ID = "CrabInHoney/urlbert-tiny-v4-malicious-url-classifier"

# Fallback display names for generic LABEL_n class ids.
URL_LABEL_MAP = {
    "LABEL_0": "benign",
    "LABEL_1": "defacement",
    "LABEL_2": "malware",
    "LABEL_3": "phishing",
}

# Finds URLs embedded in free text. Userinfo, port and a query/fragment placed
# directly after the host are part of the match; without them a link such as
# http://paypal.com@evil.example/ or http://host:8080/x was cut off at the
# first host-like run and the classifier saw a different URL.
URL_RE = re.compile(
    r"""
    \b(?:https?://|www\.)              # scheme, or a bare www. prefix
    (?:[a-z0-9\-._~%!$&*+,;=:]+@)?     # optional userinfo, e.g. user:pass@
    [a-z0-9\-._~%]+                    # host
    (?::\d+)?                          # optional :port
    (?:
        /[^\s<>"']*                    # path, with any query/fragment after it
      | [?\#][^\s<>"']+                # query or fragment directly after the host
    )?
    """,
    re.IGNORECASE | re.VERBOSE,
)

_pipe = None  # created on first analyze()
def _strip_trailing_punct(url: str) -> str:
    """Drop sentence punctuation and unbalanced closing brackets from the end of *url*."""
    openers = {")": "(", "]": "[", "}": "{"}
    while url:
        last = url[-1]
        if last in ".,;:!?":
            url = url[:-1]
        elif last in openers and url.count(last) > url.count(openers[last]):
            # A closer without a matching opener belongs to the surrounding
            # prose, e.g. "(see http://example.com/a)".
            url = url[:-1]
        else:
            break
    return url


def _extract_urls(t: str):
    """Return the distinct URLs found in *t*, sorted alphabetically.

    Each regex match is trimmed of trailing sentence punctuation so that
    "Visit http://example.com/login." yields "http://example.com/login".
    A ``None`` or empty *t* yields an empty list.
    """
    urls = set()
    for match in URL_RE.finditer(t or ""):
        url = _strip_trailing_punct(match.group(0))
        # Trimming can leave a bare scheme/prefix; keep only what is still a URL.
        if URL_RE.match(url):
            urls.add(url)
    return sorted(urls)
def _pretty(raw, id2label):
    """Translate a raw classifier label into a human-readable class name.

    Args:
        raw: Label string as emitted by the classifier, e.g. "LABEL_2".
        id2label: Optional mapping taken from the model config; may be None
            or empty. Keys may be the raw label, the numeric suffix as a
            string, or the numeric suffix as an int.

    Returns:
        The name from *id2label* when it offers one that differs from *raw*;
        otherwise the URL_LABEL_MAP entry for *raw*, or *raw* itself.
    """
    if id2label:
        suffix = raw.replace("LABEL_", "")
        candidates = [raw, suffix]
        if suffix.isdigit():
            # Int keys were never tried before, so an int-keyed mapping
            # could not match at all.
            candidates.append(int(suffix))
        for key in candidates:
            # An entry that just echoes the raw label carries no information;
            # skip it so the hard-coded fallback below can still apply.
            if key in id2label and id2label[key] != raw:
                return id2label[key]
    return URL_LABEL_MAP.get(raw, raw)
def analyze(text: str) -> str:
    """Classify every URL in *text* and return a Markdown report.

    Args:
        text: A single URL, or free text (such as an email body) that may
            contain URLs. ``None`` is treated like an empty string.

    Returns:
        A prompt message when *text* is blank, a notice when no URL is found,
        otherwise a verdict line followed by one bullet per URL with its top
        label and confidence. The verdict is UNSAFE as soon as any URL's top
        label is phishing, malware or defacement.
    """
    text = (text or "").strip()
    if not text:
        return "Paste an email body or a URL."

    # Use single-URL mode if it looks like one; else extract from email text.
    # "Looks like one" means a single whitespace-free token: testing only for
    # " " let newline- or tab-separated lists of URLs be scored as one URL.
    looks_like_single_url = (
        text.lower().startswith(("http://", "https://", "www."))
        and len(text.split()) == 1
    )
    urls = [text] if looks_like_single_url else _extract_urls(text)
    if not urls:
        return "No URLs detected in the text."

    # Lazy import + pipeline creation keeps startup instant
    global _pipe
    if _pipe is None:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

        tok = AutoTokenizer.from_pretrained(URL_MODEL_ID)
        mdl = AutoModelForSequenceClassification.from_pretrained(URL_MODEL_ID)
        # device=-1 selects CPU; top_k=None asks for the score of every label.
        _pipe = pipeline("text-classification", model=mdl, tokenizer=tok, device=-1, top_k=None)
    id2label = getattr(_pipe.model.config, "id2label", None)

    lines = []
    unsafe = False
    for u in urls:
        # truncation=True caps the input at the tokenizer's maximum length so
        # a very long URL is scored on its prefix instead of being passed to
        # the model whole.
        scores = sorted(_pipe(u, truncation=True)[0], key=lambda s: s["score"], reverse=True)
        top = scores[0]
        lbl = _pretty(top["label"], id2label)
        conf = 100 * float(top["score"])
        lines.append(f"- **{u}** → **{lbl}** ({conf:.2f}%)")
        if lbl.lower() in {"phishing", "malware", "defacement"}:
            unsafe = True

    verdict = "🔴 **UNSAFE (links flagged)**" if unsafe else "🟢 **SAFE (all links benign)**"
    return verdict + "\n\n" + "\n".join(lines)
# Gradio front end: the textbox content is passed to analyze() and the
# Markdown string it returns is rendered in the result pane.
demo = gr.Interface(
    analyze,
    gr.Textbox(
        label="Email or URL",
        placeholder="Paste a URL or a full email…",
        lines=6,
    ),
    gr.Markdown(label="Result"),
    description="We extract links and classify each with a compact malicious-URL model (CPU-only, free tier).",
    title="🛡️ Phishing Detector (via Link Analysis)",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()