Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Community

Perth0603 committed on Nov 3, 2025

Commit

711ac8e

verified ·

1 Parent(s): 9dfcd1b

Update app.py

Browse files

Files changed (1) hide show

app.py +153 -19

app.py CHANGED Viewed

@@ -1,13 +1,72 @@
 def _normalize_label_name(name: str) -> str:
     if not isinstance(name, str):
         return ""
-    return name.strip().lower()
 def _resolve_indices_from_config():
     # Returns (phish_idx, legit_idx) using model-config names and sensible fallbacks
-    cfg = getattr(_model, "config", None)
-    id2label = getattr(cfg, "id2label", {}) if cfg else {}
-    # Normalize keys to int
     norm = {}
     for k, v in id2label.items():
         try:
@@ -16,28 +75,103 @@ def _resolve_indices_from_config():
             continue
         norm[ik] = _normalize_label_name(v)
-    # Try to detect via keywords
-    phish_keywords = {"phish", "phishing", "spam", "scam", "malicious"}
-    legit_keywords = {"legit", "ham", "safe", "benign", "not phish", "non-phish"}
     phish_idx = None
     legit_idx = None
     for i, name in norm.items():
-        if any(kw in name for kw in phish_keywords):
-            phish_idx = i if phish_idx is None else phish_idx
-        if any(kw in name for kw in legit_keywords):
-            legit_idx = i if legit_idx is None else legit_idx
-    # Fallback conventions for binary heads
-    if phish_idx is None or legit_idx is None:
-        if len(norm) == 2:
-            # Common convention: 0 = negative(legit), 1 = positive(phish)
             phish_idx = 1 if phish_idx is None else phish_idx
             legit_idx = 0 if legit_idx is None else legit_idx
     return phish_idx, legit_idx
-def _label_for_index(idx: int) -> str:
     cfg = getattr(_model, "config", None)
-    id2label = getattr(cfg, "id2label", {}) if cfg else {}
-    return id2label.get(idx, id2label.get(str(idx), str(idx)))

+import os
+os.environ.setdefault("HOME", "/data")
+os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
+os.environ.setdefault("HF_HOME", "/data/.cache")
+os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
+os.environ.setdefault("TORCH_HOME", "/data/.cache")
+from fastapi import FastAPI
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+MODEL_ID = os.environ.get("MODEL_ID", "Perth0603/phishing-email-mobilebert")
+# Ensure writable cache directory for HF/torch inside Spaces Docker
+CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
+os.makedirs(CACHE_DIR, exist_ok=True)
+# Decision threshold for PHISH probability
+PHISH_THRESHOLD = float(os.environ.get("PHISH_THRESHOLD", "0.5"))
+app = FastAPI(title="Phishing Text Classifier", version="1.0.0")
+class PredictPayload(BaseModel):
+    inputs: str
+# Lazy singletons for model/tokenizer
+_tokenizer = None
+_model = None
+def _load_model():
+    global _tokenizer, _model
+    if _tokenizer is None or _model is None:
+        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
+        _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
+        _model.eval()  # inference mode
+        # Warm-up
+        with torch.no_grad():
+            _ = _model(**_tokenizer(["warm up"], return_tensors="pt")).logits
 def _normalize_label_name(name: str) -> str:
     if not isinstance(name, str):
         return ""
+    return name.strip().lower().replace("_", " ")
+def _id2label_map():
+    cfg = getattr(_model, "config", None)
+    return getattr(cfg, "id2label", {}) if cfg else {}
+def _label_for_index(idx: int) -> str:
+    id2label = _id2label_map()
+    return id2label.get(idx, id2label.get(str(idx), str(idx)))
 def _resolve_indices_from_config():
     # Returns (phish_idx, legit_idx) using model-config names and sensible fallbacks
+    id2label = _id2label_map()
+    if not isinstance(id2label, dict):
+        id2label = {}
+    # Normalize to int keys when possible
     norm = {}
     for k, v in id2label.items():
         try:
             continue
         norm[ik] = _normalize_label_name(v)
+    phish_keywords = {"phish", "phishing", "spam", "scam", "malicious", "fraud"}
+    legit_keywords = {"legit", "ham", "safe", "benign", "not phish", "non phish", "clean"}
     phish_idx = None
     legit_idx = None
     for i, name in norm.items():
+        if any(kw in name for kw in phish_keywords) and phish_idx is None:
+            phish_idx = i
+        if any(kw in name for kw in legit_keywords) and legit_idx is None:
+            legit_idx = i
+    # Fallback for common binary convention when labels aren't informative
+    if (phish_idx is None or legit_idx is None) and len(norm) == 2:
+        # Many binary heads: 0 = negative(legit), 1 = positive(phish)
+        phish_idx = 1 if phish_idx is None else phish_idx
+        legit_idx = 0 if legit_idx is None else legit_idx
+    # If id2label was empty but model is binary, still fallback to (1,0)
+    if not norm:
+        cfg = getattr(_model, "config", None)
+        num_labels = int(getattr(cfg, "num_labels", 2)) if cfg else 2
+        if num_labels == 2:
             phish_idx = 1 if phish_idx is None else phish_idx
             legit_idx = 0 if legit_idx is None else legit_idx
     return phish_idx, legit_idx
+def _probs_dict(probs_list):
+    out = {}
+    for i, p in enumerate(probs_list):
+        out[_label_for_index(i)] = float(p)
+    return out
+@app.get("/")
+def root():
+    _load_model()
+    cfg = getattr(_model, "config", None)
+    return {
+        "status": "ok",
+        "model": MODEL_ID,
+        "num_labels": int(getattr(cfg, "num_labels", 2)) if cfg else 2,
+    }
+@app.get("/labels")
+def labels():
+    _load_model()
     cfg = getattr(_model, "config", None)
+    return {
+        "id2label": getattr(cfg, "id2label", {}) if cfg else {},
+        "label2id": getattr(cfg, "label2id", {}) if cfg else {},
+    }
+@app.post("/predict")
+def predict(payload: PredictPayload):
+    try:
+        _load_model()
+        with torch.no_grad():
+            inputs = _tokenizer(
+                [payload.inputs],
+                return_tensors="pt",
+                truncation=True,
+                max_length=512
+            )
+            outputs = _model(**inputs)
+            logits = outputs.logits  # [1, num_labels]
+            probs_t = torch.softmax(logits, dim=-1)[0]  # [num_labels]
+            probs_list = probs_t.tolist()
+            argmax_idx = int(torch.argmax(probs_t).item())
+        phish_idx, legit_idx = _resolve_indices_from_config()
+        # Compute PHISH probability robustly
+        if phish_idx is not None and 0 <= phish_idx < len(probs_list):
+            phish_score = float(probs_list[phish_idx])
+        else:
+            # If we cannot resolve PHISH index, use argmax class prob
+            phish_score = float(probs_list[argmax_idx])
+        label = "PHISH" if phish_score >= PHISH_THRESHOLD else "LEGIT"
+        resp = {
+            "label": label,                      # client-compatible
+            "score": phish_score,                # probability of PHISH class
+            "predicted_index": argmax_idx,       # argmax over probs
+            "logits": logits[0].tolist(),        # raw logits
+            "probs": _probs_dict(probs_list),    # per-label probs
+            "id2label": _id2label_map(),
+            "phish_idx": phish_idx,
+            "legit_idx": legit_idx,
+            "threshold": PHISH_THRESHOLD,
+        }
+        return resp
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": str(e)})