Update app.py
app.py CHANGED
@@ -1,111 +1,43 @@

- import os
-
- os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
- os.environ.setdefault("TORCH_HOME", "/data/.cache")
-
- from fastapi import FastAPI
- from fastapi.responses import JSONResponse
- from pydantic import BaseModel
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import torch
-
- MODEL_ID = os.environ.get("MODEL_ID", "Perth0603/phishing-email-mobilebert")
-
- # Ensure writable cache directory for HF/torch inside Spaces Docker
- CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
- os.makedirs(CACHE_DIR, exist_ok=True)
-
- app = FastAPI(title="Phishing Text Classifier", version="1.0.0")
-
- class PredictPayload(BaseModel):
-     inputs: str
-
- # Lazy singletons for model/tokenizer
- _tokenizer = None
- _model = None
-
- def _load_model():
-     global _tokenizer, _model
-     if _tokenizer is None or _model is None:
-         _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
-         _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
-         _model.eval()  # inference mode
-         # Warm-up
-         with torch.no_grad():
-             _ = _model(**_tokenizer(["warm up"], return_tensors="pt")).logits
-
- def _id2label():
-     cfg = getattr(_model, "config", None)
-     ...
-     }
-
- ...
-     cfg = getattr(_model, "config", None)
-     ...
-         "label2id": getattr(cfg, "label2id", {}) if cfg else {},
-     }
-
- @app.post("/predict")
- def predict(payload: PredictPayload):
-     try:
-         _load_model()
-         with torch.no_grad():
-             inputs = _tokenizer(
-                 [payload.inputs],
-                 return_tensors="pt",
-                 truncation=True,
-                 max_length=512
-             )
-             outputs = _model(**inputs)
-             logits = outputs.logits  # [1, num_labels]
-             logits_list = logits[0].tolist()
-             pred_idx = int(torch.argmax(logits, dim=-1).item())
-
-             # Keep client-compatible fields but also provide raw outputs
-             probs_t = torch.softmax(logits, dim=-1)[0]
-             score = float(probs_t[pred_idx])
-
-     except Exception as e:
-         return JSONResponse(status_code=500, content={"error": str(e)})
-
-     id2label = _id2label()
-     # Resolve label from model config (support int or str keys)
-     pred_label = id2label.get(pred_idx, id2label.get(str(pred_idx), str(pred_idx)))
-
-     # Build per-label probabilities for debugging/verification
-     probs = {}
-     for i, p in enumerate(probs_t.tolist()):
-         probs[id2label.get(i, id2label.get(str(i), str(i)))] = float(p)
-
-     # Backward-compatible keys: "label" and "score"
-     return {
-         "label": pred_label,          # expected by your client
-         "score": score,               # probability of predicted class (softmax)
-         "predicted_index": pred_idx,  # raw argmax index from logits
-         "logits": logits_list,        # raw model output
-         "probs": probs,               # per-label probabilities
-         "id2label": id2label,
-         "label2id": getattr(getattr(_model, "config", None), "label2id", {}),
-     }
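For context, the removed /predict route accepted a JSON body with a single "inputs" string and responded with backward-compatible "label" and "score" fields plus raw debugging output. A minimal client call is sketched below; the localhost URL and the sample text are illustrative assumptions, not part of this commit.

import requests

# Hypothetical local endpoint; replace with the deployed Space URL.
resp = requests.post(
    "http://localhost:7860/predict",
    json={"inputs": "Your account is locked. Click the link to verify your password."},
    timeout=30,
)
resp.raise_for_status()
data = resp.json()
print(data["label"], data["score"])  # predicted label and its softmax probability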
+ def _normalize_label_name(name: str) -> str:
+     if not isinstance(name, str):
+         return ""
+     return name.strip().lower()
+
+ def _resolve_indices_from_config():
+     # Returns (phish_idx, legit_idx) using model-config names and sensible fallbacks
+     cfg = getattr(_model, "config", None)
+     id2label = getattr(cfg, "id2label", {}) if cfg else {}
+     # Normalize keys to int
+     norm = {}
+     for k, v in id2label.items():
+         try:
+             ik = int(k)
+         except Exception:
+             continue
+         norm[ik] = _normalize_label_name(v)
+
+     # Try to detect via keywords
+     phish_keywords = {"phish", "phishing", "spam", "scam", "malicious"}
+     legit_keywords = {"legit", "ham", "safe", "benign", "not phish", "non-phish"}
+
+     phish_idx = None
+     legit_idx = None
+     for i, name in norm.items():
+         if any(kw in name for kw in phish_keywords):
+             phish_idx = i if phish_idx is None else phish_idx
+         if any(kw in name for kw in legit_keywords):
+             legit_idx = i if legit_idx is None else legit_idx
+
+     # Fallback conventions for binary heads
+     if phish_idx is None or legit_idx is None:
+         if len(norm) == 2:
+             # Common convention: 0 = negative (legit), 1 = positive (phish)
+             phish_idx = 1 if phish_idx is None else phish_idx
+             legit_idx = 0 if legit_idx is None else legit_idx
+
+     return phish_idx, legit_idx
+
+ def _label_for_index(idx: int) -> str:
+     cfg = getattr(_model, "config", None)
+     id2label = getattr(cfg, "id2label", {}) if cfg else {}
+     return id2label.get(idx, id2label.get(str(idx), str(idx)))
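The rest of the updated file is not included in this hunk, so the call sites for these helpers are not visible. As a sketch only: for a binary head with id2label {0: "legit", 1: "phishing"} the keyword match returns (phish_idx, legit_idx) = (1, 0), and for generic {0: "LABEL_0", 1: "LABEL_1"} names the binary fallback yields the same pair. A hypothetical caller (the function below is an assumption, not part of the diff) could combine the helpers with the model logits like this:

import torch  # already a module-level import in app.py

def _sketch_decide(logits):
    # Hypothetical helper, not part of this commit: map a [1, num_labels]
    # logits tensor to a label name, its probability, and a phishing flag.
    probs = torch.softmax(logits, dim=-1)[0]
    pred_idx = int(torch.argmax(probs).item())
    phish_idx, _legit_idx = _resolve_indices_from_config()
    return {
        "label": _label_for_index(pred_idx),
        "score": float(probs[pred_idx]),
        "is_phishing": phish_idx is not None and pred_idx == phish_idx,
    }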