Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Files Community

Perth0603 committed on Nov 3, 2025

Commit

9e472d3

verified ·

1 Parent(s): 7690e39

Update app.py

Browse files

Files changed (1) hide show

app.py +166 -18

app.py CHANGED Viewed

@@ -1,31 +1,116 @@
-from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import torch
-import os
-MODEL_ID = os.environ.get("MODEL_ID", "dima806/phishing-email-detection")
-app = FastAPI(title="Phishing Text Classifier", version="1.0.0")
 class PredictPayload(BaseModel):
     inputs: str
-# Lazy singletons for model/tokenizer
 _tokenizer = None
 _model = None
 def _load_model():
-    global _tokenizer, _model
     if _tokenizer is None or _model is None:
         _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
         # Warm-up
         with torch.no_grad():
-            _ = _model(**_tokenizer(["warm up"], return_tensors="pt")).logits
 @app.get("/")
@@ -33,15 +118,78 @@ def root():
     return {"status": "ok", "model": MODEL_ID}
 @app.post("/predict")
 def predict(payload: PredictPayload):
-    _load_model()
-    with torch.no_grad():
-        logits = _model(**_tokenizer([payload.inputs], return_tensors="pt")).logits
-        probs = torch.softmax(logits, dim=-1)[0]
-        score, idx = torch.max(probs, dim=0)
-    # Map common ids to labels (kept generic; your config also has these)
-    id2label = {0: "LEGIT", 1: "PHISH"}
-    label = id2label.get(int(idx), str(int(idx)))
-    return {"label": label, "score": float(score)}

+# app.py
+import os
+from typing import List, Optional, Dict
+import torch
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# Identifier handed to from_pretrained() when the model is loaded.
+# Resolution order: the MODEL_ID environment variable, then HF_MODEL_ID, then
+# the hard-coded default. Because the chain uses `or`, an empty-string
+# variable is treated the same as an unset one.
+MODEL_ID = (
+    os.environ.get("MODEL_ID")
+    or os.environ.get("HF_MODEL_ID")
+    or "Perth0603/phishing-email-mobilebert"
+)
+# Application object on which the route decorators below register handlers.
+app = FastAPI(title="Phishing Text Classifier", version="1.1.0")
+# Request body for POST /predict: a single text to classify.
 class PredictPayload(BaseModel):
     inputs: str
+# Request body for POST /predict-batch: a list of texts to classify together.
+class BatchPredictPayload(BaseModel):
+    inputs: List[str]
+# One evaluation sample: the text to classify plus an optional expected label.
+class LabeledText(BaseModel):
+    text: str
+    label: Optional[str] = None  # optional ground truth for quick eval
+# Request body for POST /evaluate: the samples to predict and score.
+class EvalPayload(BaseModel):
+    samples: List[LabeledText]
+# Module-level cache; stays None until _load_model() fills it on first use.
 _tokenizer = None
 _model = None
+# Device name used for the model and its inputs; _load_model() switches it to
+# "cuda" when torch.cuda.is_available() is true.
+_device = "cpu"
+def _normalize_label(txt: str) -> str:
+    """Collapse common label spellings onto ``PHISH`` / ``LEGIT``.
+
+    The input is stripped and upper-cased first; ``None`` or an empty string
+    yields ``""``. A value outside the two alias groups is returned in that
+    stripped, upper-cased form.
+    """
+    cleaned = txt.strip().upper() if txt else ""
+    aliases = {
+        "PHISHING": "PHISH",
+        "PHISH": "PHISH",
+        "SPAM": "PHISH",
+        "LEGIT": "LEGIT",
+        "LEGITIMATE": "LEGIT",
+        "SAFE": "LEGIT",
+        "HAM": "LEGIT",
+    }
+    # Unknown labels pass through unchanged (apart from the clean-up above)
+    return aliases.get(cleaned, cleaned)
 def _load_model():
+    """Load the tokenizer and model for MODEL_ID once and cache them.
+
+    Does nothing when both module-level caches are already populated.
+    Otherwise it picks "cuda" when available (else "cpu"), loads tokenizer
+    and model, moves the model to that device, switches it to eval mode and
+    runs one warm-up forward pass.
+
+    Everything is built in local variables and published to the module
+    globals only after the warm-up succeeds. If any step raises, the caches
+    stay as they were and the next call retries the full load instead of
+    reusing a half-initialised model.
+    """
+    global _tokenizer, _model, _device
+    if _tokenizer is not None and _model is not None:
+        return
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
+    model.to(device)
+    model.eval()  # important: disable dropout etc.
+    # Warm-up
+    with torch.no_grad():
+        warm_inputs = tokenizer(
+            ["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512
+        ).to(device)
+        _ = model(**warm_inputs).logits
+    # Publish only now, so model, tokenizer and device are always consistent
+    _device = device
+    _tokenizer = tokenizer
+    _model = model
+def _predict_texts(texts: List[str]) -> List[Dict]:
+    """Classify ``texts`` in a single batch and return one dict per text.
+
+    Ensures the model is loaded, then returns ``[]`` for an empty input.
+    Each result holds the normalized top label (``label``), the label name
+    taken from the model config (``raw_label``), the top-class probability
+    (``score``), a normalized-label -> probability map (``probs``) and the
+    winning class index (``predicted_index``).
+    """
+    _load_model()
+    if not texts:
+        return []
+    # Tokenize the whole batch (padding=True, truncation=True, max_length=512)
+    batch = _tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    batch = {name: tensor.to(_device) for name, tensor in batch.items()}
+    with torch.no_grad():
+        probs = torch.softmax(_model(**batch).logits, dim=-1)  # [batch, num_labels]
+    # Label names come from the model config; ids it lacks become LABEL_<i>
+    id2label = getattr(_model.config, "id2label", None) or {}
+    raw_names = [id2label.get(k, f"LABEL_{k}") for k in range(probs.shape[-1])]
+    norm_names = [_normalize_label(name) for name in raw_names]
+    results: List[Dict] = []
+    for row in probs:
+        best = int(torch.argmax(row).item())
+        # NOTE(review): if two raw labels normalize to the same name, the
+        # later index overwrites the earlier one in this map.
+        per_label = {name: float(row[k].item()) for k, name in enumerate(norm_names)}
+        results.append(
+            {
+                "label": norm_names[best],          # normalized (e.g., PHISH/LEGIT)
+                "raw_label": raw_names[best],       # from model.config.id2label
+                "score": float(row[best].item()),   # max class probability
+                "probs": per_label,                 # dict of label -> probability
+                "predicted_index": best,
+            }
+        )
+    return results
 @app.get("/")
     return {"status": "ok", "model": MODEL_ID}
+@app.get("/debug/labels")
+def debug_labels():
+    # Report the loaded model's label mappings, label count and active device.
+    # Loads the model first so the config is available.
+    _load_model()
+    cfg = _model.config
+    info = {}
+    info["id2label"] = getattr(cfg, "id2label", {})
+    info["label2id"] = getattr(cfg, "label2id", {})
+    info["num_labels"] = int(getattr(cfg, "num_labels", 0))
+    info["device"] = _device
+    return info
 @app.post("/predict")
 def predict(payload: PredictPayload):
+    # Classify the single text in the payload and return its result dict.
+    # Any exception raised along the way is reported as HTTP 500.
+    try:
+        return _predict_texts([payload.inputs])[0]
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Prediction error: {exc}")
+@app.post("/predict-batch")
+def predict_batch(payload: BatchPredictPayload):
+    # Classify every text in the payload and return the list of result dicts.
+    # Any exception raised along the way is reported as HTTP 500.
+    try:
+        results = _predict_texts(payload.inputs)
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Batch prediction error: {exc}")
+    return results
+@app.post("/evaluate")
+def evaluate(payload: EvalPayload):
+    """
+    Quick on-the-spot test with provided labeled samples.
+    Request body:
+    {
+      "samples": [
+        {"text": "Your parcel is held...", "label": "PHISH"},
+        {"text": "Lunch at 12?", "label": "LEGIT"}
+      ]
+    }
+    Returns accuracy and per-class counts.
+    Accuracy is computed over the samples that carry a label. Samples with a
+    missing or blank label are still predicted and counted in "total", but
+    they do not affect "accuracy" or "per_class".
+    """
+    try:
+        texts = [s.text for s in payload.samples]
+        # Normalize ground truths the same way predictions are normalized;
+        # a missing or whitespace-only label means "no ground truth".
+        gts = [((_normalize_label(s.label) or None) if s.label else None) for s in payload.samples]
+        preds = _predict_texts(texts)
+        labeled = 0
+        correct = 0
+        per_class = {}
+        for gt, pr in zip(gts, preds):
+            if gt is None:
+                continue
+            labeled += 1
+            stats = per_class.setdefault(gt, {"tp": 0, "count": 0})
+            stats["count"] += 1
+            if gt == pr["label"]:
+                correct += 1
+                stats["tp"] += 1
+        # Divide by the number of labeled samples, not all samples, so that
+        # unlabeled inputs do not drag the reported accuracy down.
+        acc = (correct / labeled) if labeled else None
+        return {
+            "accuracy": acc,            # None if no ground truths provided
+            "total": len(preds),
+            "predictions": preds,
+            "per_class": per_class,
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
+if __name__ == "__main__":
+    # Direct-execution entry point: serve `app` with uvicorn on 0.0.0.0:8000.
+    # Run:  uvicorn app:app --host 0.0.0.0 --port 8000 --reload
+    # uvicorn is imported here, so it is only needed when run as a script.
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)