Update app.py
app.py CHANGED
@@ -1,4 +1,3 @@
-# app.py
 import os
 from typing import List, Optional, Dict
 
@@ -7,7 +6,6 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
-
 # Prefer MODEL_ID, fall back to HF_MODEL_ID, then default
 MODEL_ID = (
     os.environ.get("MODEL_ID")
@@ -15,7 +13,7 @@ MODEL_ID = (
     or "Perth0603/phishing-email-mobilebert"
 )
 
-app = FastAPI(title="Phishing Text Classifier", version="1.
+app = FastAPI(title="Phishing Text Classifier", version="1.2.0")
 
 
 class PredictPayload(BaseModel):
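Note: the hunk above elides the middle of the MODEL_ID expression; going by the comment, the unshown line is presumably `or os.environ.get("HF_MODEL_ID")`. A quick standalone check of that fallback chain (the repo id below is hypothetical):

import os

os.environ.pop("MODEL_ID", None)
os.environ["HF_MODEL_ID"] = "my-org/my-finetune"  # hypothetical override

model_id = (
    os.environ.get("MODEL_ID")
    or os.environ.get("HF_MODEL_ID")  # assumed elided line
    or "Perth0603/phishing-email-mobilebert"
)
print(model_id)  # -> my-org/my-finetune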
@@ -28,7 +26,7 @@ class BatchPredictPayload(BaseModel):
 
 class LabeledText(BaseModel):
     text: str
-    label: Optional[str] = None  # optional ground truth for quick eval
+    label: Optional[str] = None  # optional ground truth for quick eval (accepts "0"/"1" or text)
 
 
 class EvalPayload(BaseModel):
@@ -39,19 +37,25 @@ _tokenizer = None
 _model = None
 _device = "cpu"
 
+# Cached normalized mapping/meta
+_IDX_PHISH = None  # model output index that corresponds to PHISH
+_IDX_LEGIT = None  # model output index that corresponds to LEGIT
+_NORM_LABELS_BY_IDX = None  # normalized labels ordered by model indices
+
 
 def _normalize_label(txt: str) -> str:
-    #
-    t = (txt
-    if t in ("PHISHING", "PHISH", "SPAM"):
+    # Normalize common variants and accept "0"/"1" from CSVs
+    t = (str(txt) if txt is not None else "").strip().upper()
+    if t in ("1", "PHISHING", "PHISH", "SPAM"):
         return "PHISH"
-    if t in ("LEGIT", "LEGITIMATE", "SAFE", "HAM"):
+    if t in ("0", "LEGIT", "LEGITIMATE", "SAFE", "HAM"):
         return "LEGIT"
     return t
 
 
 def _load_model():
-    global _tokenizer, _model, _device
+    global _tokenizer, _model, _device, _IDX_PHISH, _IDX_LEGIT, _NORM_LABELS_BY_IDX
+
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
         _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
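The rewritten normalizer now also accepts CSV-style numeric labels. A standalone copy of the function as it appears in the diff, with a few spot checks:

def _normalize_label(txt):
    # Normalize common variants and accept "0"/"1" from CSVs
    t = (str(txt) if txt is not None else "").strip().upper()
    if t in ("1", "PHISHING", "PHISH", "SPAM"):
        return "PHISH"
    if t in ("0", "LEGIT", "LEGITIMATE", "SAFE", "HAM"):
        return "LEGIT"
    return t

assert _normalize_label("1") == "PHISH"
assert _normalize_label(0) == "LEGIT"            # non-strings survive the str() cast
assert _normalize_label(" spam ") == "PHISH"     # whitespace and case are stripped
assert _normalize_label("unknown") == "UNKNOWN"  # anything else passes through uppercased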
@@ -66,6 +70,26 @@ def _load_model():
             .to(_device)
         ).logits
 
+    # Derive normalized labels per index and cache PHISH/LEGIT indices
+    id2label = getattr(_model.config, "id2label", {}) or {}
+    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
+    _NORM_LABELS_BY_IDX = [_normalize_label(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
+
+    # Try to locate PHISH/LEGIT indices explicitly
+    try:
+        _IDX_PHISH = _NORM_LABELS_BY_IDX.index("PHISH")
+    except ValueError:
+        _IDX_PHISH = None
+    try:
+        _IDX_LEGIT = _NORM_LABELS_BY_IDX.index("LEGIT")
+    except ValueError:
+        _IDX_LEGIT = None
+
+    # If labels are unknown but binary, you can optionally set a default mapping.
+    # Commented out by default to avoid wrong assumptions:
+    # if _IDX_PHISH is None and _IDX_LEGIT is None and num_labels == 2:
+    #     _IDX_LEGIT, _IDX_PHISH = 0, 1  # assumes index 1 = PHISH, index 0 = LEGIT
+
 
 def _predict_texts(texts: List[str]) -> List[Dict]:
     _load_model()
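What the new caching block computes, sketched standalone with a hypothetical id2label (the real mapping comes from the checkpoint's config.json; the /debug/labels route exposes it):

from typing import Dict, List

id2label: Dict[int, str] = {0: "legitimate", 1: "phishing"}  # hypothetical mapping
num_labels = 2

def norm(t: str) -> str:
    # same normalization rules as _normalize_label above
    t = t.strip().upper()
    return {"PHISHING": "PHISH", "LEGITIMATE": "LEGIT"}.get(t, t)

labels_by_idx: List[str] = [norm(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
print(labels_by_idx)                 # ['LEGIT', 'PHISH']
print(labels_by_idx.index("PHISH"))  # 1 -> cached as _IDX_PHISH
print(labels_by_idx.index("LEGIT"))  # 0 -> cached as _IDX_LEGIT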
@@ -98,98 +122,5 @@ def _predict_texts(texts: List[str]) -> List[Dict]:
         raw_label = labels_by_idx[idx]
         norm_label = _normalize_label(raw_label)
 
-        # Also expose per-label probabilities
-        prob_map = {
-
-        outputs.append(
-            {
-                "label": norm_label,  # normalized (e.g., PHISH/LEGIT)
-                "raw_label": raw_label,  # from model.config.id2label
-                "score": float(p[idx].item()),  # max class probability
-                "probs": prob_map,  # dict of label -> probability
-                "predicted_index": idx,
-            }
-        )
-    return outputs
-
-
-@app.get("/")
-def root():
-    return {"status": "ok", "model": MODEL_ID}
-
-
-@app.get("/debug/labels")
-def debug_labels():
-    _load_model()
-    return {
-        "id2label": getattr(_model.config, "id2label", {}),
-        "label2id": getattr(_model.config, "label2id", {}),
-        "num_labels": int(getattr(_model.config, "num_labels", 0)),
-        "device": _device,
-    }
-
-
-@app.post("/predict")
-def predict(payload: PredictPayload):
-    try:
-        res = _predict_texts([payload.inputs])
-        return res[0]
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
-
-
-@app.post("/predict-batch")
-def predict_batch(payload: BatchPredictPayload):
-    try:
-        return _predict_texts(payload.inputs)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Batch prediction error: {e}")
-
-
-@app.post("/evaluate")
-def evaluate(payload: EvalPayload):
-    """
-    Quick on-the-spot test with provided labeled samples.
-    Request body:
-    {
-      "samples": [
-        {"text": "Your parcel is held...", "label": "PHISH"},
-        {"text": "Lunch at 12?", "label": "LEGIT"}
-      ]
-    }
-    Returns accuracy and per-class counts.
-    """
-    try:
-        texts = [s.text for s in payload.samples]
-        gts = [(_normalize_label(s.label) if s.label else None) for s in payload.samples]
-        preds = _predict_texts(texts)
-
-        total = len(preds)
-        correct = 0
-        per_class = {}
-
-        for gt, pr in zip(gts, preds):
-            pred_label = pr["label"]
-            if gt is not None:
-                correct += int(gt == pred_label)
-                per_class.setdefault(gt, {"tp": 0, "count": 0})
-                per_class[gt]["count"] += 1
-                if gt == pred_label:
-                    per_class[gt]["tp"] += 1
-
-        acc = (correct / total) if total and any(gt is not None for gt in gts) else None
-
-        return {
-            "accuracy": acc,  # None if no ground truths provided
-            "total": total,
-            "predictions": preds,
-            "per_class": per_class,
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
-
-
-if __name__ == "__main__":
-    # Run: uvicorn app:app --host 0.0.0.0 --port 8000 --reload
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+        # Also expose per-label probabilities (normalized names where possible)
+        prob_map = {_normalize_label(labels_by
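The render cuts off mid-hunk on the new prob_map line, so the committed continuation is unknown. A plausible completion, matching the names used in the surrounding loop (a sketch, not the committed code):

# Hypothetical continuation of the truncated '+' line above; labels_by_idx and p
# are the per-row label list and probability tensor from the surrounding loop:
prob_map = {
    _normalize_label(labels_by_idx[j]): float(p[j].item())
    for j in range(len(labels_by_idx))
}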
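Assuming the route handlers shown in the removed column carry over unchanged into the (truncated) new file, a minimal client sketch against a local instance:

# Requires: pip install requests; server started with
#   uvicorn app:app --host 0.0.0.0 --port 8000
import requests

BASE = "http://localhost:8000"

# Single prediction
r = requests.post(f"{BASE}/predict", json={"inputs": "Your parcel is held..."})
print(r.json())  # e.g. {"label": "PHISH", "raw_label": ..., "score": ..., "probs": {...}}

# Quick labeled evaluation
r = requests.post(f"{BASE}/evaluate", json={
    "samples": [
        {"text": "Your parcel is held...", "label": "PHISH"},
        {"text": "Lunch at 12?", "label": "LEGIT"},
    ]
})
print(r.json()["accuracy"])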