Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Files Community

Perth0603 committed on Nov 3, 2025

Commit

5bdaec2

verified ·

1 Parent(s): 72eb3f5

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -133

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import List, Optional, Dict
 import torch
 from fastapi import FastAPI, HTTPException
@@ -13,9 +13,16 @@ MODEL_ID = (
     or "Perth0603/phishing-email-mobilebert"
 )
-app = FastAPI(title="Phishing Text Classifier", version="1.2.0")
 class PredictPayload(BaseModel):
     inputs: str
@@ -26,25 +33,28 @@ class BatchPredictPayload(BaseModel):
 class LabeledText(BaseModel):
     text: str
-    label: Optional[str] = None  # optional ground truth for quick eval (accepts "0"/"1" or text)
 class EvalPayload(BaseModel):
     samples: List[LabeledText]
 _tokenizer = None
 _model = None
 _device = "cpu"
 # Cached normalized mapping/meta
-_IDX_PHISH = None           # model output index that corresponds to PHISH
-_IDX_LEGIT = None           # model output index that corresponds to LEGIT
-_NORM_LABELS_BY_IDX = None  # normalized labels ordered by model indices
 def _normalize_label(txt: str) -> str:
-    # Normalize common variants and accept "0"/"1" from CSVs
     t = (str(txt) if txt is not None else "").strip().upper()
     if t in ("1", "PHISHING", "PHISH", "SPAM"):
         return "PHISH"
@@ -53,50 +63,76 @@ def _normalize_label(txt: str) -> str:
     return t
 def _load_model():
-    global _tokenizer, _model, _device, _IDX_PHISH, _IDX_LEGIT, _NORM_LABELS_BY_IDX
-    if _tokenizer is None or _model is None:
-        _device = "cuda" if torch.cuda.is_available() else "cpu"
-        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-        _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
-        _model.to(_device)
-        _model.eval()  # important: disable dropout etc.
-        # Warm-up
-        with torch.no_grad():
-            _ = _model(
-                **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
-                .to(_device)
-            ).logits
-        # Derive normalized labels per index and cache PHISH/LEGIT indices
-        id2label = getattr(_model.config, "id2label", {}) or {}
-        num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
-        _NORM_LABELS_BY_IDX = [_normalize_label(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
-        # Try to locate PHISH/LEGIT indices explicitly
-        try:
-            _IDX_PHISH = _NORM_LABELS_BY_IDX.index("PHISH")
-        except ValueError:
-            _IDX_PHISH = None
-        try:
-            _IDX_LEGIT = _NORM_LABELS_BY_IDX.index("LEGIT")
-        except ValueError:
-            _IDX_LEGIT = None
-        # If labels are unknown but binary, you can optionally set a default mapping.
-        # Commented out by default to avoid wrong assumptions:
-        # if _IDX_PHISH is None and _IDX_LEGIT is None and num_labels == 2:
-        #     _IDX_LEGIT, _IDX_PHISH = 0, 1  # assumes index 1 = PHISH, index 0 = LEGIT
-def _predict_texts(texts: List[str]) -> List[Dict]:
     _load_model()
     if not texts:
         return []
-    # Tokenize batch
     enc = _tokenizer(
         texts,
         return_tensors="pt",
@@ -110,45 +146,51 @@ def _predict_texts(texts: List[str]) -> List[Dict]:
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)  # [batch, num_labels]
-    # Use the model’s own mapping
-    id2label = getattr(_model.config, "id2label", None) or {}
-    # Build a stable label list by index
-    labels_by_idx = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
     outputs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
         idx = int(torch.argmax(p).item())
-        raw_label = labels_by_idx[idx]
-        norm_label = _normalize_label(raw_label)
-        # Also expose per-label probabilities (normalized names where possible)
-        prob_map = {_normalize_label(labels_by_idx[j]): float(p[j].item()) for j in range(len(labels_by_idx))}
-        # Map to your dataset convention: PHISH=1, LEGIT=0
-        ds_label = None
-        if _IDX_PHISH is not None and _IDX_LEGIT is not None:
-            if idx == _IDX_PHISH:
-                ds_label = 1
-            elif idx == _IDX_LEGIT:
-                ds_label = 0
-        # Per-dataset-label probabilities when both indices are known
-        probs_by_dataset = None
-        if _IDX_PHISH is not None and _IDX_LEGIT is not None:
-            probs_by_dataset = {
-                "1": float(p[_IDX_PHISH].item()),  # PHISH
-                "0": float(p[_IDX_LEGIT].item()),  # LEGIT
-            }
         outputs.append(
             {
-                "label": norm_label,                 # normalized (e.g., PHISH/LEGIT)
-                "raw_label": raw_label,              # from model.config.id2label
-                "score": float(p[idx].item()),       # max class probability
-                "probs": prob_map,                   # dict of normalized label -> probability
-                "predicted_index": idx,              # model argmax index
-                "predicted_dataset_label": ds_label, # 1 for PHISH, 0 for LEGIT (your convention)
                 "probs_by_dataset_label": probs_by_dataset,
             }
         )
@@ -156,6 +198,7 @@ def _predict_texts(texts: List[str]) -> List[Dict]:
     return outputs
 @app.get("/")
 def root():
     return {"status": "ok", "model": MODEL_ID}
@@ -175,10 +218,24 @@ def debug_labels():
     }
 @app.post("/predict")
 def predict(payload: PredictPayload):
     try:
-        res = _predict_texts([payload.inputs])
         return res[0]
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
@@ -187,58 +244,4 @@ def predict(payload: PredictPayload):
 @app.post("/predict-batch")
 def predict_batch(payload: BatchPredictPayload):
     try:
-        return _predict_texts(payload.inputs)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Batch prediction error: {e}")
-@app.post("/evaluate")
-def evaluate(payload: EvalPayload):
-    """
-    Quick on-the-spot test with provided labeled samples.
-    Request body:
-    {
-      "samples": [
-        {"text": "Your parcel is held...", "label": "PHISH"},  # or "1"
-        {"text": "Lunch at 12?", "label": "LEGIT"}             # or "0"
-      ]
-    }
-    Returns accuracy and per-class counts.
-    """
-    try:
-        texts = [s.text for s in payload.samples]
-        gts = [(_normalize_label(s.label) if s.label is not None else None) for s in payload.samples]
-        preds = _predict_texts(texts)
-        total = len(preds)
-        correct = 0
-        per_class: Dict[str, Dict[str, int]] = {}
-        for gt, pr in zip(gts, preds):
-            pred_label = pr["label"]
-            if gt is not None:
-                correct += int(gt == pred_label)
-                per_class.setdefault(gt, {"tp": 0, "count": 0})
-                per_class[gt]["count"] += 1
-                if gt == pred_label:
-                    per_class[gt]["tp"] += 1
-        has_gts = any(gt is not None for gt in gts)
-        acc = (correct / sum(1 for gt in gts if gt is not None)) if has_gts else None
-        return {
-            "accuracy": acc,            # None if no ground truths provided
-            "total": total,
-            "predictions": preds,
-            "per_class": per_class,
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
-if __name__ == "__main__":
-    # Run:  uvicorn app:app --host 0.0.0.0 --port 8000 --reload
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 import os
+from typing import List, Optional, Dict, Tuple
 import torch
 from fastapi import FastAPI, HTTPException
     or "Perth0603/phishing-email-mobilebert"
 )
+# Optional: force mapping when model labels are unclear (binary only).
+# Example values:
+#   FORCE_BINARY_MAPPING="LEGIT,PHISH"  (index0=LEGIT, index1=PHISH)
+#   FORCE_BINARY_MAPPING="PHISH,LEGIT"  (index0=PHISH, index1=LEGIT)
+FORCE_BINARY_MAPPING = os.environ.get("FORCE_BINARY_MAPPING", "").strip().upper()
+app = FastAPI(title="Phishing Text Classifier", version="1.3.0")
+# ---------- Schemas ----------
 class PredictPayload(BaseModel):
     inputs: str
 class LabeledText(BaseModel):
     text: str
+    label: Optional[str] = None  # optional ground truth ("0"/"1" or text)
 class EvalPayload(BaseModel):
     samples: List[LabeledText]
+# ---------- Globals / cache ----------
 _tokenizer = None
 _model = None
 _device = "cpu"
 # Cached normalized mapping/meta
+_IDX_PHISH: Optional[int] = None           # model output index that corresponds to PHISH
+_IDX_LEGIT: Optional[int] = None           # model output index that corresponds to LEGIT
+_NORM_LABELS_BY_IDX: Optional[List[str]] = None  # normalized labels ordered by model indices
+_USED_FORCED_MAPPING: bool = False         # whether FORCE_BINARY_MAPPING took effect
+# ---------- Helpers ----------
 def _normalize_label(txt: str) -> str:
+    """Normalize common variants and accept "0"/"1" from CSVs."""
     t = (str(txt) if txt is not None else "").strip().upper()
     if t in ("1", "PHISHING", "PHISH", "SPAM"):
         return "PHISH"
     return t
+def _try_force_binary_mapping(num_labels: int) -> Tuple[Optional[int], Optional[int], bool]:
+    """Apply FORCE_BINARY_MAPPING env var if provided and binary."""
+    if num_labels != 2 or not FORCE_BINARY_MAPPING:
+        return None, None, False
+    parts = [p.strip() for p in FORCE_BINARY_MAPPING.split(",") if p.strip()]
+    if len(parts) != 2 or any(p not in ("PHISH", "LEGIT") for p in parts):
+        return None, None, False
+    # parts[0] is index 0, parts[1] is index 1
+    idx_legit = 0 if parts[0] == "LEGIT" else 1 if parts[1] == "LEGIT" else None
+    idx_phish = 0 if parts[0] == "PHISH" else 1 if parts[1] == "PHISH" else None
+    if idx_legit is None or idx_phish is None:
+        return None, None, False
+    return idx_phish, idx_legit, True
 def _load_model():
+    """Load model/tokenizer and derive stable label mapping."""
+    global _tokenizer, _model, _device, _IDX_PHISH, _IDX_LEGIT, _NORM_LABELS_BY_IDX, _USED_FORCED_MAPPING
+    if _tokenizer is not None and _model is not None:
+        return
+    _device = "cuda" if torch.cuda.is_available() else "cpu"
+    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
+    _model.to(_device)
+    _model.eval()
+    # Warm-up
+    with torch.no_grad():
+        _ = _model(
+            **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
+            .to(_device)
+        ).logits
+    # Derive normalized labels per index
+    id2label = getattr(_model.config, "id2label", {}) or {}
+    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
+    _NORM_LABELS_BY_IDX = [_normalize_label(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
+    # 1) Try explicit indices from normalized labels
+    _IDX_PHISH = None
+    _IDX_LEGIT = None
+    try:
+        _IDX_PHISH = _NORM_LABELS_BY_IDX.index("PHISH")
+    except ValueError:
+        pass
+    try:
+        _IDX_LEGIT = _NORM_LABELS_BY_IDX.index("LEGIT")
+    except ValueError:
+        pass
+    # 2) If still unknown and binary, allow forced mapping
+    _USED_FORCED_MAPPING = False
+    if (_IDX_PHISH is None or _IDX_LEGIT is None) and num_labels == 2:
+        fp, fl, used = _try_force_binary_mapping(num_labels)
+        if used:
+            _IDX_PHISH, _IDX_LEGIT = fp, fl
+            _USED_FORCED_MAPPING = True
+    # 3) If still unknown, we keep them None and ONLY return model-native labels.
+    #    (No guessing here, to avoid getting the labels reversed again.)
+def _postprocess_batch_logits(texts: List[str]) -> List[Dict]:
+    """Compute predictions + provide robust, unambiguous fields for UI."""
     _load_model()
     if not texts:
         return []
     enc = _tokenizer(
         texts,
         return_tensors="pt",
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)  # [batch, num_labels]
+    id2label = getattr(_model.config, "id2label", {}) or {}
+    labels_by_idx_raw = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
+    labels_by_idx_norm = [_normalize_label(x) for x in labels_by_idx_raw]
     outputs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
         idx = int(torch.argmax(p).item())
+        raw_label = labels_by_idx_raw[idx]
+        norm_label = labels_by_idx_norm[idx]
+        # normalized probs dict
+        prob_map = {labels_by_idx_norm[j]: float(p[j].item()) for j in range(len(labels_by_idx_norm))}
+        # Default display (robust): if the PHISH/LEGIT indices are known, use them; otherwise fall back to norm_label
+        can_map_dataset = (_IDX_PHISH is not None and _IDX_LEGIT is not None)
+        if can_map_dataset:
+            phish_prob = float(p[_IDX_PHISH].item())
+            legit_prob = float(p[_IDX_LEGIT].item())
+            is_phish = phish_prob >= legit_prob
+            dataset_label = "1" if is_phish else "0"  # dataset convention: 1=PHISH, 0=LEGIT
+            display_label = "phishing" if is_phish else "legitimate"
+            probs_by_dataset = {"1": phish_prob, "0": legit_prob}
+        else:
+            # Fallback: use the normalized label of the current prediction
+            is_phish = (norm_label == "PHISH")
+            dataset_label = "1" if is_phish else "0"
+            display_label = "phishing" if is_phish else "legitimate"
+            probs_by_dataset = None  # unknown mapping
         outputs.append(
             {
+                # -- Front ends should prefer these three fields, to avoid inverted labels --
+                "is_phish": is_phish,
+                "dataset_label": dataset_label,   # "1"=PHISH, "0"=LEGIT
+                "display_label": display_label,   # "phishing"/"legitimate"
+                # -- Diagnostic / backward-compatibility fields --
+                "label": norm_label,              # normalized (PHISH/LEGIT/unknown)
+                "raw_label": raw_label,           # from model.config.id2label
+                "score": float(p[idx].item()),    # argmax probability
+                "probs": prob_map,                # normalized name -> probability
+                "predicted_index": idx,           # model argmax index
+                "predicted_dataset_label": (1 if is_phish else 0),  # int, equivalent to the string above
                 "probs_by_dataset_label": probs_by_dataset,
             }
         )
     return outputs
+# ---------- Routes ----------
 @app.get("/")
 def root():
     return {"status": "ok", "model": MODEL_ID}
     }
+@app.get("/debug/mapping")
+def debug_mapping():
+    _load_model()
+    num_labels = int(getattr(_model.config, "num_labels", 0))
+    return {
+        "forced_mapping_env": FORCE_BINARY_MAPPING or None,
+        "used_forced_mapping": _USED_FORCED_MAPPING,
+        "num_labels": num_labels,
+        "can_map_dataset": (_IDX_PHISH is not None and _IDX_LEGIT is not None),
+        "idx_phish": _IDX_PHISH,
+        "idx_legit": _IDX_LEGIT,
+    }
 @app.post("/predict")
 def predict(payload: PredictPayload):
     try:
+        res = _postprocess_batch_logits([payload.inputs])
         return res[0]
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
 @app.post("/predict-batch")
 def predict_batch(payload: BatchPredictPayload):
     try:
+        return _postprocess_batch_logits(payload.inp_