Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Files Community

Perth0603 committed on Nov 3, 2025

Commit

2b92082

verified ·

1 Parent(s): 9df8bf6

Update app.py

Browse files

Files changed (1) hide show

app.py +190 -132

app.py CHANGED Viewed

@@ -1,164 +1,203 @@
 import os
-from typing import List, Optional, Dict, Tuple
 import torch
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-# ====== Model source ======
-# Original note: load from a local directory by default (your uploaded files are in /mnt/data). NOTE(review): the default value below is "Perth0603/phishing-email-mobilebert", not a /mnt/data path.
-MODEL_DIR = os.environ.get("MODEL_DIR", "Perth0603/phishing-email-mobilebert")
-# Optional: when the model does not state its labels clearly and is binary, force the order (usually not needed here)
-# e.g. FORCE_BINARY_MAPPING="LEGIT,PHISH" or "PHISH,LEGIT"
-FORCE_BINARY_MAPPING = os.environ.get("FORCE_BINARY_MAPPING", "").strip().upper()
-app = FastAPI(title="Phishing Text Classifier", version="1.3.1")
-# ====== Schemas ======
 class PredictPayload(BaseModel):
     inputs: str
 class BatchPredictPayload(BaseModel):
     inputs: List[str]
 class LabeledText(BaseModel):
     text: str
-    label: Optional[str] = None  # "0"/"1" or text
 class EvalPayload(BaseModel):
     samples: List[LabeledText]
-# ====== Globals ======
 _tokenizer = None
 _model = None
 _device = "cpu"
-_IDX_PHISH: Optional[int] = None
-_IDX_LEGIT: Optional[int] = None
-_NORM_LABELS_BY_IDX: Optional[List[str]] = None
-_USED_FORCED_MAPPING: bool = False
-# ====== Helpers ======
-def _normalize_label(txt: str) -> str:
     t = (str(txt) if txt is not None else "").strip().upper()
-    if t in ("1", "PHISHING", "PHISH", "SPAM"):
         return "PHISH"
-    if t in ("0", "LEGIT", "LEGITIMATE", "SAFE", "HAM"):
         return "LEGIT"
     return t
-def _try_force_binary_mapping(num_labels: int) -> Tuple[Optional[int], Optional[int], bool]:
-    if num_labels != 2 or not FORCE_BINARY_MAPPING:
-        return None, None, False
-    parts = [p.strip() for p in FORCE_BINARY_MAPPING.split(",") if p.strip()]
-    if len(parts) != 2 or any(p not in ("PHISH", "LEGIT") for p in parts):
-        return None, None, False
-    idx_legit = 0 if parts[0] == "LEGIT" else 1 if parts[1] == "LEGIT" else None
-    idx_phish = 0 if parts[0] == "PHISH" else 1 if parts[1] == "PHISH" else None
-    if idx_legit is None or idx_phish is None:
-        return None, None, False
-    return idx_phish, idx_legit, True
-def _load_model():
-    global _tokenizer, _model, _device, _IDX_PHISH, _IDX_LEGIT, _NORM_LABELS_BY_IDX, _USED_FORCED_MAPPING
-    if _tokenizer is not None and _model is not None:
-        return
-    _device = "cuda" if torch.cuda.is_available() else "cpu"
-    _tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
-    _model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
-    _model.to(_device)
-    _model.eval()
-    with torch.no_grad():
-        _ = _model(
-            **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512).to(_device)
-        ).logits
-    id2label = getattr(_model.config, "id2label", {}) or {}
-    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
-    _NORM_LABELS_BY_IDX = [_normalize_label(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
-    _IDX_PHISH = None
-    _IDX_LEGIT = None
-    try:
-        _IDX_PHISH = _NORM_LABELS_BY_IDX.index("PHISH")
-    except ValueError:
-        pass
-    try:
-        _IDX_LEGIT = _NORM_LABELS_BY_IDX.index("LEGIT")
-    except ValueError:
-        pass
-    _USED_FORCED_MAPPING = False
-    if (_IDX_PHISH is None or _IDX_LEGIT is None) and num_labels == 2:
-        fp, fl, used = _try_force_binary_mapping(num_labels)
-        if used:
-            _IDX_PHISH, _IDX_LEGIT = fp, fl
-            _USED_FORCED_MAPPING = True
-    # Original note: your model files already state 0=LEGIT, 1=PHISH, so this is normally detected automatically here.
-def _postprocess(texts: List[str]) -> List[Dict]:
     _load_model()
     if not texts:
         return []
-    enc = _tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
     enc = {k: v.to(_device) for k, v in enc.items()}
     with torch.no_grad():
         logits = _model(**enc).logits
-        probs = torch.softmax(logits, dim=-1)
-    id2label = getattr(_model.config, "id2label", {}) or {}
-    labels_by_idx_raw = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
-    labels_by_idx_norm = [_normalize_label(x) for x in labels_by_idx_raw]
-    outs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
         idx = int(torch.argmax(p).item())
-        raw_label = labels_by_idx_raw[idx]
-        norm_label = labels_by_idx_norm[idx]
-        prob_map = {labels_by_idx_norm[j]: float(p[j].item()) for j in range(len(labels_by_idx_norm))}
-        # -- Core: use explicit indices to produce the "dataset label" and the "UI label"
-        can_map = (_IDX_PHISH is not None and _IDX_LEGIT is not None)
-        if can_map:
-            phish_prob = float(p[_IDX_PHISH].item())
-            legit_prob = float(p[_IDX_LEGIT].item())
-            is_phish = phish_prob >= legit_prob
-            dataset_label = "1" if is_phish else "0"        # 1=PHISH, 0=LEGIT
-            display_label = "phishing" if is_phish else "legitimate"
-            probs_by_dataset = {"1": phish_prob, "0": legit_prob}
-        else:
-            # Fallback: use the normalized label
-            is_phish = (norm_label == "PHISH")
-            dataset_label = "1" if is_phish else "0"
-            display_label = "phishing" if is_phish else "legitimate"
-            probs_by_dataset = None
-        outs.append({
-            "is_phish": is_phish,                     # used by the frontend for display
-            "dataset_label": dataset_label,           # "1"=PHISH, "0"=LEGIT
-            "display_label": display_label,           # "phishing"/"legitimate"
-            "label": norm_label,                      # normalized (compatibility/debugging)
-            "raw_label": raw_label,                   # raw model label
-            "score": float(p[idx].item()),
-            "probs": prob_map,
-            "predicted_index": idx,
-            "predicted_dataset_label": 1 if is_phish else 0,
-            "probs_by_dataset_label": probs_by_dataset,
-        })
-    return outs
-# ====== Routes ======
 @app.get("/")
 def root():
-    return {"status": "ok", "model_dir": MODEL_DIR}
 @app.get("/debug/labels")
 def debug_labels():
@@ -171,64 +210,83 @@ def debug_labels():
         "norm_labels_by_idx": _NORM_LABELS_BY_IDX,
         "idx_phish": _IDX_PHISH,
         "idx_legit": _IDX_LEGIT,
     }
-@app.get("/debug/mapping")
-def debug_mapping():
-    _load_model()
-    num_labels = int(getattr(_model.config, "num_labels", 0))
-    return {
-        "forced_mapping_env": FORCE_BINARY_MAPPING or None,
-        "used_forced_mapping": _USED_FORCED_MAPPING,
-        "num_labels": num_labels,
-        "can_map_dataset": (_IDX_PHISH is not None and _IDX_LEGIT is not None),
-        "idx_phish": _IDX_PHISH,
-        "idx_legit": _IDX_LEGIT,
-    }
 @app.post("/predict")
 def predict(payload: PredictPayload):
     try:
-        return _postprocess([payload.inputs])[0]
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
 @app.post("/predict-batch")
 def predict_batch(payload: BatchPredictPayload):
     try:
-        return _postprocess(payload.inputs)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Batch prediction error: {e}")
 @app.post("/evaluate")
 def evaluate(payload: EvalPayload):
     try:
         texts = [s.text for s in payload.samples]
-        gts = [(_normalize_label(s.label) if s.label is not None else None) for s in payload.samples]
-        preds = _postprocess(texts)
         total = len(preds)
         correct = 0
         per_class: Dict[str, Dict[str, int]] = {}
         for gt, pr in zip(gts, preds):
-            pred_norm = "PHISH" if pr["is_phish"] else "LEGIT"
-            if gt is not None:
-                correct += int(gt == pred_norm)
                 per_class.setdefault(gt, {"tp": 0, "count": 0})
                 per_class[gt]["count"] += 1
-                if gt == pred_norm:
                     per_class[gt]["tp"] += 1
         has_gts = any(gt is not None for gt in gts)
         denom = sum(1 for gt in gts if gt is not None)
         acc = (correct / denom) if (has_gts and denom > 0) else None
-        return {"accuracy": acc, "total": total, "predictions": preds, "per_class": per_class}
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
 if __name__ == "__main__":
-    # Start: uvicorn app:app --host 0.0.0.0 --port 8000 --reload
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)

 import os
+from typing import List, Optional, Dict
 import torch
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# Prefer MODEL_ID, fall back to HF_MODEL_ID, then default
+MODEL_ID = (
+    os.environ.get("MODEL_ID")
+    or os.environ.get("HF_MODEL_ID")
+    or "Perth0603/phishing-email-mobilebert"
+)
+# =========================
+# Configurable switch for the dataset 0/1 mapping
+# =========================
+# If your CSV uses 1=PHISH, 0=LEGIT (the common convention), keep the default
+# If your CSV uses 0=PHISH, 1=LEGIT, set DATASET_PHISH_VALUE to "0"
+DATASET_PHISH_VALUE = (os.environ.get("DATASET_PHISH_VALUE") or "1").strip()
+if DATASET_PHISH_VALUE not in {"0", "1"}:
+    DATASET_PHISH_VALUE = "1"  # fault tolerance: fall back to the default on an invalid value
+DATASET_LEGIT_VALUE = "0" if DATASET_PHISH_VALUE == "1" else "1"
+# NOTE(review): the line removed by this diff declared version="1.3.1"; confirm the lower number below is intended
+app = FastAPI(title="Phishing Text Classifier", version="1.3.0")
 class PredictPayload(BaseModel):
     inputs: str
 class BatchPredictPayload(BaseModel):
     inputs: List[str]
 class LabeledText(BaseModel):
     text: str
+    label: Optional[str] = None  # optional ground truth for quick eval (accepts "0"/"1" or text)
 class EvalPayload(BaseModel):
     samples: List[LabeledText]
 _tokenizer = None
 _model = None
 _device = "cpu"
+# Cached normalized mapping/meta
+_IDX_PHISH = None           # model output index that corresponds to PHISH
+_IDX_LEGIT = None           # model output index that corresponds to LEGIT
+_NORM_LABELS_BY_IDX = None  # normalized labels ordered by model indices
+def _normalize_label_text_only(txt: str) -> str:
+    """
+    Normalize a textual label only; "0"/"1" are not interpreted here.
+    Used to map the model's id2label names onto PHISH/LEGIT.
+
+    The input is stringified (None becomes ""), stripped and upper-cased.
+    PHISHING/PHISH/SPAM return "PHISH"; LEGIT/LEGITIMATE/SAFE/HAM return
+    "LEGIT"; anything else is returned in its stripped, upper-cased form.
+    """
     t = (str(txt) if txt is not None else "").strip().upper()
+    if t in ("PHISHING", "PHISH", "SPAM"):
         return "PHISH"
+    if t in ("LEGIT", "LEGITIMATE", "SAFE", "HAM"):
         return "LEGIT"
     return t
+def _normalize_label_from_dataset(txt: str) -> Optional[str]:
+    """
+    Normalize a "0"/"1" or textual label coming from the CSV to PHISH/LEGIT.
+    "0"/"1" are interpreted according to DATASET_PHISH_VALUE: the value equal
+    to it maps to PHISH, the other one maps to LEGIT.
+    Returns None when the label cannot be recognized (e.g. None or empty).
+    """
+    if txt is None:
+        return None
+    t = str(txt).strip().upper()
+    if t in ("0", "1"):
+        if t == DATASET_PHISH_VALUE:
+            return "PHISH"
+        else:
+            return "LEGIT"
+    # Textual labels are supported as well
+    t2 = _normalize_label_text_only(t)
+    if t2 in ("PHISH", "LEGIT"):
+        return t2
+    return None
+def _load_model():
+    """
+    Load the tokenizer and model for MODEL_ID on first use and cache them in
+    module globals; the body is skipped once both are already set.
+
+    Also selects the device ("cuda" when available, else "cpu"), runs one
+    warm-up forward pass, and caches the normalized label list together with
+    the indices of PHISH and LEGIT (None when a label is not present).
+    """
+    global _tokenizer, _model, _device, _IDX_PHISH, _IDX_LEGIT, _NORM_LABELS_BY_IDX
+    if _tokenizer is None or _model is None:
+        _device = "cuda" if torch.cuda.is_available() else "cpu"
+        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+        _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
+        _model.to(_device)
+        _model.eval()  # important: disable dropout etc.
+        # Warm-up
+        with torch.no_grad():
+            _ = _model(
+                **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
+                .to(_device)
+            ).logits
+        # Read and normalize the model labels (in index order)
+        id2label = getattr(_model.config, "id2label", {}) or {}
+        num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
+        _NORM_LABELS_BY_IDX = [_normalize_label_text_only(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
+        # Locate the indices of PHISH/LEGIT in the logits
+        try:
+            _IDX_PHISH = _NORM_LABELS_BY_IDX.index("PHISH")
+        except ValueError:
+            _IDX_PHISH = None
+        try:
+            _IDX_LEGIT = _NORM_LABELS_BY_IDX.index("LEGIT")
+        except ValueError:
+            _IDX_LEGIT = None
+        # If the model provides no recognizable labels but has only 2 classes, stay with a safe, conservative default (no forced assumption)
+        # The meaning of 0/1 is not assumed automatically here, to avoid flipping it again; None is kept so downstream probabilities are still returned as usual.
+        # You can enable this if needed:
+        # if _IDX_PHISH is None and _IDX_LEGIT is None and num_labels == 2:
+        #     _IDX_LEGIT, _IDX_PHISH = 0, 1
+def _predict_texts(texts: List[str]) -> List[Dict]:
+    """
+    Classify a batch of texts and return one result dict per text.
+
+    Returns [] for an empty input. Each dict holds "label", "score",
+    "probs", "predicted_index", "predicted_dataset_label" and
+    "probs_by_dataset_label"; the last two are None unless both the PHISH
+    and LEGIT indices are known.
+    """
     _load_model()
     if not texts:
         return []
+    # Tokenize batch
+    enc = _tokenizer(
+        texts,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=512,
+    )
     enc = {k: v.to(_device) for k, v in enc.items()}
     with torch.no_grad():
         logits = _model(**enc).logits
+        probs = torch.softmax(logits, dim=-1)  # [batch, num_labels]
+    # Use the model’s own mapping
+    id2label = getattr(_model.config, "id2label", None) or {}
+    labels_by_idx = [_normalize_label_text_only(id2label.get(i, f"LABEL_{i}")) for i in range(probs.shape[-1])]
+    outputs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
         idx = int(torch.argmax(p).item())
+        norm_label = labels_by_idx[idx]  # normalized to PHISH/LEGIT, or passed through unchanged
+        # Build the per-class probability map (with normalized keys)
+        # NOTE(review): if two indices normalize to the same PHISH/LEGIT key, the later one overwrites the earlier entry
+        prob_map: Dict[str, float] = {}
+        for j, lbl in enumerate(labels_by_idx):
+            key = lbl if lbl in ("PHISH", "LEGIT") else f"CLASS_{j}"
+            prob_map[key] = float(p[j].item())
+        # -- Map the prediction back to your CSV 0/1 --
+        # Only assigned when we actually know which index is PHISH / LEGIT; otherwise None is returned to avoid misleading
+        # NOTE(review): any argmax index other than _IDX_PHISH is mapped to DATASET_LEGIT_VALUE below
+        ds_label: Optional[int] = None
+        probs_by_dataset: Optional[Dict[str, float]] = None
+        if _IDX_PHISH is not None and _IDX_LEGIT is not None:
+            ds_label = int(DATASET_PHISH_VALUE) if idx == _IDX_PHISH else int(DATASET_LEGIT_VALUE)
+            probs_by_dataset = {
+                DATASET_PHISH_VALUE: float(p[_IDX_PHISH].item()),   # dataset value that stands for PHISH ("0" or "1")
+                DATASET_LEGIT_VALUE: float(p[_IDX_LEGIT].item()),   # dataset value that stands for LEGIT
+            }
+        outputs.append(
+            {
+                # NOTE(review): both branches of the conditional on "label" yield norm_label
+                "label": norm_label if norm_label in ("PHISH", "LEGIT") else norm_label,  # textual result
+                "score": float(p[idx].item()),       # max class probability
+                "probs": prob_map,                   # per-class probabilities (keys are PHISH/LEGIT or CLASS_k)
+                "predicted_index": idx,              # model argmax index
+                "predicted_dataset_label": ds_label, # prediction expressed in your dataset's 0/1 (aligned to DATASET_*_VALUE)
+                "probs_by_dataset_label": probs_by_dataset,
+            }
+        )
+    return outputs
 @app.get("/")
 def root():
+    return {
+        "status": "ok",
+        "model": MODEL_ID,
+        "dataset_mapping": {
+            "PHISH_VALUE": DATASET_PHISH_VALUE,
+            "LEGIT_VALUE": DATASET_LEGIT_VALUE,
+        },
+    }
 @app.get("/debug/labels")
 def debug_labels():
         "norm_labels_by_idx": _NORM_LABELS_BY_IDX,
         "idx_phish": _IDX_PHISH,
         "idx_legit": _IDX_LEGIT,
+        "dataset_mapping": {
+            "PHISH_VALUE": DATASET_PHISH_VALUE,
+            "LEGIT_VALUE": DATASET_LEGIT_VALUE,
+        },
     }
 @app.post("/predict")
 def predict(payload: PredictPayload):
     try:
+        res = _predict_texts([payload.inputs])
+        return res[0]
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
 @app.post("/predict-batch")
 def predict_batch(payload: BatchPredictPayload):
     try:
+        return _predict_texts(payload.inputs)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Batch prediction error: {e}")
 @app.post("/evaluate")
 def evaluate(payload: EvalPayload):
+    """
+    Quick on-the-spot test with provided labeled samples.
+    Request body:
+    {
+      "samples": [
+        {"text": "Your parcel is held...", "label": "PHISH"},  # or "0"/"1" (per your dataset convention)
+        {"text": "Lunch at 12?", "label": "LEGIT"}             # or "0"/"1"
+      ]
+    }
+    Returns accuracy and per-class counts (labels normalized to PHISH/LEGIT).
+    """
     try:
         texts = [s.text for s in payload.samples]
+        # Use the dataset mapping to turn "0"/"1" into human-readable PHISH/LEGIT
+        gts = [_normalize_label_from_dataset(s.label) if s.label is not None else None for s in payload.samples]
+        preds = _predict_texts(texts)
         total = len(preds)
         correct = 0
         per_class: Dict[str, Dict[str, int]] = {}
         for gt, pr in zip(gts, preds):
+            pred_label = pr["label"] if pr["label"] in ("PHISH", "LEGIT") else None
+            if gt is not None and pred_label is not None:
+                correct += int(gt == pred_label)
                 per_class.setdefault(gt, {"tp": 0, "count": 0})
                 per_class[gt]["count"] += 1
+                if gt == pred_label:
                     per_class[gt]["tp"] += 1
         has_gts = any(gt is not None for gt in gts)
+        # NOTE(review): denom counts every sample with a ground truth, while correct and per_class are only updated when the predicted label is PHISH/LEGIT
         denom = sum(1 for gt in gts if gt is not None)
         acc = (correct / denom) if (has_gts and denom > 0) else None
+        return {
+            "accuracy": acc,            # None if no ground truths provided
+            "total": total,
+            "predictions": preds,
+            "per_class": per_class,
+            "dataset_mapping": {
+                "PHISH_VALUE": DATASET_PHISH_VALUE,
+                "LEGIT_VALUE": DATASET_LEGIT_VALUE,
+            },
+        }
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
 if __name__ == "__main__":
+    # Run:  uvicorn app:app --host 0.0.0.0 --port 8000 --reload
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)