Perth0603 committed on
Commit
c4d989f
·
verified ·
1 Parent(s): 5bdaec2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -96
app.py CHANGED
@@ -6,55 +6,42 @@ from fastapi import FastAPI, HTTPException
6
  from pydantic import BaseModel
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
 
9
- # Prefer MODEL_ID, fall back to HF_MODEL_ID, then default
10
- MODEL_ID = (
11
- os.environ.get("MODEL_ID")
12
- or os.environ.get("HF_MODEL_ID")
13
- or "Perth0603/phishing-email-mobilebert"
14
- )
15
-
16
- # Optional: force mapping when model labels are unclear (binary only).
17
- # Example values:
18
- # FORCE_BINARY_MAPPING="LEGIT,PHISH" (index0=LEGIT, index1=PHISH)
19
- # FORCE_BINARY_MAPPING="PHISH,LEGIT" (index0=PHISH, index1=LEGIT)
20
- FORCE_BINARY_MAPPING = os.environ.get("FORCE_BINARY_MAPPING", "").strip().upper()
21
 
22
- app = FastAPI(title="Phishing Text Classifier", version="1.3.0")
 
 
23
 
 
24
 
25
- # ---------- Schemas ----------
26
  class PredictPayload(BaseModel):
27
  inputs: str
28
 
29
-
30
  class BatchPredictPayload(BaseModel):
31
  inputs: List[str]
32
 
33
-
34
  class LabeledText(BaseModel):
35
  text: str
36
- label: Optional[str] = None # optional ground truth ("0"/"1" or text)
37
-
38
 
39
  class EvalPayload(BaseModel):
40
  samples: List[LabeledText]
41
 
42
-
43
- # ---------- Globals / cache ----------
44
  _tokenizer = None
45
  _model = None
46
  _device = "cpu"
47
 
48
- # Cached normalized mapping/meta
49
- _IDX_PHISH: Optional[int] = None # model output index that corresponds to PHISH
50
- _IDX_LEGIT: Optional[int] = None # model output index that corresponds to LEGIT
51
- _NORM_LABELS_BY_IDX: Optional[List[str]] = None # normalized labels ordered by model indices
52
- _USED_FORCED_MAPPING: bool = False # whether FORCE_BINARY_MAPPING took effect
53
 
54
-
55
- # ---------- Helpers ----------
56
  def _normalize_label(txt: str) -> str:
57
- """Normalize common variants and accept "0"/"1" from CSVs."""
58
  t = (str(txt) if txt is not None else "").strip().upper()
59
  if t in ("1", "PHISHING", "PHISH", "SPAM"):
60
  return "PHISH"
@@ -62,48 +49,38 @@ def _normalize_label(txt: str) -> str:
62
  return "LEGIT"
63
  return t
64
 
65
-
66
  def _try_force_binary_mapping(num_labels: int) -> Tuple[Optional[int], Optional[int], bool]:
67
- """Apply FORCE_BINARY_MAPPING env var if provided and binary."""
68
  if num_labels != 2 or not FORCE_BINARY_MAPPING:
69
  return None, None, False
70
  parts = [p.strip() for p in FORCE_BINARY_MAPPING.split(",") if p.strip()]
71
  if len(parts) != 2 or any(p not in ("PHISH", "LEGIT") for p in parts):
72
  return None, None, False
73
- # parts[0] is index 0, parts[1] is index 1
74
  idx_legit = 0 if parts[0] == "LEGIT" else 1 if parts[1] == "LEGIT" else None
75
  idx_phish = 0 if parts[0] == "PHISH" else 1 if parts[1] == "PHISH" else None
76
  if idx_legit is None or idx_phish is None:
77
  return None, None, False
78
  return idx_phish, idx_legit, True
79
 
80
-
81
  def _load_model():
82
- """Load model/tokenizer and derive stable label mapping."""
83
  global _tokenizer, _model, _device, _IDX_PHISH, _IDX_LEGIT, _NORM_LABELS_BY_IDX, _USED_FORCED_MAPPING
84
-
85
  if _tokenizer is not None and _model is not None:
86
  return
87
 
88
  _device = "cuda" if torch.cuda.is_available() else "cpu"
89
- _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
90
- _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
91
  _model.to(_device)
92
  _model.eval()
93
 
94
- # Warm-up
95
  with torch.no_grad():
96
  _ = _model(
97
- **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512)
98
- .to(_device)
99
  ).logits
100
 
101
- # Derive normalized labels per index
102
  id2label = getattr(_model.config, "id2label", {}) or {}
103
  num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
104
  _NORM_LABELS_BY_IDX = [_normalize_label(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
105
 
106
- # 1) Try explicit indices from normalized labels
107
  _IDX_PHISH = None
108
  _IDX_LEGIT = None
109
  try:
@@ -115,94 +92,73 @@ def _load_model():
115
  except ValueError:
116
  pass
117
 
118
- # 2) If still unknown and binary, allow forced mapping
119
  _USED_FORCED_MAPPING = False
120
  if (_IDX_PHISH is None or _IDX_LEGIT is None) and num_labels == 2:
121
  fp, fl, used = _try_force_binary_mapping(num_labels)
122
  if used:
123
  _IDX_PHISH, _IDX_LEGIT = fp, fl
124
  _USED_FORCED_MAPPING = True
 
125
 
126
- # 3) If still unknown, we keep them None and ONLY return model-native labels.
127
- # (不进行臆测,避免再次搞反)
128
-
129
-
130
- def _postprocess_batch_logits(texts: List[str]) -> List[Dict]:
131
- """Compute predictions + provide robust, unambiguous fields for UI."""
132
  _load_model()
133
  if not texts:
134
  return []
135
 
136
- enc = _tokenizer(
137
- texts,
138
- return_tensors="pt",
139
- padding=True,
140
- truncation=True,
141
- max_length=512,
142
- )
143
  enc = {k: v.to(_device) for k, v in enc.items()}
144
 
145
  with torch.no_grad():
146
  logits = _model(**enc).logits
147
- probs = torch.softmax(logits, dim=-1) # [batch, num_labels]
148
 
149
  id2label = getattr(_model.config, "id2label", {}) or {}
150
  labels_by_idx_raw = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
151
  labels_by_idx_norm = [_normalize_label(x) for x in labels_by_idx_raw]
152
 
153
- outputs: List[Dict] = []
154
  for i in range(probs.shape[0]):
155
  p = probs[i]
156
  idx = int(torch.argmax(p).item())
157
 
158
  raw_label = labels_by_idx_raw[idx]
159
  norm_label = labels_by_idx_norm[idx]
160
-
161
- # normalized probs dict
162
  prob_map = {labels_by_idx_norm[j]: float(p[j].item()) for j in range(len(labels_by_idx_norm))}
163
 
164
- # Default display (robust): if我们能确定 PHISH/LEGIT 下标,就用它;否则用norm_label回退
165
- can_map_dataset = (_IDX_PHISH is not None and _IDX_LEGIT is not None)
166
- if can_map_dataset:
167
  phish_prob = float(p[_IDX_PHISH].item())
168
  legit_prob = float(p[_IDX_LEGIT].item())
169
  is_phish = phish_prob >= legit_prob
170
- dataset_label = "1" if is_phish else "0" # 按你的数据集约定:1=PHISH, 0=LEGIT
171
  display_label = "phishing" if is_phish else "legitimate"
172
  probs_by_dataset = {"1": phish_prob, "0": legit_prob}
173
  else:
174
- # 回退策略:用当前pred的规范化标签
175
  is_phish = (norm_label == "PHISH")
176
  dataset_label = "1" if is_phish else "0"
177
  display_label = "phishing" if is_phish else "legitimate"
178
- probs_by_dataset = None # unknown mapping
179
-
180
- outputs.append(
181
- {
182
- # —— 建议前端优先使用这三个字段,不会搞反 ——
183
- "is_phish": is_phish,
184
- "dataset_label": dataset_label, # "1"=PHISH, "0"=LEGIT
185
- "display_label": display_label, # "phishing"/"legitimate"
186
-
187
- # —— 诊断/兼容字段 ——
188
- "label": norm_label, # 规范化后的(PHISH/LEGIT/未知)
189
- "raw_label": raw_label, # 来自 model.config.id2label
190
- "score": float(p[idx].item()), # argmax 概率
191
- "probs": prob_map, # 规范化名 -> 概率
192
- "predicted_index": idx, # 模型 argmax 下标
193
- "predicted_dataset_label": (1 if is_phish else 0), # int,等价于上面的字符串
194
- "probs_by_dataset_label": probs_by_dataset,
195
- }
196
- )
197
-
198
- return outputs
199
-
200
-
201
- # ---------- Routes ----------
202
  @app.get("/")
203
  def root():
204
- return {"status": "ok", "model": MODEL_ID}
205
-
206
 
207
  @app.get("/debug/labels")
208
  def debug_labels():
@@ -217,7 +173,6 @@ def debug_labels():
217
  "idx_legit": _IDX_LEGIT,
218
  }
219
 
220
-
221
  @app.get("/debug/mapping")
222
  def debug_mapping():
223
  _load_model()
@@ -231,17 +186,49 @@ def debug_mapping():
231
  "idx_legit": _IDX_LEGIT,
232
  }
233
 
234
-
235
  @app.post("/predict")
236
  def predict(payload: PredictPayload):
237
  try:
238
- res = _postprocess_batch_logits([payload.inputs])
239
- return res[0]
240
  except Exception as e:
241
  raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
242
 
243
-
244
  @app.post("/predict-batch")
245
  def predict_batch(payload: BatchPredictPayload):
246
  try:
247
- return _postprocess_batch_logits(payload.inp_
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from pydantic import BaseModel
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
 
9
# ====== Model source ======
# Load from a local directory by default (the uploaded model files live in /mnt/data).
MODEL_DIR = os.environ.get("MODEL_DIR", "/mnt/data")

# Optional: for a binary model whose config does not spell the labels out,
# force the index order explicitly, e.g.
#   FORCE_BINARY_MAPPING="LEGIT,PHISH"  or  FORCE_BINARY_MAPPING="PHISH,LEGIT"
FORCE_BINARY_MAPPING = os.environ.get("FORCE_BINARY_MAPPING", "").strip().upper()

app = FastAPI(title="Phishing Text Classifier", version="1.3.1")
18
 
19
+ # ====== Schemas ======
20
# ====== Schemas ======
class PredictPayload(BaseModel):
    """Request body for /predict: a single text to classify."""

    inputs: str
22
 
 
23
class BatchPredictPayload(BaseModel):
    """Request body for /predict-batch: a list of texts to classify."""

    inputs: List[str]
25
 
 
26
class LabeledText(BaseModel):
    """One evaluation sample: the text plus an optional ground-truth label."""

    text: str
    label: Optional[str] = None  # "0"/"1" or a textual label
 
29
 
30
class EvalPayload(BaseModel):
    """Request body for /evaluate: the labeled samples to score the model on."""

    samples: List[LabeledText]
32
 
33
+ # ====== Globals ======
 
34
  _tokenizer = None
35
  _model = None
36
  _device = "cpu"
37
 
38
+ _IDX_PHISH: Optional[int] = None
39
+ _IDX_LEGIT: Optional[int] = None
40
+ _NORM_LABELS_BY_IDX: Optional[List[str]] = None
41
+ _USED_FORCED_MAPPING: bool = False
 
42
 
43
+ # ====== Helpers ======
 
44
  def _normalize_label(txt: str) -> str:
 
45
  t = (str(txt) if txt is not None else "").strip().upper()
46
  if t in ("1", "PHISHING", "PHISH", "SPAM"):
47
  return "PHISH"
 
49
  return "LEGIT"
50
  return t
51
 
 
52
  def _try_force_binary_mapping(num_labels: int) -> Tuple[Optional[int], Optional[int], bool]:
 
53
  if num_labels != 2 or not FORCE_BINARY_MAPPING:
54
  return None, None, False
55
  parts = [p.strip() for p in FORCE_BINARY_MAPPING.split(",") if p.strip()]
56
  if len(parts) != 2 or any(p not in ("PHISH", "LEGIT") for p in parts):
57
  return None, None, False
 
58
  idx_legit = 0 if parts[0] == "LEGIT" else 1 if parts[1] == "LEGIT" else None
59
  idx_phish = 0 if parts[0] == "PHISH" else 1 if parts[1] == "PHISH" else None
60
  if idx_legit is None or idx_phish is None:
61
  return None, None, False
62
  return idx_phish, idx_legit, True
63
 
 
64
  def _load_model():
 
65
  global _tokenizer, _model, _device, _IDX_PHISH, _IDX_LEGIT, _NORM_LABELS_BY_IDX, _USED_FORCED_MAPPING
 
66
  if _tokenizer is not None and _model is not None:
67
  return
68
 
69
  _device = "cuda" if torch.cuda.is_available() else "cpu"
70
+ _tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
71
+ _model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
72
  _model.to(_device)
73
  _model.eval()
74
 
 
75
  with torch.no_grad():
76
  _ = _model(
77
+ **_tokenizer(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=512).to(_device)
 
78
  ).logits
79
 
 
80
  id2label = getattr(_model.config, "id2label", {}) or {}
81
  num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
82
  _NORM_LABELS_BY_IDX = [_normalize_label(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
83
 
 
84
  _IDX_PHISH = None
85
  _IDX_LEGIT = None
86
  try:
 
92
  except ValueError:
93
  pass
94
 
 
95
  _USED_FORCED_MAPPING = False
96
  if (_IDX_PHISH is None or _IDX_LEGIT is None) and num_labels == 2:
97
  fp, fl, used = _try_force_binary_mapping(num_labels)
98
  if used:
99
  _IDX_PHISH, _IDX_LEGIT = fp, fl
100
  _USED_FORCED_MAPPING = True
101
+ # 你的模型文件已经写明:0=LEGIT, 1=PHISH,通常这里会自动识别出来。
102
 
103
def _postprocess(texts: List[str]) -> List[Dict]:
    """Run the classifier on *texts* and build one result dict per input.

    Each dict carries both dataset-convention fields ("1"=PHISH, "0"=LEGIT)
    and diagnostic fields (raw model label, per-label probabilities), so the
    frontend never has to guess the label orientation.
    """
    _load_model()
    if not texts:
        return []

    enc = _tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    enc = {k: v.to(_device) for k, v in enc.items()}

    with torch.no_grad():
        logits = _model(**enc).logits
        probs = torch.softmax(logits, dim=-1)

    id2label = getattr(_model.config, "id2label", {}) or {}
    raw_names = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
    norm_names = [_normalize_label(name) for name in raw_names]

    results: List[Dict] = []
    for row in probs:
        top = int(torch.argmax(row).item())
        raw_label = raw_names[top]
        norm_label = norm_names[top]
        prob_map = {name: float(v.item()) for name, v in zip(norm_names, row)}

        # Core idea: when the PHISH/LEGIT output indices are known, derive the
        # dataset label and UI label from them; otherwise fall back to the
        # normalized argmax label.
        if _IDX_PHISH is not None and _IDX_LEGIT is not None:
            phish_prob = float(row[_IDX_PHISH].item())
            legit_prob = float(row[_IDX_LEGIT].item())
            is_phish = phish_prob >= legit_prob
            dataset_label = "1" if is_phish else "0"  # 1=PHISH, 0=LEGIT
            display_label = "phishing" if is_phish else "legitimate"
            probs_by_dataset = {"1": phish_prob, "0": legit_prob}
        else:
            # Fallback: trust the normalized argmax label.
            is_phish = norm_label == "PHISH"
            dataset_label = "1" if is_phish else "0"
            display_label = "phishing" if is_phish else "legitimate"
            probs_by_dataset = None

        results.append({
            "is_phish": is_phish,                 # preferred field for the frontend
            "dataset_label": dataset_label,       # "1"=PHISH, "0"=LEGIT
            "display_label": display_label,       # "phishing"/"legitimate"
            "label": norm_label,                  # normalized label (compat/debug)
            "raw_label": raw_label,               # from model.config.id2label
            "score": float(row[top].item()),      # argmax probability
            "probs": prob_map,                    # normalized name -> probability
            "predicted_index": top,               # model argmax index
            "predicted_dataset_label": 1 if is_phish else 0,  # int twin of dataset_label
            "probs_by_dataset_label": probs_by_dataset,
        })
    return results
157
+
158
+ # ====== Routes ======
 
 
 
 
 
 
 
159
# ====== Routes ======
@app.get("/")
def root():
    """Health check: reports the directory the model is loaded from."""
    return {"status": "ok", "model_dir": MODEL_DIR}
 
162
 
163
  @app.get("/debug/labels")
164
  def debug_labels():
 
173
  "idx_legit": _IDX_LEGIT,
174
  }
175
 
 
176
  @app.get("/debug/mapping")
177
  def debug_mapping():
178
  _load_model()
 
186
  "idx_legit": _IDX_LEGIT,
187
  }
188
 
 
189
@app.post("/predict")
def predict(payload: PredictPayload):
    """Classify a single text and return its prediction dict."""
    try:
        results = _postprocess([payload.inputs])
        return results[0]
    except Exception as exc:
        # Surface any inference failure as an HTTP 500 with the cause attached.
        raise HTTPException(status_code=500, detail=f"Prediction error: {exc}")
195
 
 
196
@app.post("/predict-batch")
def predict_batch(payload: BatchPredictPayload):
    """Classify a batch of texts; returns one prediction dict per input."""
    try:
        return _postprocess(payload.inputs)
    except Exception as exc:
        # Surface any inference failure as an HTTP 500 with the cause attached.
        raise HTTPException(status_code=500, detail=f"Batch prediction error: {exc}")
202
+
203
@app.post("/evaluate")
def evaluate(payload: EvalPayload):
    """Score the model on labeled samples.

    Accuracy is computed only over samples that carry a ground-truth label
    and is None when no sample has one. Per-class true-positive counts are
    keyed by the normalized ground-truth label.
    """
    try:
        texts = [sample.text for sample in payload.samples]
        truths = [
            _normalize_label(sample.label) if sample.label is not None else None
            for sample in payload.samples
        ]
        preds = _postprocess(texts)

        correct = 0
        labeled = 0
        per_class: Dict[str, Dict[str, int]] = {}
        for truth, pred in zip(truths, preds):
            predicted = "PHISH" if pred["is_phish"] else "LEGIT"
            if truth is None:
                continue  # unlabeled samples contribute predictions only
            labeled += 1
            stats = per_class.setdefault(truth, {"tp": 0, "count": 0})
            stats["count"] += 1
            if truth == predicted:
                correct += 1
                stats["tp"] += 1

        acc = correct / labeled if labeled else None
        return {"accuracy": acc, "total": len(preds), "predictions": preds, "per_class": per_class}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Evaluation error: {exc}")
230
+
231
if __name__ == "__main__":
    # Local dev entry point; equivalent to:
    #   uvicorn app:app --host 0.0.0.0 --port 8000
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)