Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Files Community

Perth0603 committed on Nov 8, 2025

Commit

6823e29

verified ·

1 Parent(s): 3a83600

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -46

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 from typing import List, Optional, Dict
 import re
 import torch
 import nltk
@@ -30,7 +31,7 @@ app = FastAPI(title="Phishing Text Classifier with Preprocessing", version="1.0.
 # ============================================================================
-# TEXT PREPROCESSING CLASS (FOR ANALYSIS ONLY, NOT FOR MODEL INPUT)
 # ============================================================================
 class TextPreprocessor:
     """NLP preprocessing for analysis and feature extraction"""
@@ -78,7 +79,7 @@ class TextPreprocessor:
         }
     def preprocess(self, text: str) -> Dict:
-        """Preprocessing for analysis (NOT for model)"""
         tokens = self.tokenize(text)
         tokens_no_stop = self.remove_stopwords(tokens)
         stemmed = self.stem(tokens_no_stop)
@@ -125,29 +126,79 @@ _tokenizer = None
 _model = None
 _device = "cpu"
 _preprocessor = None
-_NORM_LABELS_BY_IDX = None
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
-def _normalize_label_text_only(txt: str) -> str:
-    """Normalize model label text"""
     t = (str(txt) if txt is not None else "").strip().upper()
-    if t in ("PHISHING", "PHISH", "SPAM"):
         return "PHISH"
-    if t in ("LEGIT", "LEGITIMATE", "SAFE", "HAM"):
         return "LEGIT"
     return t
 def _load_model():
     """Load model, tokenizer, and preprocessor"""
-    global _tokenizer, _model, _device, _NORM_LABELS_BY_IDX, _preprocessor
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Loading model on device: {_device}")
         _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
@@ -155,6 +206,9 @@ def _load_model():
         _model.eval()
         _preprocessor = TextPreprocessor()
         # Warm-up
         with torch.no_grad():
             _ = _model(
@@ -162,36 +216,28 @@ def _load_model():
                 .to(_device)
             ).logits
-        # Read and normalize model labels
-        id2label = getattr(_model.config, "id2label", {}) or {}
         num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
-        _NORM_LABELS_BY_IDX = [_normalize_label_text_only(id2label.get(i, f"LABEL_{i}")) for i in range(num_labels)]
-        print(f"Model loaded successfully")
-        print(f"ID2Label: {id2label}")
-        print(f"Normalized labels: {_NORM_LABELS_BY_IDX}")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
     """
-    Predict using ORIGINAL text (NO cleaning).
-    Preprocessing is for ANALYSIS only, not for model input.
     """
     _load_model()
     if not texts:
         return []
-    # IMPORTANT: Use original text for model, NOT cleaned text!
-    model_inputs = texts
-    # Get preprocessing info for analysis
     preprocessing_info = None
     if include_preprocessing:
         preprocessing_info = [_preprocessor.preprocess(text) for text in texts]
-    # Tokenize batch for model
     enc = _tokenizer(
-        model_inputs,
         return_tensors="pt",
         padding=True,
         truncation=True,
@@ -199,36 +245,44 @@ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List
     )
     enc = {k: v.to(_device) for k, v in enc.items()}
-    # Get predictions
     with torch.no_grad():
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)
-    id2label = getattr(_model.config, "id2label", None) or {}
-    labels_by_idx_raw = [id2label.get(i, f"LABEL_{i}") for i in range(probs.shape[-1])]
-    labels_by_idx_norm = [_normalize_label_text_only(lbl) for lbl in labels_by_idx_raw]
     outputs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
         idx = int(torch.argmax(p).item())
-        raw_label = labels_by_idx_raw[idx]
-        norm_label = labels_by_idx_norm[idx]
         prob_map: Dict[str, float] = {}
-        for j, lbl_norm in enumerate(labels_by_idx_norm):
-            key = lbl_norm if lbl_norm in ("PHISH", "LEGIT") else f"CLASS_{j}"
-            prob_map[key] = float(p[j].item())
         output = {
             "label": norm_label,
             "raw_label": raw_label,
-            "is_phish": True if norm_label == "PHISH" else False,
             "score": round(float(p[idx].item()), 4),
             "confidence": round(float(p[idx].item()), 4),
-            "probs": {k: round(v, 4) for k, v in prob_map.items()},
             "predicted_index": idx,
         }
         if include_preprocessing and preprocessing_info:
@@ -251,26 +305,33 @@ def root():
         "status": "ok",
         "model": MODEL_ID,
         "device": _device,
-        "note": "Model uses ORIGINAL text for predictions. Preprocessing is for analysis only.",
     }
 @app.get("/debug/labels")
 def debug_labels():
-    """View model configuration"""
     _load_model()
     return {
-        "id2label": getattr(_model.config, "id2label", {}),
-        "label2id": getattr(_model.config, "label2id", {}),
-        "num_labels": int(getattr(_model.config, "num_labels", 0)),
         "device": _device,
-        "norm_labels_by_idx": _NORM_LABELS_BY_IDX,
     }
 @app.post("/debug/preprocessing")
 def debug_preprocessing(payload: PredictPayload):
-    """Debug preprocessing output"""
     try:
         _load_model()
         preprocessing = _preprocessor.preprocess(payload.inputs)
@@ -279,7 +340,7 @@ def debug_preprocessing(payload: PredictPayload):
             "preprocessing": preprocessing
         }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Preprocessing error: {e}")
 @app.post("/predict")
@@ -289,7 +350,7 @@ def predict(payload: PredictPayload):
         res = _predict_texts([payload.inputs], include_preprocessing=payload.include_preprocessing)
         return res[0]
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Prediction error: {e}")
 @app.post("/predict-batch")
@@ -298,7 +359,7 @@ def predict_batch(payload: BatchPredictPayload):
     try:
         return _predict_texts(payload.inputs, include_preprocessing=payload.include_preprocessing)
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Batch prediction error: {e}")
 @app.post("/evaluate")
@@ -306,7 +367,7 @@ def evaluate(payload: EvalPayload):
     """Evaluate on labeled samples"""
     try:
         texts = [s.text for s in payload.samples]
-        gts = [(_normalize_label_text_only(s.label) if s.label is not None else None) for s in payload.samples]
         preds = _predict_texts(texts, include_preprocessing=False)
         total = len(preds)
@@ -333,7 +394,7 @@ def evaluate(payload: EvalPayload):
             "per_class": per_class,
         }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Evaluation error: {e}")
 if __name__ == "__main__":

 import os
 from typing import List, Optional, Dict
 import re
+import json
 import torch
 import nltk
 # ============================================================================
+# TEXT PREPROCESSING CLASS
 # ============================================================================
 class TextPreprocessor:
     """NLP preprocessing for analysis and feature extraction"""
         }
     def preprocess(self, text: str) -> Dict:
+        """Preprocessing for analysis"""
         tokens = self.tokenize(text)
         tokens_no_stop = self.remove_stopwords(tokens)
         stemmed = self.stem(tokens_no_stop)
 _model = None
 _device = "cpu"
 _preprocessor = None
+_LABEL_MAPPING = None
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
def _get_label_mapping():
    """
    Build the index -> label mapping used when decoding predictions.

    The mapping starts from ``_model.config.id2label``. When it has fewer
    entries than ``config.num_labels`` it is treated as incomplete: a
    ``models/labels.json`` file located next to this module is tried first,
    and if fewer than two labels are known after that, the hard-coded
    fallback ``0=LEGIT, 1=PHISH`` is used.

    Returns:
        ``None`` when the model has not been loaded yet; otherwise a dict
        whose keys are always strings ("0", "1", ...). Prediction code looks
        labels up with ``str(index)``, whereas Hugging Face configs key
        ``id2label`` by int, so keys are normalized here to keep those
        lookups from silently falling back to ``LABEL_<i>``.
    """
    if _model is None:
        return None

    id2label = getattr(_model.config, "id2label", {}) or {}
    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
    print(f"DEBUG: num_labels = {num_labels}")
    print(f"DEBUG: id2label from config = {id2label}")

    # Fewer entries than classes means the config mapping cannot be trusted.
    if len(id2label) < num_labels:
        print("WARNING: Incomplete label mapping detected!")
        print(f"Expected {num_labels} labels, got {len(id2label)}")

        # Best effort: a labels.json shipped alongside the app may hold the
        # full mapping. Any failure here is reported and then ignored so the
        # fallback below can still apply.
        try:
            module_dir = os.path.dirname(os.path.abspath(__file__))
            labels_path = os.path.join(module_dir, "models", "labels.json")
            if os.path.exists(labels_path):
                with open(labels_path, "r", encoding="utf-8") as f:
                    labels_data = json.load(f)
                loaded = labels_data.get("id2label") or {}
                if isinstance(loaded, dict):
                    id2label = loaded
                    print(f"Loaded labels from labels.json: {id2label}")
        except Exception as exc:  # best-effort lookup; never fatal
            print(f"WARNING: Could not load labels.json: {exc}")

        # Final fallback mapping
        if len(id2label) < 2:
            print("Using fallback label mapping: 0=LEGIT, 1=PHISH")
            id2label = {
                "0": "LEGIT",
                "1": "PHISH",
            }

    # Normalize keys to str so lookups by str(index) always succeed,
    # regardless of whether the source used int or str keys.
    return {str(key): value for key, value in id2label.items()}
def _normalize_label(txt: str) -> str:
    """
    Map a raw label onto the canonical names used by the API.

    The input is stringified (``None`` becomes ""), stripped and upper-cased.
    Known phishing aliases collapse to "PHISH", known legitimate aliases
    collapse to "LEGIT"; anything else is returned in its cleaned,
    upper-cased form.
    """
    cleaned = "" if txt is None else str(txt)
    cleaned = cleaned.strip().upper()

    # Alias table: every recognized spelling points at its canonical label.
    canonical = {
        "PHISHING": "PHISH",
        "PHISH": "PHISH",
        "SPAM": "PHISH",
        "1": "PHISH",
        "LEGIT": "LEGIT",
        "LEGITIMATE": "LEGIT",
        "SAFE": "LEGIT",
        "HAM": "LEGIT",
        "0": "LEGIT",
    }
    return canonical.get(cleaned, cleaned)
 def _load_model():
     """Load model, tokenizer, and preprocessor"""
+    global _tokenizer, _model, _device, _preprocessor, _LABEL_MAPPING
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"\n{'='*60}")
         print(f"Loading model on device: {_device}")
+        print(f"Model ID: {MODEL_ID}")
+        print(f"{'='*60}\n")
         _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
         _model.eval()
         _preprocessor = TextPreprocessor()
+        # Get label mapping
+        _LABEL_MAPPING = _get_label_mapping()
         # Warm-up
         with torch.no_grad():
             _ = _model(
                 .to(_device)
             ).logits
         num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
+        print(f"Number of labels: {num_labels}")
+        print(f"Label mapping: {_LABEL_MAPPING}")
+        print(f"{'='*60}\n")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
     """
+    Predict with corrected label mapping
     """
     _load_model()
     if not texts:
         return []
+    # Get preprocessing info
     preprocessing_info = None
     if include_preprocessing:
         preprocessing_info = [_preprocessor.preprocess(text) for text in texts]
+    # Tokenize
     enc = _tokenizer(
+        texts,
         return_tensors="pt",
         padding=True,
         truncation=True,
     )
     enc = {k: v.to(_device) for k, v in enc.items()}
+    # Predict
     with torch.no_grad():
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)
+    # Build label list from mapping
+    num_labels = probs.shape[-1]
+    labels_by_idx = []
+    for i in range(num_labels):
+        label = _LABEL_MAPPING.get(str(i), f"LABEL_{i}")
+        labels_by_idx.append(label)
+    print(f"DEBUG: Using labels: {labels_by_idx}")
     outputs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
         idx = int(torch.argmax(p).item())
+        raw_label = labels_by_idx[idx]
+        norm_label = _normalize_label(raw_label)
+        # Build probability map
         prob_map: Dict[str, float] = {}
+        for j in range(len(labels_by_idx)):
+            label_norm = _normalize_label(labels_by_idx[j])
+            prob_map[label_norm] = float(p[j].item())
         output = {
+            "text": texts[i][:100] + "..." if len(texts[i]) > 100 else texts[i],
             "label": norm_label,
             "raw_label": raw_label,
+            "is_phish": norm_label == "PHISH",
             "score": round(float(p[idx].item()), 4),
             "confidence": round(float(p[idx].item()), 4),
             "predicted_index": idx,
+            "probs": {k: round(v, 4) for k, v in prob_map.items()},
+            "all_probs_raw": [round(float(p[j].item()), 4) for j in range(len(labels_by_idx))],
         }
         if include_preprocessing and preprocessing_info:
         "status": "ok",
         "model": MODEL_ID,
         "device": _device,
+        "label_mapping": _LABEL_MAPPING,
     }
 @app.get("/debug/labels")
 def debug_labels():
     """
     Report the label configuration of the loaded model.

     Ensures the model is loaded, then returns the raw ``id2label`` /
     ``label2id`` / ``num_labels`` values read from the model config next to
     the mapping the service actually applies, plus the active device.
     """
     _load_model()
     cfg = _model.config
     return {
         "status": "ok",
         "config_id2label": getattr(cfg, "id2label", {}) or {},
         "config_label2id": getattr(cfg, "label2id", {}) or {},
         "config_num_labels": int(getattr(cfg, "num_labels", 0) or 0),
         "applied_label_mapping": _LABEL_MAPPING,
         "device": _device,
         "note": "If config_id2label is incomplete, applied_label_mapping is used"
     }
 @app.post("/debug/preprocessing")
 def debug_preprocessing(payload: PredictPayload):
+    """Debug preprocessing"""
     try:
         _load_model()
         preprocessing = _preprocessor.preprocess(payload.inputs)
             "preprocessing": preprocessing
         }
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error: {e}")
 @app.post("/predict")
         res = _predict_texts([payload.inputs], include_preprocessing=payload.include_preprocessing)
         return res[0]
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error: {e}")
 @app.post("/predict-batch")
     try:
         return _predict_texts(payload.inputs, include_preprocessing=payload.include_preprocessing)
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error: {e}")
 @app.post("/evaluate")
     """Evaluate on labeled samples"""
     try:
         texts = [s.text for s in payload.samples]
+        gts = [(_normalize_label(s.label) if s.label is not None else None) for s in payload.samples]
         preds = _predict_texts(texts, include_preprocessing=False)
         total = len(preds)
             "per_class": per_class,
         }
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error: {e}")
 if __name__ == "__main__":