Update app.py
app.py CHANGED
@@ -126,11 +126,47 @@ _tokenizer = None
 _model = None
 _device = "cpu"
 _preprocessor = None
+_LABEL_MAPPING = None


 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
+def _get_label_mapping():
+    """Get complete label mapping from model config"""
+    global _model
+
+    if _model is None:
+        return None
+
+    id2label = getattr(_model.config, "id2label", {}) or {}
+    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
+
+    print(f"[DEBUG] Raw id2label from config: {id2label}")
+    print(f"[DEBUG] num_labels: {num_labels}")
+
+    # Build complete mapping by index
+    complete_mapping = {}
+    for i in range(num_labels):
+        if str(i) in id2label:
+            complete_mapping[i] = id2label[str(i)]
+        elif i in id2label:
+            complete_mapping[i] = id2label[i]
+        else:
+            complete_mapping[i] = f"LABEL_{i}"
+
+    # If incomplete, use fallback
+    if len(complete_mapping) < num_labels:
+        print(f"[WARNING] Incomplete mapping! Using fallback.")
+        complete_mapping = {
+            0: "LEGIT",
+            1: "PHISH"
+        }
+
+    print(f"[DEBUG] Complete mapping applied: {complete_mapping}")
+    return complete_mapping
+
+
 def _normalize_label(txt: str) -> str:
     """Normalize label text"""
     t = (str(txt) if txt is not None else "").strip().upper()
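The `str(i)` / `i` double lookup above guards against a real `transformers` quirk: once a config has round-tripped through JSON (e.g. a downloaded config.json), the integer keys of `id2label` come back as strings. A standalone sketch of that normalization, using a made-up config dict in place of a loaded model:

# Sketch of the index-normalization logic in _get_label_mapping().
# raw_id2label is a hypothetical example; the real dict comes from model.config.
raw_id2label = {"0": "LEGIT", "1": "PHISH"}  # string keys, as after a JSON round-trip
num_labels = 2

complete_mapping = {}
for i in range(num_labels):
    if str(i) in raw_id2label:      # JSON-deserialized config: string keys
        complete_mapping[i] = raw_id2label[str(i)]
    elif i in raw_id2label:         # in-memory config: int keys
        complete_mapping[i] = raw_id2label[i]
    else:
        complete_mapping[i] = f"LABEL_{i}"  # transformers-style placeholder

print(complete_mapping)  # {0: 'LEGIT', 1: 'PHISH'}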
@@ -143,7 +179,7 @@ def _normalize_label(txt: str) -> str:

 def _load_model():
     """Load model, tokenizer, and preprocessor"""
-    global _tokenizer, _model, _device, _preprocessor
+    global _tokenizer, _model, _device, _preprocessor, _LABEL_MAPPING

     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
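This one-line hunk matters more than it looks: without `_LABEL_MAPPING` in the `global` statement, the assignment later in `_load_model()` would bind a function-local name and the module-level `_LABEL_MAPPING` would stay `None` for every request. A minimal illustration of the pitfall (names here are hypothetical):

_x = None

def set_without_global():
    _x = 1  # binds a *local* _x; the module-level one is untouched

def set_with_global():
    global _x
    _x = 1  # rebinds the module-level _x

set_without_global()
print(_x)  # None
set_with_global()
print(_x)  # 1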
@@ -158,6 +194,9 @@ def _load_model():
         _model.eval()
         _preprocessor = TextPreprocessor()

+        # Get label mapping
+        _LABEL_MAPPING = _get_label_mapping()
+
         # Warm-up
         with torch.no_grad():
             _ = _model(
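For context on the surrounding pattern: `_load_model()` is guarded by `if _tokenizer is None or _model is None:`, so it acts as a one-time lazy initializer, and the warm-up forward pass that follows pays first-inference costs (allocation, kernel selection) at load time rather than on the first user request. A minimal sketch of the same lazy-singleton idiom, with a hypothetical stand-in for the expensive load:

_resource = None

def _load_expensive_resource():
    # stand-in for the real tokenizer/model/preprocessor setup
    return object()

def _get_resource():
    global _resource
    if _resource is None:   # only the first caller pays the loading cost
        _resource = _load_expensive_resource()
    return _resource

One caveat: under a threaded or multi-worker server, two early requests can race through the `is None` check; wrapping the initializer in a `threading.Lock` is the usual hardening if that matters.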
@@ -165,18 +204,13 @@ def _load_model():
                 .to(_device)
             ).logits

-        num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
-        id2label = getattr(_model.config, "id2label", {}) or {}
-
-        print(f"Number of labels: {num_labels}")
-        print(f"Label mapping: {id2label}")
         print(f"{'='*60}\n")


 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
     """
-    Predict with
-
+    Predict with correct label index mapping
+    CRITICAL: probs[i][j] where j is the CLASS INDEX, not probability value
     """
     _load_model()
     if not texts:
@@ -202,49 +236,50 @@ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)

-
-
-    # Index 1 = PHISH (probs[i][1])
-    labels_by_idx = ["LEGIT", "PHISH"]
+    num_labels = probs.shape[-1]
+    print(f"\n[DEBUG] num_labels from probs shape: {num_labels}")

     outputs: List[Dict] = []
-    for
-        p = probs[
+    for text_idx in range(probs.shape[0]):
+        p = probs[text_idx]  # Get probabilities for this text: shape [num_labels]

-        #
-
-
+        # Create probability breakdown for ALL classes
+        prob_breakdown = {}
+        all_probs_list = []

-
-
-
-
-
-
-
-
-
+        for class_idx in range(num_labels):
+            class_prob = float(p[class_idx].item())
+            class_label = _LABEL_MAPPING.get(class_idx, f"CLASS_{class_idx}")
+            prob_breakdown[class_label] = round(class_prob, 4)
+            all_probs_list.append(class_prob)
+            print(f"[DEBUG] Class {class_idx} ({class_label}): {round(class_prob, 4)}")
+
+        # Get argmax index
+        predicted_idx = int(torch.argmax(p).item())
+        predicted_label_raw = _LABEL_MAPPING.get(predicted_idx, f"CLASS_{predicted_idx}")
+        predicted_label_norm = _normalize_label(predicted_label_raw)
+        predicted_prob = float(p[predicted_idx].item())
+
+        print(f"[DEBUG] ARGMAX: index={predicted_idx}, label={predicted_label_raw}, prob={round(predicted_prob, 4)}")
+        print(f"[DEBUG] Normalized label: {predicted_label_norm}")

         output = {
-            "text": texts[
-            "
-            "
-            "
-            "
-            "
-
-
-
-            "raw_probs": {
-                "LEGIT (index 0)": round(prob_legit, 4),
-                "PHISH (index 1)": round(prob_phish, 4),
-            }
+            "text": texts[text_idx][:100] + "..." if len(texts[text_idx]) > 100 else texts[text_idx],
+            "predicted_class_index": predicted_idx,
+            "label": predicted_label_norm,
+            "raw_label": predicted_label_raw,
+            "is_phish": predicted_label_norm == "PHISH",
+            "score": round(predicted_prob, 4),
+            "confidence": round(predicted_prob * 100, 2),
+            "probs_by_class": prob_breakdown,
+            "all_probs_raw": [round(p_val, 4) for p_val in all_probs_list],
         }

         if include_preprocessing and preprocessing_info:
-            output["preprocessing"] = preprocessing_info[
+            output["preprocessing"] = preprocessing_info[text_idx]

         outputs.append(output)
+        print(f"\n")

     return outputs

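This loop is the heart of the fix flagged in the docstring: the last dimension of `probs` is indexed by class, so every lookup goes through a class index and then through `_LABEL_MAPPING`, never through a probability value. A self-contained sketch of that indexing on a dummy tensor (the values and mapping are made up for illustration):

import torch

# Dummy softmax output: 2 texts x 2 classes.
probs = torch.tensor([[0.9, 0.1],
                      [0.2, 0.8]])
label_mapping = {0: "LEGIT", 1: "PHISH"}  # the binary fallback mapping from above

for text_idx in range(probs.shape[0]):
    p = probs[text_idx]                          # shape [num_labels]
    predicted_idx = int(torch.argmax(p).item())  # a class index, never a probability
    print(text_idx, label_mapping[predicted_idx], round(float(p[predicted_idx]), 4))
# 0 LEGIT 0.9
# 1 PHISH 0.8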
@@ -261,17 +296,13 @@ def root():
         "status": "ok",
         "model": MODEL_ID,
         "device": _device,
-        "label_mapping": {
-            "0": "LEGIT",
-            "1": "PHISH"
-        },
-        "note": "Index 0 = LEGIT (probability%), Index 1 = PHISH (probability%)"
+        "label_mapping": _LABEL_MAPPING,
     }


 @app.get("/debug/labels")
 def debug_labels():
-    """View model configuration"""
+    """View complete model configuration"""
     _load_model()

     id2label_raw = getattr(_model.config, "id2label", {}) or {}
@@ -280,14 +311,12 @@ def debug_labels():

     return {
         "status": "ok",
-        "
-        "
-        "
-        "applied_mapping": {
-
-
-        },
-        "device": _device
+        "model_config_id2label": id2label_raw,
+        "model_config_label2id": label2id_raw,
+        "model_config_num_labels": num_labels,
+        "applied_mapping": _LABEL_MAPPING,
+        "device": _device,
+        "note": "applied_mapping is what gets used for predictions"
     }

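Since both endpoints now expose the live `_LABEL_MAPPING`, the fix can be verified from outside the Space. A hedged example, assuming the FastAPI app is served locally on port 7860 (the usual Spaces port; adjust the base URL for your deployment):

import requests

BASE = "http://localhost:7860"  # assumption: local uvicorn / default Spaces port

root = requests.get(f"{BASE}/").json()
print(root["label_mapping"])  # int keys become strings in JSON, e.g. {"0": "LEGIT", "1": "PHISH"}

dbg = requests.get(f"{BASE}/debug/labels").json()
# applied_mapping is what _predict_texts actually uses; compare against the raw config
print(dbg["model_config_id2label"])
print(dbg["applied_mapping"])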