Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Community

Perth0603 committed on Nov 8, 2025

Commit

7e1eb79

verified ·

1 Parent(s): 6823e29

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -85

app.py CHANGED Viewed

@@ -126,59 +126,11 @@ _tokenizer = None
 _model = None
 _device = "cpu"
 _preprocessor = None
-_LABEL_MAPPING = None
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
-def _get_label_mapping():
-    """
-    Get complete label mapping.
-    If model config is incomplete, use fallback mapping.
-    """
-    global _model, _LABEL_MAPPING
-    if _model is None:
-        return None
-    id2label = getattr(_model.config, "id2label", {}) or {}
-    # Check if mapping is incomplete (missing label 0)
-    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
-    print(f"DEBUG: num_labels = {num_labels}")
-    print(f"DEBUG: id2label from config = {id2label}")
-    # If incomplete, use fallback
-    if len(id2label) < num_labels:
-        print(f"WARNING: Incomplete label mapping detected!")
-        print(f"Expected {num_labels} labels, got {len(id2label)}")
-        # Try to load from labels.json if available
-        try:
-            import pkg_resources
-            model_path = pkg_resources.resource_filename(__name__, 'models')
-            labels_path = os.path.join(model_path, 'labels.json')
-            if os.path.exists(labels_path):
-                with open(labels_path, 'r') as f:
-                    labels_data = json.load(f)
-                    id2label = labels_data.get("id2label", {})
-                    print(f"Loaded labels from labels.json: {id2label}")
-        except:
-            pass
-        # Final fallback mapping
-        if len(id2label) < 2:
-            print("Using fallback label mapping: 0=LEGIT, 1=PHISH")
-            id2label = {
-                "0": "LEGIT",
-                "1": "PHISH"
-            }
-    return id2label
 def _normalize_label(txt: str) -> str:
     """Normalize label text"""
     t = (str(txt) if txt is not None else "").strip().upper()
@@ -191,7 +143,7 @@ def _normalize_label(txt: str) -> str:
 def _load_model():
     """Load model, tokenizer, and preprocessor"""
-    global _tokenizer, _model, _device, _preprocessor, _LABEL_MAPPING
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -206,9 +158,6 @@ def _load_model():
         _model.eval()
         _preprocessor = TextPreprocessor()
-        # Get label mapping
-        _LABEL_MAPPING = _get_label_mapping()
         # Warm-up
         with torch.no_grad():
             _ = _model(
@@ -217,14 +166,17 @@ def _load_model():
             ).logits
         num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
         print(f"Number of labels: {num_labels}")
-        print(f"Label mapping: {_LABEL_MAPPING}")
         print(f"{'='*60}\n")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
     """
-    Predict with corrected label mapping
     """
     _load_model()
     if not texts:
@@ -250,39 +202,43 @@ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)
-    # Build label list from mapping
-    num_labels = probs.shape[-1]
-    labels_by_idx = []
-    for i in range(num_labels):
-        label = _LABEL_MAPPING.get(str(i), f"LABEL_{i}")
-        labels_by_idx.append(label)
-    print(f"DEBUG: Using labels: {labels_by_idx}")
     outputs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
-        idx = int(torch.argmax(p).item())
-        raw_label = labels_by_idx[idx]
-        norm_label = _normalize_label(raw_label)
-        # Build probability map
-        prob_map: Dict[str, float] = {}
-        for j in range(len(labels_by_idx)):
-            label_norm = _normalize_label(labels_by_idx[j])
-            prob_map[label_norm] = float(p[j].item())
         output = {
             "text": texts[i][:100] + "..." if len(texts[i]) > 100 else texts[i],
-            "label": norm_label,
-            "raw_label": raw_label,
-            "is_phish": norm_label == "PHISH",
-            "score": round(float(p[idx].item()), 4),
-            "confidence": round(float(p[idx].item()), 4),
-            "predicted_index": idx,
-            "probs": {k: round(v, 4) for k, v in prob_map.items()},
-            "all_probs_raw": [round(float(p[j].item()), 4) for j in range(len(labels_by_idx))],
         }
         if include_preprocessing and preprocessing_info:
@@ -305,13 +261,17 @@ def root():
         "status": "ok",
         "model": MODEL_ID,
         "device": _device,
-        "label_mapping": _LABEL_MAPPING,
     }
 @app.get("/debug/labels")
 def debug_labels():
-    """View complete model configuration"""
     _load_model()
     id2label_raw = getattr(_model.config, "id2label", {}) or {}
@@ -323,9 +283,11 @@ def debug_labels():
         "config_id2label": id2label_raw,
         "config_label2id": label2id_raw,
         "config_num_labels": num_labels,
-        "applied_label_mapping": _LABEL_MAPPING,
-        "device": _device,
-        "note": "If config_id2label is incomplete, applied_label_mapping is used"
     }

 _model = None
 _device = "cpu"
 _preprocessor = None
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
 def _normalize_label(txt: str) -> str:
     """Normalize label text"""
     t = (str(txt) if txt is not None else "").strip().upper()
 def _load_model():
     """Load model, tokenizer, and preprocessor"""
+    global _tokenizer, _model, _device, _preprocessor
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
         _model.eval()
         _preprocessor = TextPreprocessor()
         # Warm-up
         with torch.no_grad():
             _ = _model(
             ).logits
         num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
+        id2label = getattr(_model.config, "id2label", {}) or {}
         print(f"Number of labels: {num_labels}")
+        print(f"Label mapping: {id2label}")
         print(f"{'='*60}\n")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
     """
+    Predict with CORRECT label indexing.
+    Index 0 = LEGIT, Index 1 = PHISH
     """
     _load_model()
     if not texts:
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)
+    # CORRECT LABEL MAPPING
+    # Index 0 = LEGIT (probs[i][0])
+    # Index 1 = PHISH (probs[i][1])
+    labels_by_idx = ["LEGIT", "PHISH"]
     outputs: List[Dict] = []
     for i in range(probs.shape[0]):
         p = probs[i]
+        # Get probabilities for each class
+        prob_legit = float(p[0].item())
+        prob_phish = float(p[1].item())
+        # Determine prediction based on which is higher
+        if prob_phish > prob_legit:
+            predicted_label = "PHISH"
+            predicted_idx = 1
+            confidence = prob_phish
+        else:
+            predicted_label = "LEGIT"
+            predicted_idx = 0
+            confidence = prob_legit
         output = {
             "text": texts[i][:100] + "..." if len(texts[i]) > 100 else texts[i],
+            "label": predicted_label,
+            "is_phish": predicted_label == "PHISH",
+            "confidence": round(confidence * 100, 2),  # Convert to percentage
+            "predicted_index": predicted_idx,
+            "probs": {
+                "LEGIT": round(prob_legit * 100, 2),
+                "PHISH": round(prob_phish * 100, 2),
+            },
+            "raw_probs": {
+                "LEGIT (index 0)": round(prob_legit, 4),
+                "PHISH (index 1)": round(prob_phish, 4),
+            }
         }
         if include_preprocessing and preprocessing_info:
         "status": "ok",
         "model": MODEL_ID,
         "device": _device,
+        "label_mapping": {
+            "0": "LEGIT",
+            "1": "PHISH"
+        },
+        "note": "Index 0 = LEGIT (probability%), Index 1 = PHISH (probability%)"
     }
 @app.get("/debug/labels")
 def debug_labels():
+    """View model configuration"""
     _load_model()
     id2label_raw = getattr(_model.config, "id2label", {}) or {}
         "config_id2label": id2label_raw,
         "config_label2id": label2id_raw,
         "config_num_labels": num_labels,
+        "applied_mapping": {
+            "0": "LEGIT",
+            "1": "PHISH"
+        },
+        "device": _device
     }