Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Files Community

Perth0603 committed on Nov 8, 2025

Commit

b418015

verified ·

1 Parent(s): e17ff4f

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -90

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import os
 from typing import List, Optional, Dict
 import re
-import json
 import torch
 import nltk
@@ -21,11 +20,8 @@ except LookupError:
     nltk.download('stopwords')
     nltk.download('wordnet')
-MODEL_ID = (
-    os.environ.get("MODEL_ID")
-    or os.environ.get("HF_MODEL_ID")
-    or "Perth0603/phishing-email-mobilebert"
-)
 app = FastAPI(title="Phishing Text Classifier with Preprocessing", version="1.0.0")
@@ -126,69 +122,11 @@ _tokenizer = None
 _model = None
 _device = "cpu"
 _preprocessor = None
-_LABEL_MAPPING = None
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
-def _load_labels_from_hf():
-    """Try to load labels.json from HuggingFace model repo"""
-    try:
-        from huggingface_hub import hf_hub_download
-        labels_file = hf_hub_download(repo_id=MODEL_ID, filename="labels.json")
-        with open(labels_file, 'r') as f:
-            labels_data = json.load(f)
-            return labels_data.get("id2label", {})
-    except Exception as e:
-        print(f"[WARNING] Could not load labels.json from HF: {e}")
-        return None
-def _get_label_mapping():
-    """Get complete label mapping with multiple fallback strategies"""
-    global _model
-    if _model is None:
-        return None
-    # Strategy 1: Try model config
-    id2label = getattr(_model.config, "id2label", {}) or {}
-    num_labels = int(getattr(_model.config, "num_labels", 2) or 2)
-    print(f"[DEBUG] Model config id2label: {id2label}")
-    print(f"[DEBUG] Model config num_labels: {num_labels}")
-    # Strategy 2: If incomplete, try labels.json from HuggingFace
-    if len(id2label) < num_labels:
-        print(f"[WARNING] Incomplete id2label in config! Trying labels.json...")
-        hf_labels = _load_labels_from_hf()
-        if hf_labels and len(hf_labels) >= num_labels:
-            id2label = hf_labels
-            print(f"[SUCCESS] Loaded labels from labels.json: {id2label}")
-    # Strategy 3: Convert string keys to int keys
-    complete_mapping = {}
-    for i in range(num_labels):
-        if str(i) in id2label:
-            complete_mapping[i] = id2label[str(i)]
-        elif i in id2label:
-            complete_mapping[i] = id2label[i]
-        else:
-            complete_mapping[i] = f"LABEL_{i}"
-    # Strategy 4: Final fallback if still incomplete
-    if len(complete_mapping) < num_labels or any(v.startswith("LABEL_") for v in complete_mapping.values()):
-        print(f"[WARNING] Using hardcoded fallback mapping!")
-        complete_mapping = {
-            0: "LEGIT",
-            1: "PHISH"
-        }
-    print(f"[FINAL] Applied label mapping: {complete_mapping}")
-    return complete_mapping
 def _normalize_label(txt: str) -> str:
     """Normalize label text"""
     t = (str(txt) if txt is not None else "").strip().upper()
@@ -201,7 +139,7 @@ def _normalize_label(txt: str) -> str:
 def _load_model():
     """Load model, tokenizer, and preprocessor"""
-    global _tokenizer, _model, _device, _preprocessor, _LABEL_MAPPING
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -216,9 +154,6 @@ def _load_model():
         _model.eval()
         _preprocessor = TextPreprocessor()
-        # Get label mapping with fallbacks
-        _LABEL_MAPPING = _get_label_mapping()
         # Warm-up
         with torch.no_grad():
             _ = _model(
@@ -226,7 +161,10 @@ def _load_model():
                 .to(_device)
             ).logits
-        print(f"Model loaded successfully!\n{'='*60}\n")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
@@ -255,34 +193,33 @@ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)
-    num_labels = probs.shape[-1]
     outputs: List[Dict] = []
     for text_idx in range(probs.shape[0]):
         p = probs[text_idx]
-        # Build probability breakdown
-        prob_breakdown = {}
-        for class_idx in range(num_labels):
-            class_label = _LABEL_MAPPING.get(class_idx, f"CLASS_{class_idx}")
-            class_prob = float(p[class_idx].item())
-            prob_breakdown[class_label] = round(class_prob, 4)
         # Get prediction
         predicted_idx = int(torch.argmax(p).item())
-        predicted_label_raw = _LABEL_MAPPING.get(predicted_idx, f"CLASS_{predicted_idx}")
         predicted_label_norm = _normalize_label(predicted_label_raw)
         predicted_prob = float(p[predicted_idx].item())
         output = {
             "text": texts[text_idx][:100] + "..." if len(texts[text_idx]) > 100 else texts[text_idx],
-            "predicted_class_index": predicted_idx,
             "label": predicted_label_norm,
             "raw_label": predicted_label_raw,
             "is_phish": predicted_label_norm == "PHISH",
-            "score": round(predicted_prob, 4),
             "confidence": round(predicted_prob * 100, 2),
-            "probs_by_class": prob_breakdown,
         }
         if include_preprocessing and preprocessing_info:
@@ -305,7 +242,6 @@ def root():
         "status": "ok",
         "model": MODEL_ID,
         "device": _device,
-        "label_mapping": _LABEL_MAPPING,
     }
@@ -314,16 +250,12 @@ def debug_labels():
     """View model configuration"""
     _load_model()
-    id2label_raw = getattr(_model.config, "id2label", {}) or {}
-    label2id_raw = getattr(_model.config, "label2id", {}) or {}
-    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
     return {
         "status": "ok",
-        "model_config_id2label": id2label_raw,
-        "model_config_label2id": label2id_raw,
-        "model_config_num_labels": num_labels,
-        "applied_mapping": _LABEL_MAPPING,
         "device": _device,
     }

 import os
 from typing import List, Optional, Dict
 import re
 import torch
 import nltk
     nltk.download('stopwords')
     nltk.download('wordnet')
+# ✅ CHANGE THIS TO POINT TO YOUR MODEL REPOSITORY
+MODEL_ID = "Perth0603/phishing-email-mobilebert"  # ← Your model storage repo
 app = FastAPI(title="Phishing Text Classifier with Preprocessing", version="1.0.0")
 _model = None
 _device = "cpu"
 _preprocessor = None
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
 def _normalize_label(txt: str) -> str:
     """Normalize label text"""
     t = (str(txt) if txt is not None else "").strip().upper()
 def _load_model():
     """Load model, tokenizer, and preprocessor"""
+    global _tokenizer, _model, _device, _preprocessor
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
         _model.eval()
         _preprocessor = TextPreprocessor()
         # Warm-up
         with torch.no_grad():
             _ = _model(
                 .to(_device)
             ).logits
+        # Check label mapping
+        id2label = getattr(_model.config, "id2label", {})
+        print(f"Model labels: {id2label}")
+        print(f"{'='*60}\n")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
         logits = _model(**enc).logits
         probs = torch.softmax(logits, dim=-1)
+    # Get labels from model config
+    id2label = getattr(_model.config, "id2label", {0: "LEGIT", 1: "PHISH"})
     outputs: List[Dict] = []
     for text_idx in range(probs.shape[0]):
         p = probs[text_idx]
         # Get prediction
         predicted_idx = int(torch.argmax(p).item())
+        predicted_label_raw = id2label.get(predicted_idx, f"CLASS_{predicted_idx}")
         predicted_label_norm = _normalize_label(predicted_label_raw)
         predicted_prob = float(p[predicted_idx].item())
+        # Build probability breakdown
+        prob_breakdown = {}
+        for i in range(len(p)):
+            label = _normalize_label(id2label.get(i, f"CLASS_{i}"))
+            prob_breakdown[label] = round(float(p[i].item()), 4)
         output = {
             "text": texts[text_idx][:100] + "..." if len(texts[text_idx]) > 100 else texts[text_idx],
             "label": predicted_label_norm,
             "raw_label": predicted_label_raw,
             "is_phish": predicted_label_norm == "PHISH",
             "confidence": round(predicted_prob * 100, 2),
+            "score": round(predicted_prob, 4),
+            "probs": prob_breakdown,
         }
         if include_preprocessing and preprocessing_info:
         "status": "ok",
         "model": MODEL_ID,
         "device": _device,
     }
     """View model configuration"""
     _load_model()
     return {
         "status": "ok",
+        "model_id": MODEL_ID,
+        "id2label": getattr(_model.config, "id2label", {}),
+        "label2id": getattr(_model.config, "label2id", {}),
+        "num_labels": int(getattr(_model.config, "num_labels", 0)),
         "device": _device,
     }