Spaces:

Perth0603
/

phishwatch-proxy

Sleeping

App Files Files Community

Perth0603 committed on Nov 8, 2025

Commit

1c170d1

verified ·

1 Parent(s): 4e51678

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -41

app.py CHANGED Viewed

@@ -132,20 +132,42 @@ _LABEL_MAPPING = None
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
 def _get_label_mapping():
-    """Get complete label mapping from model config"""
     global _model
     if _model is None:
         return None
     id2label = getattr(_model.config, "id2label", {}) or {}
-    num_labels = int(getattr(_model.config, "num_labels", 0) or 0)
-    print(f"[DEBUG] Raw id2label from config: {id2label}")
-    print(f"[DEBUG] num_labels: {num_labels}")
-    # Build complete mapping by index
     complete_mapping = {}
     for i in range(num_labels):
         if str(i) in id2label:
@@ -155,15 +177,15 @@ def _get_label_mapping():
         else:
             complete_mapping[i] = f"LABEL_{i}"
-    # If incomplete, use fallback
-    if len(complete_mapping) < num_labels:
-        print(f"[WARNING] Incomplete mapping! Using fallback.")
         complete_mapping = {
             0: "LEGIT",
             1: "PHISH"
         }
-    print(f"[DEBUG] Complete mapping applied: {complete_mapping}")
     return complete_mapping
@@ -184,8 +206,8 @@ def _load_model():
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"\n{'='*60}")
-        print(f"Loading model on device: {_device}")
-        print(f"Model ID: {MODEL_ID}")
         print(f"{'='*60}\n")
         _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -194,7 +216,7 @@ def _load_model():
         _model.eval()
         _preprocessor = TextPreprocessor()
-        # Get label mapping
         _LABEL_MAPPING = _get_label_mapping()
         # Warm-up
@@ -204,14 +226,11 @@ def _load_model():
                 .to(_device)
             ).logits
-        print(f"{'='*60}\n")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
-    """
-    Predict with correct label index mapping
-    CRITICAL: probs[i][j] where j is the CLASS INDEX, not probability value
-    """
     _load_model()
     if not texts:
         return []
@@ -237,31 +256,23 @@ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List
         probs = torch.softmax(logits, dim=-1)
     num_labels = probs.shape[-1]
-    print(f"\n[DEBUG] num_labels from probs shape: {num_labels}")
     outputs: List[Dict] = []
     for text_idx in range(probs.shape[0]):
-        p = probs[text_idx]  # Get probabilities for this text: shape [num_labels]
-        # Create probability breakdown for ALL classes
         prob_breakdown = {}
-        all_probs_list = []
         for class_idx in range(num_labels):
-            class_prob = float(p[class_idx].item())
             class_label = _LABEL_MAPPING.get(class_idx, f"CLASS_{class_idx}")
             prob_breakdown[class_label] = round(class_prob, 4)
-            all_probs_list.append(class_prob)
-            print(f"[DEBUG] Class {class_idx} ({class_label}): {round(class_prob, 4)}")
-        # Get argmax index
         predicted_idx = int(torch.argmax(p).item())
         predicted_label_raw = _LABEL_MAPPING.get(predicted_idx, f"CLASS_{predicted_idx}")
         predicted_label_norm = _normalize_label(predicted_label_raw)
         predicted_prob = float(p[predicted_idx].item())
-        print(f"[DEBUG] ARGMAX: index={predicted_idx}, label={predicted_label_raw}, prob={round(predicted_prob, 4)}")
-        print(f"[DEBUG] Normalized label: {predicted_label_norm}")
         output = {
             "text": texts[text_idx][:100] + "..." if len(texts[text_idx]) > 100 else texts[text_idx],
@@ -272,14 +283,12 @@ def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List
             "score": round(predicted_prob, 4),
             "confidence": round(predicted_prob * 100, 2),
             "probs_by_class": prob_breakdown,
-            "all_probs_raw": [round(p_val, 4) for p_val in all_probs_list],
         }
         if include_preprocessing and preprocessing_info:
             output["preprocessing"] = preprocessing_info[text_idx]
         outputs.append(output)
-        print(f"\n")
     return outputs
@@ -302,7 +311,7 @@ def root():
 @app.get("/debug/labels")
 def debug_labels():
-    """View complete model configuration"""
     _load_model()
     id2label_raw = getattr(_model.config, "id2label", {}) or {}
@@ -316,7 +325,6 @@ def debug_labels():
         "model_config_num_labels": num_labels,
         "applied_mapping": _LABEL_MAPPING,
         "device": _device,
-        "note": "applied_mapping is what gets used for predictions"
     }
@@ -326,12 +334,9 @@ def debug_preprocessing(payload: PredictPayload):
     try:
         _load_model()
         preprocessing = _preprocessor.preprocess(payload.inputs)
-        return {
-            "status": "ok",
-            "preprocessing": preprocessing
-        }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error: {e}")
 @app.post("/predict")
@@ -341,7 +346,7 @@ def predict(payload: PredictPayload):
         res = _predict_texts([payload.inputs], include_preprocessing=payload.include_preprocessing)
         return res[0]
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error: {e}")
 @app.post("/predict-batch")
@@ -350,7 +355,7 @@ def predict_batch(payload: BatchPredictPayload):
     try:
         return _predict_texts(payload.inputs, include_preprocessing=payload.include_preprocessing)
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error: {e}")
 @app.post("/evaluate")
@@ -385,7 +390,7 @@ def evaluate(payload: EvalPayload):
             "per_class": per_class,
         }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error: {e}")
 if __name__ == "__main__":

 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
+def _load_labels_from_hf():
+    """Try to load labels.json from HuggingFace model repo.
+
+    Fetches ``labels.json`` from the repo identified by ``MODEL_ID`` and
+    returns the value stored under its ``"id2label"`` key.
+
+    Returns:
+        The ``"id2label"`` entry of the parsed JSON, an empty dict when the
+        parsed object has no such key, or ``None`` when any step raises
+        (a ``[WARNING]`` line is printed in that case).
+    """
+    try:
+        # Imported inside the try so that a failing import is reported through
+        # the same warning path as a failed download.
+        from huggingface_hub import hf_hub_download
+        labels_file = hf_hub_download(repo_id=MODEL_ID, filename="labels.json")
+        # NOTE(review): no encoding is passed to open(), so the platform
+        # default is used - confirm labels.json is always ASCII/UTF-8 safe.
+        with open(labels_file, 'r') as f:
+            labels_data = json.load(f)
+            # A missing "id2label" key yields {} rather than None; the caller
+            # must treat both as "no labels available".
+            return labels_data.get("id2label", {})
+    except Exception as e:
+        # Best-effort lookup: every failure is logged and turned into None
+        # instead of propagating.
+        print(f"[WARNING] Could not load labels.json from HF: {e}")
+        return None
 def _get_label_mapping():
+    """Get complete label mapping with multiple fallback strategies"""
     global _model
     if _model is None:
         return None
+    # Strategy 1: Try model config
     id2label = getattr(_model.config, "id2label", {}) or {}
+    num_labels = int(getattr(_model.config, "num_labels", 2) or 2)
+    print(f"[DEBUG] Model config id2label: {id2label}")
+    print(f"[DEBUG] Model config num_labels: {num_labels}")
+    # Strategy 2: If incomplete, try labels.json from HuggingFace
+    if len(id2label) < num_labels:
+        print(f"[WARNING] Incomplete id2label in config! Trying labels.json...")
+        hf_labels = _load_labels_from_hf()
+        if hf_labels and len(hf_labels) >= num_labels:
+            id2label = hf_labels
+            print(f"[SUCCESS] Loaded labels from labels.json: {id2label}")
+    # Strategy 3: Convert string keys to int keys
     complete_mapping = {}
     for i in range(num_labels):
         if str(i) in id2label:
         else:
             complete_mapping[i] = f"LABEL_{i}"
+    # Strategy 4: Final fallback if still incomplete
+    if len(complete_mapping) < num_labels or any(v.startswith("LABEL_") for v in complete_mapping.values()):
+        print(f"[WARNING] Using hardcoded fallback mapping!")
         complete_mapping = {
             0: "LEGIT",
             1: "PHISH"
         }
+    print(f"[FINAL] Applied label mapping: {complete_mapping}")
     return complete_mapping
     if _tokenizer is None or _model is None:
         _device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"\n{'='*60}")
+        print(f"Loading model: {MODEL_ID}")
+        print(f"Device: {_device}")
         print(f"{'='*60}\n")
         _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         _model.eval()
         _preprocessor = TextPreprocessor()
+        # Get label mapping with fallbacks
         _LABEL_MAPPING = _get_label_mapping()
         # Warm-up
                 .to(_device)
             ).logits
+        print(f"Model loaded successfully!\n{'='*60}\n")
 def _predict_texts(texts: List[str], include_preprocessing: bool = True) -> List[Dict]:
+    """Predict with correct label mapping"""
     _load_model()
     if not texts:
         return []
         probs = torch.softmax(logits, dim=-1)
     num_labels = probs.shape[-1]
     outputs: List[Dict] = []
     for text_idx in range(probs.shape[0]):
+        p = probs[text_idx]
+        # Build probability breakdown
         prob_breakdown = {}
         for class_idx in range(num_labels):
             class_label = _LABEL_MAPPING.get(class_idx, f"CLASS_{class_idx}")
+            class_prob = float(p[class_idx].item())
             prob_breakdown[class_label] = round(class_prob, 4)
+        # Get prediction
         predicted_idx = int(torch.argmax(p).item())
         predicted_label_raw = _LABEL_MAPPING.get(predicted_idx, f"CLASS_{predicted_idx}")
         predicted_label_norm = _normalize_label(predicted_label_raw)
         predicted_prob = float(p[predicted_idx].item())
         output = {
             "text": texts[text_idx][:100] + "..." if len(texts[text_idx]) > 100 else texts[text_idx],
             "score": round(predicted_prob, 4),
             "confidence": round(predicted_prob * 100, 2),
             "probs_by_class": prob_breakdown,
         }
         if include_preprocessing and preprocessing_info:
             output["preprocessing"] = preprocessing_info[text_idx]
         outputs.append(output)
     return outputs
 @app.get("/debug/labels")
 def debug_labels():
+    """View model configuration"""
     _load_model()
     id2label_raw = getattr(_model.config, "id2label", {}) or {}
         "model_config_num_labels": num_labels,
         "applied_mapping": _LABEL_MAPPING,
         "device": _device,
     }
     try:
         _load_model()
         preprocessing = _preprocessor.preprocess(payload.inputs)
+        return preprocessing
     except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
 @app.post("/predict")
         res = _predict_texts([payload.inputs], include_preprocessing=payload.include_preprocessing)
         return res[0]
     except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
 @app.post("/predict-batch")
     try:
         return _predict_texts(payload.inputs, include_preprocessing=payload.include_preprocessing)
     except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
 @app.post("/evaluate")
             "per_class": per_class,
         }
     except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":