Spaces:

Perth0603
/

Random-Forest-Model-for-PhishingDetection

Sleeping

App Files Community

Perth0603 committed on Oct 4, 2025

Commit

54fa158

verified ·

1 Parent(s): 6a642c0

Upload app.py

Browse files

Files changed (1) hide show

app.py +52 -13

app.py CHANGED Viewed

@@ -27,9 +27,10 @@ except Exception:
 # Environment / config
 # -------------------------
 MODEL_ID = os.environ.get("MODEL_ID", "Perth0603/phishing-email-mobilebert")
-URL_REPO = os.environ.get("URL_REPO", "Perth0603/Random-Forest-Model-for-PhishingDetection")
-URL_REPO_TYPE = os.environ.get("URL_REPO_TYPE", "model")  # model|space|dataset
-URL_FILENAME = os.environ.get("URL_FILENAME", "rf_url_phishing_xgboost_bst.joblib")
 CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -185,10 +186,47 @@ def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[st
     phishy = _AUTOCALIB_PHISHY_URLS
     legit = _AUTOCALIB_LEGIT_URLS
     def _batch_mean(urls: List[str]) -> float:
         df = pd.DataFrame({url_col: urls})
-        f = _engineer_features(df, url_col, feature_cols)
-        return float(np.mean([_xgb_predict_class1_prob(bundle["model"], pd.DataFrame([f.iloc[i]])) for i in range(len(f))]))
     try:
         phishy_mean = _batch_mean(phishy)
@@ -212,16 +250,15 @@ def _startup():
         print(f"[startup] text model load failed: {e}")
     try:
         _load_url_model()
-        # Calibrate for XGB if needed
         global _url_phish_is_positive
         b = _url_bundle
-        if isinstance(b, dict) and b.get("model_type") == "xgboost_bst" and _url_phish_is_positive is None:
-            if xgb is None:
-                print("[startup] xgboost not installed; cannot calibrate URL model.")
-            else:
                 feature_cols: List[str] = b.get("feature_cols") or []
                 url_col: str = b.get("url_col") or "url"
                 _url_phish_is_positive = _auto_calibrate_phish_positive(b, feature_cols, url_col)
     except Exception as e:
         print(f"[startup] url model load failed: {e}")
@@ -288,10 +325,12 @@ def predict_url(payload: PredictUrlPayload):
         elif meta_phish_is_positive is not None:
             phish_is_positive = bool(meta_phish_is_positive)
         else:
-            # If not yet calibrated, do it now for xgb
             global _url_phish_is_positive
-            if _url_phish_is_positive is None and model_type == "xgboost_bst" and xgb is not None:
-                _url_phish_is_positive = _auto_calibrate_phish_positive(bundle, feature_cols, url_col)
             phish_is_positive = _url_phish_is_positive if _url_phish_is_positive is not None else True
         backend_debug = {

 # Environment / config
 # -------------------------
 MODEL_ID = os.environ.get("MODEL_ID", "Perth0603/phishing-email-mobilebert")
+# Support both legacy and HF_* envs
+URL_REPO = os.environ.get("HF_URL_MODEL_ID", os.environ.get("URL_REPO", "Perth0603/Random-Forest-Model-for-PhishingDetection"))
+URL_REPO_TYPE = os.environ.get("HF_URL_REPO_TYPE", os.environ.get("URL_REPO_TYPE", "model"))  # model|space|dataset
+URL_FILENAME = os.environ.get("HF_URL_FILENAME", os.environ.get("URL_FILENAME", "rf_url_phishing_xgboost_bst.joblib"))
 CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
     phishy = _AUTOCALIB_PHISHY_URLS
     legit = _AUTOCALIB_LEGIT_URLS
+    model = bundle.get("model")
+    model_type: str = str(bundle.get("model_type") or "")
     def _batch_mean(urls: List[str]) -> float:
         df = pd.DataFrame({url_col: urls})
+        feats = _engineer_features(df, url_col, feature_cols)
+        # XGBoost booster path
+        if model_type == "xgboost_bst" and xgb is not None:
+            try:
+                # Predict row-by-row to be conservative about input formats
+                return float(np.mean([_xgb_predict_class1_prob(model, pd.DataFrame([feats.iloc[i]])) for i in range(len(feats))]))
+            except Exception:
+                pass
+        # scikit-learn-like path with predict_proba
+        if hasattr(model, "predict_proba"):
+            proba = model.predict_proba(feats)
+            classes = bundle.get("classes", getattr(model, "classes_", None))
+            class1_idx = 1
+            if classes is not None:
+                try:
+                    classes_list = list(classes)
+                    if 1 in classes_list:
+                        class1_idx = classes_list.index(1)
+                    else:
+                        class1_idx = 1 if len(classes_list) > 1 else 0
+                except Exception:
+                    class1_idx = 1 if proba.shape[1] > 1 else 0
+            return float(np.mean(proba[:, class1_idx]))
+        # Fallback: use hard predictions and treat label==1 as prob 1
+        try:
+            preds = model.predict(feats)
+            vals: List[float] = []
+            for p in preds:
+                if isinstance(p, (int, float, np.integer, np.floating)):
+                    vals.append(1.0 if int(p) == 1 else 0.0)
+                else:
+                    up = str(p).strip().upper()
+                    vals.append(1.0 if up.startswith("PHISH") or up == "1" else 0.0)
+            return float(np.mean(vals)) if vals else 0.0
+        except Exception:
+            return 0.0
     try:
         phishy_mean = _batch_mean(phishy)
         print(f"[startup] text model load failed: {e}")
     try:
         _load_url_model()
         global _url_phish_is_positive
         b = _url_bundle
+        if isinstance(b, dict) and _url_phish_is_positive is None:
+            try:
                 feature_cols: List[str] = b.get("feature_cols") or []
                 url_col: str = b.get("url_col") or "url"
                 _url_phish_is_positive = _auto_calibrate_phish_positive(b, feature_cols, url_col)
+            except Exception as ce:
+                print(f"[startup] url model calibration failed: {ce}")
     except Exception as e:
         print(f"[startup] url model load failed: {e}")
         elif meta_phish_is_positive is not None:
             phish_is_positive = bool(meta_phish_is_positive)
         else:
             global _url_phish_is_positive
+            if _url_phish_is_positive is None:
+                try:
+                    _url_phish_is_positive = _auto_calibrate_phish_positive(bundle, feature_cols, url_col)
+                except Exception as ce:
+                    print(f"[predict-url] auto-calibration failed: {ce}")
             phish_is_positive = _url_phish_is_positive if _url_phish_is_positive is not None else True
         backend_debug = {