Perth0603 committed on
Commit
bdde6ee
·
verified ·
1 Parent(s): d506ae1

Upload 7 files

Browse files
Files changed (2) hide show
  1. Dockerfile +8 -0
  2. app.py +244 -558
Dockerfile CHANGED
@@ -21,6 +21,14 @@ COPY requirements.txt /app/requirements.txt
21
  RUN pip install -r /app/requirements.txt
22
 
23
  COPY app.py /app/app.py
 
 
 
 
 
 
 
 
24
 
25
  EXPOSE 7860
26
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
# Install Python dependencies first so this layer is cached across code edits.
RUN pip install -r /app/requirements.txt

# Application code and the CSV configuration files it reads at runtime.
COPY app.py /app/app.py
COPY autocalib_phishy.csv /app/autocalib_phishy.csv
COPY autocalib_legit.csv /app/autocalib_legit.csv
COPY known_hosts.csv /app/known_hosts.csv

# Default CSV envs to follow CSVs in image (can be overridden in Space settings)
ENV AUTOCALIB_PHISHY_CSV=/app/autocalib_phishy.csv \
    AUTOCALIB_LEGIT_CSV=/app/autocalib_legit.csv \
    KNOWN_HOSTS_CSV=/app/known_hosts.csv

# 7860 is the standard Hugging Face Spaces port.
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,558 +1,244 @@
1
- import os
2
- os.environ.setdefault("HOME", "/data")
3
- os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
4
- os.environ.setdefault("HF_HOME", "/data/.cache")
5
- os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
6
- os.environ.setdefault("TORCH_HOME", "/data/.cache")
7
-
8
- from typing import Optional, List, Dict, Any
9
- import csv
10
- from urllib.parse import urlparse
11
- import threading
12
- import re
13
- import numpy as np
14
- import pandas as pd
15
- import joblib
16
- import torch
17
- from fastapi import FastAPI
18
- from fastapi.responses import JSONResponse
19
- from pydantic import BaseModel
20
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
21
- from huggingface_hub import hf_hub_download
22
-
23
- try:
24
- import xgboost as xgb # type: ignore
25
- except Exception:
26
- xgb = None
27
-
28
- # -------------------------
29
- # Environment / config
30
- # -------------------------
31
- MODEL_ID = os.environ.get("MODEL_ID", "Perth0603/phishing-email-mobilebert")
32
- # Support both legacy and HF_* envs
33
- URL_REPO = os.environ.get("HF_URL_MODEL_ID", os.environ.get("URL_REPO", "Perth0603/Random-Forest-Model-for-PhishingDetection"))
34
- URL_REPO_TYPE = os.environ.get("HF_URL_REPO_TYPE", os.environ.get("URL_REPO_TYPE", "model")) # model|space|dataset
35
- URL_FILENAME = os.environ.get("HF_URL_FILENAME", os.environ.get("URL_FILENAME", "rf_url_phishing_xgboost_bst.joblib"))
36
- CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
37
- os.makedirs(CACHE_DIR, exist_ok=True)
38
-
39
- # Force-thread cap helps tiny Spaces
40
- torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "1")))
41
-
42
- # Optional manual override (beats everything): "PHISH" or "LEGIT"
43
- URL_POSITIVE_CLASS_ENV = os.environ.get("URL_POSITIVE_CLASS", "").strip().upper() # "", "PHISH", "LEGIT"
44
-
45
- app = FastAPI(title="PhishWatch API", version="1.2.0")
46
-
47
- # -------------------------
48
- # Schemas
49
- # -------------------------
50
- class PredictPayload(BaseModel):
51
- inputs: str
52
-
53
- class PredictUrlPayload(BaseModel):
54
- url: str
55
-
56
- # -------------------------
57
- # Lazy singletons
58
- # -------------------------
59
- _tokenizer: Optional[AutoTokenizer] = None
60
- _model: Optional[AutoModelForSequenceClassification] = None
61
- _id2label: Dict[int, str] = {0: "LEGIT", 1: "PHISH"}
62
- _label2id: Dict[str, int] = {"LEGIT": 0, "PHISH": 1}
63
-
64
- _url_bundle: Optional[Dict[str, Any]] = None
65
- _model_lock = threading.Lock()
66
- _url_lock = threading.Lock()
67
-
68
- # Calibrated flag: is XGB class 1 == PHISH?
69
- _url_phish_is_positive: Optional[bool] = None
70
-
71
- # -------------------------
72
- # Autocalibration URL prototypes (CSV-driven)
73
- # -------------------------
74
- # Provide CSV files for calibration lists to avoid code edits:
75
- # - AUTOCALIB_PHISHY_CSV (default hf_space/autocalib_phishy.csv)
76
- # - AUTOCALIB_LEGIT_CSV (default hf_space/autocalib_legit.csv)
77
- # These lists are loaded at startup and before each request (hot-reload safe).
78
- _AUTOCALIB_PHISHY_URLS: List[str] = []
79
- _AUTOCALIB_LEGIT_URLS: List[str] = []
80
-
81
- # Known host overrides via CSV (suffix-matched):
82
- # - KNOWN_HOSTS_CSV (default hf_space/known_hosts.csv) with columns host,label
83
- _KNOWN_LEGIT_HOSTS: List[str] = []
84
- _KNOWN_PHISH_HOSTS: List[str] = []
85
-
86
- # Helpers to normalize and match hosts by suffix (handles subdomains)
87
- def _normalize_host(value: str) -> str:
88
- v = value.strip().lower()
89
- if v.startswith("www."):
90
- v = v[4:]
91
- return v
92
-
93
- def _host_matches_any(host: str, known: List[str]) -> bool:
94
- base = _normalize_host(host)
95
- for item in known:
96
- k = _normalize_host(item)
97
- if base == k or base.endswith("." + k):
98
- return True
99
- return False
100
-
101
- # Optional CSV configuration
102
- def _read_urls_from_csv(path: str) -> List[str]:
103
- urls: List[str] = []
104
- try:
105
- with open(path, newline="", encoding="utf-8") as f:
106
- reader = csv.DictReader(f)
107
- if "url" in (reader.fieldnames or []):
108
- for row in reader:
109
- val = str(row.get("url", "")).strip()
110
- if val:
111
- urls.append(val)
112
- else:
113
- f.seek(0)
114
- f2 = csv.reader(f)
115
- for row in f2:
116
- if not row:
117
- continue
118
- val = str(row[0]).strip()
119
- if val.lower() == "url":
120
- continue
121
- if val:
122
- urls.append(val)
123
- except Exception as e:
124
- print(f"[csv] failed reading URLs from {path}: {e}")
125
- return urls
126
-
127
- def _read_hosts_from_csv(path: str) -> Dict[str, str]:
128
- host_to_label: Dict[str, str] = {}
129
- try:
130
- with open(path, newline="", encoding="utf-8") as f:
131
- reader = csv.DictReader(f)
132
- fields = [x.lower() for x in (reader.fieldnames or [])]
133
- if "host" in fields and "label" in fields:
134
- for row in reader:
135
- host = str(row.get("host", "")).strip().lower()
136
- label = str(row.get("label", "")).strip().upper()
137
- if host and label in ("PHISH", "LEGIT"):
138
- host_to_label[host] = label
139
- else:
140
- f.seek(0)
141
- f2 = csv.reader(f)
142
- for row in f2:
143
- if len(row) < 2:
144
- continue
145
- host = str(row[0]).strip().lower()
146
- label = str(row[1]).strip().upper()
147
- if host.lower() == "host" and label == "LABEL":
148
- continue
149
- if host and label in ("PHISH", "LEGIT"):
150
- host_to_label[host] = label
151
- except Exception as e:
152
- print(f"[csv] failed reading hosts from {path}: {e}")
153
- return host_to_label
154
-
155
- def _load_csv_configs_if_any():
156
- base_dir = os.path.dirname(__file__)
157
- phishy_csv = os.environ.get("AUTOCALIB_PHISHY_CSV", os.path.join(base_dir, "autocalib_phishy.csv"))
158
- legit_csv = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(base_dir, "autocalib_legit.csv"))
159
- hosts_csv = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(base_dir, "known_hosts.csv"))
160
-
161
- if os.path.exists(phishy_csv):
162
- urls = _read_urls_from_csv(phishy_csv)
163
- if urls:
164
- print(f"[csv] loaded phishy URLs: {len(urls)} from {phishy_csv}")
165
- _AUTOCALIB_PHISHY_URLS[:] = urls
166
- if os.path.exists(legit_csv):
167
- urls = _read_urls_from_csv(legit_csv)
168
- if urls:
169
- print(f"[csv] loaded legit URLs: {len(urls)} from {legit_csv}")
170
- _AUTOCALIB_LEGIT_URLS[:] = urls
171
- if os.path.exists(hosts_csv):
172
- mapping = _read_hosts_from_csv(hosts_csv)
173
- if mapping:
174
- print(f"[csv] loaded known hosts: {len(mapping)} from {hosts_csv}")
175
- _KNOWN_LEGIT_HOSTS.clear()
176
- _KNOWN_PHISH_HOSTS.clear()
177
- for host, label in mapping.items():
178
- if label == "LEGIT":
179
- _KNOWN_LEGIT_HOSTS.append(host)
180
- elif label == "PHISH":
181
- _KNOWN_PHISH_HOSTS.append(host)
182
-
183
- # -------------------------
184
- # URL features (must match training)
185
- # -------------------------
186
- _SUSPICIOUS_TOKENS = ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]
187
- _ipv4_pattern = re.compile(r"(?:\d{1,3}\.){3}\d{1,3}")
188
-
189
- def _engineer_features(df: pd.DataFrame, url_col: str, feature_cols: Optional[List[str]] = None) -> pd.DataFrame:
190
- s = df[url_col].astype(str).fillna("")
191
- out = pd.DataFrame(index=df.index)
192
- out["url_len"] = s.str.len()
193
- out["count_dot"] = s.str.count(r"\.")
194
- out["count_hyphen"] = s.str.count("-")
195
- out["count_digit"] = s.str.count(r"\d")
196
- out["count_at"] = s.str.count("@")
197
- out["count_qmark"] = s.str.count(r"\?")
198
- out["count_eq"] = s.str.count("=")
199
- out["count_slash"] = s.str.count("/")
200
- out["digit_ratio"] = (out["count_digit"] / out["url_len"].replace(0, np.nan)).fillna(0)
201
- out["has_ip"] = s.str.contains(_ipv4_pattern).fillna(False).astype(int)
202
- for tok in _SUSPICIOUS_TOKENS:
203
- out[f"has_{tok}"] = s.str.contains(tok, case=False, regex=False).fillna(False).astype(int)
204
- out["starts_https"] = s.str.startswith("https").astype(int)
205
- out["ends_with_exe"] = s.str.endswith(".exe").astype(int)
206
- out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
207
- return out if not feature_cols else out[feature_cols]
208
-
209
- # -------------------------
210
- # Loaders
211
- # -------------------------
212
- def _load_model():
213
- global _tokenizer, _model, _id2label, _label2id, _text_phish_id
214
- if _tokenizer is None or _model is None:
215
- with _model_lock:
216
- if _tokenizer is None or _model is None:
217
- _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
218
- _model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
219
- cfg = getattr(_model, "config", None)
220
- if cfg is not None and getattr(cfg, "id2label", None):
221
- _id2label = {int(k): v for k, v in cfg.id2label.items()}
222
- _label2id = {v: int(k) for k, v in _id2label.items()}
223
- # Try to detect which index corresponds to PHISH/SPAM
224
- _text_phish_id = _detect_text_phish_id(_id2label)
225
- with torch.no_grad():
226
- _ = _model(**_tokenizer(["warm up"], return_tensors="pt")).logits
227
-
228
- # Detect which label id corresponds to phishing for text models based on label strings
229
- _text_phish_id: Optional[int] = None
230
-
231
- def _detect_text_phish_id(id2label: Dict[int, str]) -> Optional[int]:
232
- candidates_phish = ("PHISH", "SPAM", "MALICIOUS", "POSITIVE")
233
- # Prefer explicit PHISH/SPAM over generic POSITIVE
234
- priority_order = ("PHISH", "SPAM", "MALICIOUS", "POSITIVE")
235
- norm = {k: str(v).strip().upper() for k, v in id2label.items()}
236
- # exact/substring match in priority order
237
- for token in priority_order:
238
- for k, v in norm.items():
239
- if token in v:
240
- return int(k)
241
- return None
242
-
243
- def _load_url_model():
244
- global _url_bundle
245
- if _url_bundle is None:
246
- with _url_lock:
247
- if _url_bundle is None:
248
- local_path = os.path.join(os.getcwd(), URL_FILENAME)
249
- if os.path.exists(local_path):
250
- _url_bundle = joblib.load(local_path)
251
- else:
252
- model_path = hf_hub_download(
253
- repo_id=URL_REPO,
254
- filename=URL_FILENAME,
255
- repo_type=URL_REPO_TYPE,
256
- cache_dir=CACHE_DIR,
257
- )
258
- _url_bundle = joblib.load(model_path)
259
-
260
- def _xgb_predict_class1_prob(booster, feats: pd.DataFrame) -> float:
261
- # predicts P(class==1) under binary:logistic objective
262
- dmat = xgb.DMatrix(feats)
263
- return float(booster.predict(dmat)[0])
264
-
265
- def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[str], url_col: str) -> bool:
266
- """
267
- Heuristic: probe with 'obviously phishy' and 'obviously legit' URLs.
268
- If mean P(class1) for phishy < legit, then class1 ≈ LEGIT → return False.
269
- Otherwise, class1 ≈ PHISH → return True.
270
- """
271
- # If user forces it via env, honor that first.
272
- if URL_POSITIVE_CLASS_ENV in ("PHISH", "LEGIT"):
273
- return URL_POSITIVE_CLASS_ENV == "PHISH"
274
-
275
- # If bundle has explicit flag, use it.
276
- if "phish_is_positive" in bundle:
277
- return bool(bundle["phish_is_positive"])
278
-
279
- phishy = _AUTOCALIB_PHISHY_URLS
280
- legit = _AUTOCALIB_LEGIT_URLS
281
- # Safe fallback if CSVs are missing/empty
282
- if not phishy:
283
- phishy = [
284
- "http://198.51.100.23/login/update?acc=123",
285
- "http://secure-login-account-update.example.com/session?id=123",
286
- "http://bank.verify-update-security.com/confirm",
287
- "http://paypal.com.account-verify.cn/login",
288
- "http://abc.xyz/downloads/invoice.exe",
289
- ]
290
- if not legit:
291
- legit = [
292
- "https://www.wikipedia.org/",
293
- "https://www.microsoft.com/",
294
- "https://www.python.org/",
295
- "https://www.openai.com/",
296
- ]
297
-
298
- model = bundle.get("model")
299
- model_type: str = str(bundle.get("model_type") or "")
300
-
301
- def _batch_mean(urls: List[str]) -> float:
302
- df = pd.DataFrame({url_col: urls})
303
- feats = _engineer_features(df, url_col, feature_cols)
304
- # XGBoost booster path
305
- if model_type == "xgboost_bst" and xgb is not None:
306
- try:
307
- # Predict row-by-row to be conservative about input formats
308
- return float(np.mean([_xgb_predict_class1_prob(model, pd.DataFrame([feats.iloc[i]])) for i in range(len(feats))]))
309
- except Exception:
310
- pass
311
- # scikit-learn-like path with predict_proba
312
- if hasattr(model, "predict_proba"):
313
- proba = model.predict_proba(feats)
314
- classes = bundle.get("classes", getattr(model, "classes_", None))
315
- class1_idx = 1
316
- if classes is not None:
317
- try:
318
- classes_list = list(classes)
319
- if 1 in classes_list:
320
- class1_idx = classes_list.index(1)
321
- else:
322
- class1_idx = 1 if len(classes_list) > 1 else 0
323
- except Exception:
324
- class1_idx = 1 if proba.shape[1] > 1 else 0
325
- return float(np.mean(proba[:, class1_idx]))
326
- # Fallback: use hard predictions and treat label==1 as prob 1
327
- try:
328
- preds = model.predict(feats)
329
- vals: List[float] = []
330
- for p in preds:
331
- if isinstance(p, (int, float, np.integer, np.floating)):
332
- vals.append(1.0 if int(p) == 1 else 0.0)
333
- else:
334
- up = str(p).strip().upper()
335
- vals.append(1.0 if up.startswith("PHISH") or up == "1" else 0.0)
336
- return float(np.mean(vals)) if vals else 0.0
337
- except Exception:
338
- return 0.0
339
-
340
- try:
341
- phishy_mean = _batch_mean(phishy)
342
- legit_mean = _batch_mean(legit)
343
- except Exception as e:
344
- # If anything goes wrong, default to class1=PHISH to mimic common convention
345
- print(f"[autocalib] failed: {e}")
346
- return True
347
-
348
- # If phishy scores LOWER than legit for class1, then class1 is likely LEGIT
349
- class1_is_phish = phishy_mean > legit_mean
350
- print(f"[autocalib] phishy_mean={phishy_mean:.6f} legit_mean={legit_mean:.6f} -> class1_is_phish={class1_is_phish}")
351
- return class1_is_phish
352
-
353
- # Optional: pre-load on startup
354
- @app.on_event("startup")
355
- def _startup():
356
- try:
357
- _load_model()
358
- except Exception as e:
359
- print(f"[startup] text model load failed: {e}")
360
- try:
361
- _load_url_model()
362
- # Load CSV-driven config if present
363
- _load_csv_configs_if_any()
364
- global _url_phish_is_positive
365
- b = _url_bundle
366
- if isinstance(b, dict) and _url_phish_is_positive is None:
367
- try:
368
- feature_cols: List[str] = b.get("feature_cols") or []
369
- url_col: str = b.get("url_col") or "url"
370
- _url_phish_is_positive = _auto_calibrate_phish_positive(b, feature_cols, url_col)
371
- except Exception as ce:
372
- print(f"[startup] url model calibration failed: {ce}")
373
- except Exception as e:
374
- print(f"[startup] url model load failed: {e}")
375
-
376
- # -------------------------
377
- # Routes
378
- # -------------------------
379
- @app.get("/")
380
- def root():
381
- return {"status": "ok", "model": MODEL_ID}
382
-
383
- @app.post("/predict")
384
- def predict(payload: PredictPayload):
385
- try:
386
- _load_model()
387
- text = (payload.inputs or "").strip()
388
- if not text:
389
- return JSONResponse(status_code=400, content={"error": "Empty input"})
390
- with torch.no_grad():
391
- inputs = _tokenizer([text], return_tensors="pt", truncation=True, max_length=512)
392
- logits = _model(**inputs).logits
393
- probs = torch.softmax(logits, dim=-1)[0]
394
- score, idx = torch.max(probs, dim=0)
395
-
396
- # Normalize label to PHISH/LEGIT if we could detect PHISH id
397
- if _text_phish_id is not None and 0 <= _text_phish_id < probs.shape[0]:
398
- phish_prob = float(probs[_text_phish_id])
399
- norm_label = "PHISH" if phish_prob >= 0.5 else "LEGIT"
400
- norm_score = phish_prob if norm_label == "PHISH" else (1.0 - phish_prob)
401
- return {"label": norm_label, "score": float(norm_score), "raw_index": int(idx)}
402
- else:
403
- # Fallback to model's provided labels
404
- label = _id2label.get(int(idx), str(int(idx)))
405
- return {"label": label, "score": float(score), "raw_index": int(idx)}
406
- except Exception as e:
407
- return JSONResponse(status_code=500, content={"error": str(e)})
408
-
409
- @app.post("/predict-url")
410
- def predict_url(payload: PredictUrlPayload):
411
- try:
412
- _load_url_model()
413
- # Load CSV-based config if present (hot-reload safe)
414
- _load_csv_configs_if_any()
415
- bundle = _url_bundle
416
- if not isinstance(bundle, dict) or "model" not in bundle:
417
- raise RuntimeError("Loaded URL artifact is not a bundle dict with 'model'.")
418
-
419
- model = bundle["model"]
420
- feature_cols: List[str] = bundle.get("feature_cols") or []
421
- url_col: str = bundle.get("url_col") or "url"
422
- model_type: str = bundle.get("model_type") or ""
423
-
424
- url_str = (payload.url or "").strip()
425
- if not url_str:
426
- return JSONResponse(status_code=400, content={"error": "Empty url"})
427
-
428
- row = pd.DataFrame({url_col: [url_str]})
429
- feats = _engineer_features(row, url_col, feature_cols)
430
-
431
- # ----- compute P(PHISH) -----
432
- phish_proba: float = 0.0
433
- meta_phish_is_positive: Optional[bool] = bundle.get("phish_is_positive", None)
434
-
435
- # Resolve polarity precedence: ENV > bundle flag > auto-calibration > default True
436
- if URL_POSITIVE_CLASS_ENV in ("PHISH", "LEGIT"):
437
- phish_is_positive = (URL_POSITIVE_CLASS_ENV == "PHISH")
438
- elif meta_phish_is_positive is not None:
439
- phish_is_positive = bool(meta_phish_is_positive)
440
- else:
441
- global _url_phish_is_positive
442
- if _url_phish_is_positive is None:
443
- try:
444
- _url_phish_is_positive = _auto_calibrate_phish_positive(bundle, feature_cols, url_col)
445
- except Exception as ce:
446
- print(f"[predict-url] auto-calibration failed: {ce}")
447
- phish_is_positive = _url_phish_is_positive if _url_phish_is_positive is not None else True
448
-
449
- backend_debug = {
450
- "phish_is_positive_resolved": phish_is_positive,
451
- "phish_is_positive_bundle": meta_phish_is_positive,
452
- "phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
453
- }
454
-
455
- # Known-domain override after polarity is resolved
456
- host = (urlparse(url_str).hostname or "").lower()
457
- if host:
458
- override_label: Optional[str] = None
459
- if _host_matches_any(host, _KNOWN_LEGIT_HOSTS):
460
- override_label = "LEGIT"
461
- elif _host_matches_any(host, _KNOWN_PHISH_HOSTS):
462
- override_label = "PHISH"
463
- if override_label is not None:
464
- # Map numeric label according to resolved polarity
465
- predicted_label_numeric = 1 if ((override_label == "PHISH") == bool(phish_is_positive)) else 0
466
- phish_proba_override = 0.99 if override_label == "PHISH" else 0.01
467
- score_override = phish_proba_override if override_label == "PHISH" else (1.0 - phish_proba_override)
468
- return {
469
- "label": override_label,
470
- "predicted_label": int(predicted_label_numeric),
471
- "score": float(score_override),
472
- "phishing_probability": float(phish_proba_override),
473
- "backend": str(model_type),
474
- "threshold": 0.5,
475
- "override": {
476
- "reason": "known_host",
477
- "host": host,
478
- },
479
- "phish_is_positive": bool(phish_is_positive),
480
- "phish_is_positive_bundle": meta_phish_is_positive,
481
- "phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
482
- "feature_cols": feature_cols,
483
- "url_col": url_col,
484
- }
485
-
486
- raw_p_class1_debug: Optional[float] = None
487
-
488
- if isinstance(model_type, str) and model_type == "xgboost_bst":
489
- if xgb is None:
490
- raise RuntimeError("xgboost is not installed but required for this model bundle.")
491
- dmat = xgb.DMatrix(feats)
492
- raw_p_class1 = float(model.predict(dmat)[0]) # P(class == 1)
493
- raw_p_class1_debug = raw_p_class1
494
- phish_proba = raw_p_class1 if phish_is_positive else (1.0 - raw_p_class1)
495
-
496
- elif hasattr(model, "predict_proba"):
497
- proba = model.predict_proba(feats)[0]
498
- classes = bundle.get("classes", getattr(model, "classes_", None))
499
- label_map = bundle.get("label_map")
500
- if classes is not None and len(proba) == 2:
501
- classes_list = list(classes)
502
- phish_idx = None
503
- if isinstance(label_map, dict):
504
- for i, c in enumerate(classes_list):
505
- mapped = str(label_map.get(int(c), "")).upper()
506
- if mapped.startswith("PHISH"):
507
- phish_idx = i
508
- break
509
- if phish_idx is None:
510
- # fall back to whichever index matches current polarity
511
- # if phish_is_positive → column for class 1, else column for class 0
512
- target_class = 1 if phish_is_positive else 0
513
- if target_class in classes_list:
514
- phish_idx = classes_list.index(target_class)
515
- else:
516
- phish_idx = 1 if phish_is_positive else 0
517
- phish_proba = float(proba[phish_idx])
518
- else:
519
- phish_proba = float(proba[1]) if len(proba) > 1 else float(np.max(proba))
520
-
521
- else:
522
- pred = model.predict(feats)[0]
523
- if isinstance(pred, (int, float, np.integer, np.floating)):
524
- label_numeric = int(pred)
525
- # interpret through polarity
526
- if label_numeric in (0, 1):
527
- phish_proba = 1.0 if ((label_numeric == 1) == phish_is_positive) else 0.0
528
- else:
529
- phish_proba = float(label_numeric) # best-effort
530
- else:
531
- up = str(pred).strip().upper()
532
- phish_proba = 1.0 if up.startswith("PHISH") else 0.0
533
-
534
- phish_proba = float(phish_proba)
535
- label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
536
- score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
537
- # Map to numeric dataset-style label using resolved polarity
538
- # If PHISH is the positive (class 1), PHISH -> 1 else 0; if not, invert
539
- predicted_label_numeric = 1 if ((label == "PHISH") == bool(phish_is_positive)) else 0
540
-
541
- return {
542
- "label": label,
543
- "predicted_label": int(predicted_label_numeric),
544
- "score": float(score),
545
- "phishing_probability": float(phish_proba),
546
- "backend": str(model_type),
547
- "threshold": 0.5,
548
- # Debug/trace so you can see exactly what was used
549
- "phish_is_positive": bool(phish_is_positive),
550
- "phish_is_positive_bundle": meta_phish_is_positive,
551
- "phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
552
- "raw_proba_class1": float(raw_p_class1_debug) if raw_p_class1_debug is not None else None,
553
- "feature_cols": feature_cols,
554
- "url_col": url_col,
555
- }
556
-
557
- except Exception as e:
558
- return JSONResponse(status_code=500, content={"error": str(e)})
 
1
import os

# Environment defaults suitable for HF Spaces.
# IMPORTANT: these must be set BEFORE importing huggingface_hub — it resolves
# HF_HOME / cache directories at import time, so setting them afterwards
# (as the previous ordering did) has no effect on its cache location.
os.environ.setdefault("HOME", "/data")
os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
os.environ.setdefault("HF_HOME", "/data/.cache")
# Kept for transitive consumers even though transformers/torch are no longer
# imported directly by this module.
os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
os.environ.setdefault("TORCH_HOME", "/data/.cache")

import csv
import re
import threading
from typing import Optional, List, Dict, Any
from urllib.parse import urlparse

import joblib
import numpy as np
import pandas as pd
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from huggingface_hub import hf_hub_download
from pydantic import BaseModel

# xgboost is optional: only needed when the bundle's model_type is "xgboost_bst".
try:
    import xgboost as xgb  # type: ignore
except Exception:
    xgb = None


# Config (HF_* names take precedence over the legacy names)
URL_REPO = os.environ.get(
    "HF_URL_MODEL_ID",
    os.environ.get("URL_REPO", "Perth0603/Random-Forest-Model-for-PhishingDetection"),
)
URL_REPO_TYPE = os.environ.get("HF_URL_REPO_TYPE", os.environ.get("URL_REPO_TYPE", "model"))
URL_FILENAME = os.environ.get("HF_URL_FILENAME", os.environ.get("URL_FILENAME", "rf_url_phishing_xgboost_bst.joblib"))
CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/data/.cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# Polarity override: "PHISH" or "LEGIT"; empty means default (class 1 = PHISH)
URL_POSITIVE_CLASS_ENV = os.environ.get("URL_POSITIVE_CLASS", "").strip().upper()

# CSV configuration (defaults to files in same directory)
BASE_DIR = os.path.dirname(__file__)
AUTOCALIB_PHISHY_CSV = os.environ.get("AUTOCALIB_PHISHY_CSV", os.path.join(BASE_DIR, "autocalib_phishy.csv"))
AUTOCALIB_LEGIT_CSV = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(BASE_DIR, "autocalib_legit.csv"))
KNOWN_HOSTS_CSV = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(BASE_DIR, "known_hosts.csv"))


app = FastAPI(title="PhishWatch URL API", version="2.0.0")
51
+
52
+
53
# Request body for POST /predict-url: `url` is the raw URL string to classify.
class PredictUrlPayload(BaseModel):
    url: str
55
+
56
+
57
# Lazily-loaded model artifact (expected: dict with "model", "feature_cols",
# "url_col", "model_type" — TODO confirm against the training-side bundle).
# _url_lock guards the one-time initialization in _load_url_model.
_url_bundle: Optional[Dict[str, Any]] = None
_url_lock = threading.Lock()
59
+
60
+
61
+ def _normalize_host(value: str) -> str:
62
+ v = value.strip().lower()
63
+ if v.startswith("www."):
64
+ v = v[4:]
65
+ return v
66
+
67
+
68
+ def _host_matches_any(host: str, known: List[str]) -> bool:
69
+ base = _normalize_host(host)
70
+ for item in known:
71
+ k = _normalize_host(item)
72
+ if base == k or base.endswith("." + k):
73
+ return True
74
+ return False
75
+
76
+
77
+ def _read_urls_from_csv(path: str) -> List[str]:
78
+ urls: List[str] = []
79
+ try:
80
+ with open(path, newline="", encoding="utf-8") as f:
81
+ reader = csv.DictReader(f)
82
+ if "url" in (reader.fieldnames or []):
83
+ for row in reader:
84
+ val = str(row.get("url", "")).strip()
85
+ if val:
86
+ urls.append(val)
87
+ else:
88
+ f.seek(0)
89
+ f2 = csv.reader(f)
90
+ for row in f2:
91
+ if not row:
92
+ continue
93
+ val = str(row[0]).strip()
94
+ if val.lower() == "url":
95
+ continue
96
+ if val:
97
+ urls.append(val)
98
+ except FileNotFoundError:
99
+ pass
100
+ except Exception as e:
101
+ print(f"[csv] failed reading URLs from {path}: {e}")
102
+ return urls
103
+
104
+
105
+ def _read_hosts_from_csv(path: str) -> Dict[str, str]:
106
+ out: Dict[str, str] = {}
107
+ try:
108
+ with open(path, newline="", encoding="utf-8") as f:
109
+ reader = csv.DictReader(f)
110
+ fields = [x.lower() for x in (reader.fieldnames or [])]
111
+ if "host" in fields and "label" in fields:
112
+ for row in reader:
113
+ host = str(row.get("host", "")).strip()
114
+ label = str(row.get("label", "")).strip().upper()
115
+ if host and label in ("PHISH", "LEGIT"):
116
+ out[host] = label
117
+ except FileNotFoundError:
118
+ pass
119
+ except Exception as e:
120
+ print(f"[csv] failed reading hosts from {path}: {e}")
121
+ return out
122
+
123
+
124
+ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
125
+ s = pd.Series(urls, dtype=str)
126
+ out = pd.DataFrame()
127
+ out["url_len"] = s.str.len().fillna(0)
128
+ out["count_dot"] = s.str.count(r"\.")
129
+ out["count_hyphen"] = s.str.count("-")
130
+ out["count_digit"] = s.str.count(r"\d")
131
+ out["count_at"] = s.str.count("@")
132
+ out["count_qmark"] = s.str.count(r"\?")
133
+ out["count_eq"] = s.str.count("=")
134
+ out["count_slash"] = s.str.count("/")
135
+ out["digit_ratio"] = (out["count_digit"] / out["url_len"].replace(0, np.nan)).fillna(0)
136
+ out["has_ip"] = s.str.contains(r"(?:\d{1,3}\.){3}\d{1,3}").astype(int)
137
+ for tok in ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]:
138
+ out[f"has_{tok}"] = s.str.contains(tok, case=False, regex=False).astype(int)
139
+ out["starts_https"] = s.str.startswith("https").astype(int)
140
+ out["ends_with_exe"] = s.str.endswith(".exe").astype(int)
141
+ out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
142
+ return out[feature_cols]
143
+
144
+
145
def _load_url_model():
    """Load the URL model bundle exactly once (thread-safe, double-checked).

    A local copy of URL_FILENAME in the working directory takes precedence;
    otherwise the artifact is downloaded from the configured Hugging Face
    repo into CACHE_DIR and deserialized with joblib.
    """
    global _url_bundle
    if _url_bundle is not None:
        return
    with _url_lock:
        # Re-check under the lock: another thread may have loaded it already.
        if _url_bundle is not None:
            return
        artifact_path = os.path.join(os.getcwd(), URL_FILENAME)
        if not os.path.exists(artifact_path):
            artifact_path = hf_hub_download(
                repo_id=URL_REPO,
                filename=URL_FILENAME,
                repo_type=URL_REPO_TYPE,
                cache_dir=CACHE_DIR,
            )
        _url_bundle = joblib.load(artifact_path)
161
+
162
+
163
@app.get("/")
def root():
    # Health-check endpoint: confirms the service is up and running in
    # URL-only mode (the text-classification endpoint was removed).
    return {"status": "ok", "backend": "url-only"}
166
+
167
+
168
@app.post("/predict-url")
def predict_url(payload: PredictUrlPayload):
    """Classify a URL as PHISH or LEGIT.

    Evaluation order:
      1. Known-host CSV override (suffix match on the URL's hostname),
         re-read on every request so file edits apply without a restart.
      2. Model probability of class 1, interpreted through the polarity
         setting (URL_POSITIVE_CLASS env; default: class 1 == PHISH).

    Returns a JSON dict with label, numeric label, score and probability.
    Any failure is reported as a 500 JSON payload with the error message.
    """
    try:
        _load_url_model()

        # Only the known-hosts CSV is consulted here. The autocalibration
        # CSVs were previously read on every request but never used after
        # the autocalibration logic was removed — dead per-request I/O.
        host_map = _read_hosts_from_csv(KNOWN_HOSTS_CSV)

        bundle = _url_bundle
        if not isinstance(bundle, dict) or "model" not in bundle:
            raise RuntimeError("Loaded URL artifact is not a bundle dict with 'model'.")

        model = bundle["model"]
        feature_cols: List[str] = bundle.get("feature_cols") or []
        url_col: str = bundle.get("url_col") or "url"
        model_type: str = bundle.get("model_type") or ""

        url_str = (payload.url or "").strip()
        if not url_str:
            return JSONResponse(status_code=400, content={"error": "Empty url"})

        # Polarity: env override or default (class 1 == PHISH). Resolved once;
        # previously this identical expression was duplicated in both branches.
        phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")

        # Known-host override (suffix match handles subdomains)
        host = (urlparse(url_str).hostname or "").lower()
        if host and host_map:
            for known_host, known_label in host_map.items():
                if _host_matches_any(host, [known_host]):
                    label = known_label
                    predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
                    phish_proba = 0.99 if label == "PHISH" else 0.01
                    score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
                    return {
                        "label": label,
                        "predicted_label": int(predicted_label),
                        "score": float(score),
                        "phishing_probability": float(phish_proba),
                        "backend": str(model_type),
                        "threshold": 0.5,
                        "url_col": url_col,
                    }

        # Mirror inference.py exactly for probability of class 1
        feats = _engineer_features([url_str], feature_cols)
        if model_type == "xgboost_bst":
            if xgb is None:
                raise RuntimeError("xgboost not installed")
            raw_p_class1 = float(model.predict(xgb.DMatrix(feats))[0])
        elif hasattr(model, "predict_proba"):
            proba = model.predict_proba(feats)[0]
            # Guard single-class models that expose only one probability column.
            raw_p_class1 = float(proba[1]) if len(proba) > 1 else float(proba[0])
        else:
            # Hard-label fallback; tolerate string labels ("PHISH", "1"),
            # which previously raised ValueError in int() and surfaced as 500s.
            pred = model.predict(feats)[0]
            if isinstance(pred, str):
                up = pred.strip().upper()
                raw_p_class1 = 1.0 if up.startswith("PHISH") or up == "1" else 0.0
            else:
                raw_p_class1 = 1.0 if int(pred) == 1 else 0.0

        phish_proba = raw_p_class1 if phish_is_positive else (1.0 - raw_p_class1)
        label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
        predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
        score = phish_proba if label == "PHISH" else (1.0 - phish_proba)

        return {
            "label": label,
            "predicted_label": int(predicted_label),
            "score": float(score),
            "phishing_probability": float(phish_proba),
            "backend": str(model_type),
            "threshold": 0.5,
            "url_col": url_col,
        }
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
243
+
244
+