# PhishSentinel/src/utils/hf_model_loader.py
# Committed by: github-actions[bot]
# Commit message: Deploy to HF Spaces (ci)
# Commit: 0fd143d
"""HuggingFace Hub model downloader for PhishSentinel.

When running inside a HuggingFace Space (the SPACE_ID env var is present),
models are not stored in the git repo. This module downloads them from the
dedicated HF Hub model repository at startup so the Streamlit app can load them.

Usage (called from app.py before any joblib.load calls):

    from src.utils.hf_model_loader import ensure_models
    ensure_models(MODELS_DIR)
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
log = logging.getLogger(__name__)

# ── Configuration ─────────────────────────────────────────────────────────────

# Hub repository that hosts the trained artefacts. Either the repository is
# public, or the Space's HF_TOKEN secret has to carry read permission for it.
HF_MODEL_REPO = "SagarTony90265/PhishSentinel-models"

# Every artefact the app needs in order to function. Each one is stored as
# "<stem>.pkl", so the list is derived from the stems below (kept sorted).
REQUIRED_MODEL_FILES: list[str] = [
    f"{stem}.pkl"
    for stem in (
        "anomaly_detector",
        "catboost",
        "feature_pipeline",
        "lightgbm",
        "lr",
        "lr_scaler",
        "rf",
        "xgboost",
    )
]
def ensure_models(models_dir: str | os.PathLike[str]) -> bool:
    """Make sure every required model artefact is present in *models_dir*.

    Files listed in ``REQUIRED_MODEL_FILES`` that are missing are downloaded
    from the ``HF_MODEL_REPO`` Hub repository, but only when the process runs
    inside a HuggingFace Space (detected via the ``SPACE_ID`` env var).
    Outside a Space nothing is downloaded; a warning is logged instead.

    Parameters
    ----------
    models_dir:
        Directory where the .pkl files are expected
        (``models/models/`` relative to the repo root). A ``Path`` or a
        plain string path is accepted.

    Returns
    -------
    bool
        ``True`` when all required files are present (either pre-existing
        or freshly downloaded). ``False`` when files are missing outside a
        Space, when ``huggingface_hub`` is not installed, when the target
        directory cannot be created, or when any download fails, so the
        caller can surface a meaningful error message rather than an obscure
        FileNotFoundError.
    """
    # Coerce once so that callers passing a plain string do not crash on "/".
    target_dir = Path(models_dir)

    # is_file() rather than exists(): a directory that happens to carry an
    # artefact's name must not be mistaken for the artefact itself.
    missing = [
        name for name in REQUIRED_MODEL_FILES if not (target_dir / name).is_file()
    ]
    if not missing:
        return True  # Nothing to do — all artefacts present locally.

    if not os.getenv("SPACE_ID"):
        # Local dev environment: models should have been trained locally.
        # Don't try to auto-download — let the app show its normal
        # "model not found" guidance instead.
        log.warning(
            "Model files missing locally: %s. "
            "Run train.py to generate them, or download from HF Hub manually.",
            missing,
        )
        return False

    # ── We're on HF Spaces and models are missing — download from Hub ────────
    log.info(
        "Running on HF Spaces. Downloading %d model file(s) from %s …",
        len(missing),
        HF_MODEL_REPO,
    )

    # Imported lazily so the module stays importable without huggingface_hub.
    try:
        from huggingface_hub import hf_hub_download  # type: ignore
    except ImportError:
        log.error(
            "huggingface_hub is not installed — cannot download models. "
            "Add `huggingface_hub` to requirements.txt."
        )
        return False

    # Honour the documented contract: report failure instead of raising when
    # the target directory cannot be created (e.g. read-only filesystem).
    try:
        target_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        log.error("Could not create models directory %s: %s", target_dir, exc)
        return False

    # None → anonymous access (sufficient for public repos).
    token: str | None = os.getenv("HF_TOKEN") or os.getenv("HFTOKEN") or None

    failed: list[str] = []
    for filename in missing:
        try:
            dest = hf_hub_download(
                repo_id=HF_MODEL_REPO,
                filename=filename,
                local_dir=str(target_dir),
                token=token,
            )
        except Exception as exc:  # boundary: any Hub/network error is a failed download
            log.error("Failed to download %s: %s", filename, exc)
            failed.append(filename)
            continue

        # Only count the download as successful if the artefact is really
        # where the app will look for it.
        if not (target_dir / filename).is_file():
            log.error(
                "Failed to download %s: %s",
                filename,
                "file not found in models directory after download",
            )
            failed.append(filename)
            continue

        log.info("Downloaded %s → %s", filename, dest)

    if failed:
        log.error("Could not download: %s", failed)
        return False

    log.info("All model artefacts ready in %s", target_dir)
    return True