Spaces:
Sleeping
Sleeping
"""HuggingFace Hub model downloader for PhishSentinel.

When running inside a HuggingFace Space (SPACE_ID env var is present),
models are not stored in the git repo. This module downloads them from the
dedicated HF Hub model repository at startup so the Streamlit app can load them.

Usage (called from app.py before any joblib.load calls):

    from src.utils.hf_model_loader import ensure_models
    ensure_models(MODELS_DIR)
"""
| from __future__ import annotations | |
| import logging | |
| import os | |
| from pathlib import Path | |
# Module-level logger; handlers and levels are configured by the host app.
log = logging.getLogger(__name__)

# ── Configuration ─────────────────────────────────────────────────────────────
# HF Hub model repo that holds the trained artefacts.
# Must be public, OR the HF_TOKEN Space secret must grant read access.
HF_MODEL_REPO = "SagarTony90265/PhishSentinel-models"
# All artefact files that must be present for the app to function.
# Names are relative to the models directory and to the root of the Hub repo.
REQUIRED_MODEL_FILES: list[str] = [
    "anomaly_detector.pkl",
    "catboost.pkl",
    "feature_pipeline.pkl",
    "lightgbm.pkl",
    "lr.pkl",
    "lr_scaler.pkl",
    "rf.pkl",
    "xgboost.pkl",
]
def _missing_artefacts(models_dir: Path) -> list[str]:
    """Return the required artefact names that are not regular files in *models_dir*."""
    # is_file() rather than exists(): a directory that happens to carry an
    # artefact's name cannot be loaded and must not count as "present".
    return [
        name for name in REQUIRED_MODEL_FILES if not (models_dir / name).is_file()
    ]


def ensure_models(models_dir: Path) -> bool:
    """Download missing model artefacts from HF Hub.

    Parameters
    ----------
    models_dir:
        Absolute path to the directory where .pkl files are expected
        (``models/models/`` relative to the repo root).  A ``str`` path is
        tolerated and converted to :class:`~pathlib.Path`.

    Returns
    -------
    bool
        ``True`` when all required files are present (either pre-existing
        or freshly downloaded).  ``False`` when files are missing outside
        HF Spaces, when ``huggingface_hub`` cannot be imported, or on any
        download failure, so the caller can surface a meaningful error
        message rather than an obscure FileNotFoundError.
    """
    models_dir = Path(models_dir)
    on_hf_spaces = bool(os.getenv("SPACE_ID"))

    missing = _missing_artefacts(models_dir)
    if not missing:
        return True  # Nothing to do — all artefacts present locally.

    if not on_hf_spaces:
        # Local dev environment: models should have been trained locally.
        # Don't try to auto-download — let the app show its normal
        # "model not found" guidance instead.
        log.warning(
            "Model files missing locally: %s. "
            "Run train.py to generate them, or download from HF Hub manually.",
            missing,
        )
        return False

    # ── We're on HF Spaces and models are missing — download from Hub ────────
    log.info(
        "Running on HF Spaces. Downloading %d model file(s) from %s …",
        len(missing),
        HF_MODEL_REPO,
    )

    # Imported lazily so the module can still be imported when the optional
    # dependency is absent.
    try:
        from huggingface_hub import hf_hub_download  # type: ignore
    except ImportError:
        log.error(
            "huggingface_hub is not installed — cannot download models. "
            "Add `huggingface_hub` to requirements.txt."
        )
        return False

    models_dir.mkdir(parents=True, exist_ok=True)

    # Either spelling of the secret is accepted; an empty value is normalised
    # to None, which means anonymous access (public repos).
    token: str | None = os.getenv("HF_TOKEN") or os.getenv("HFTOKEN") or None

    failed: list[str] = []
    for filename in missing:
        try:
            dest = hf_hub_download(
                repo_id=HF_MODEL_REPO,
                filename=filename,
                local_dir=str(models_dir),
                token=token,
            )
        except Exception as exc:
            # Deliberately broad: any failure for one file is logged and
            # recorded, and the remaining files are still attempted.
            log.error("Failed to download %s: %s", filename, exc)
            failed.append(filename)
            continue

        # Confirm the artefact really landed where the app will look for it,
        # so a True return value always matches what is on disk.
        if not (models_dir / filename).is_file():
            log.error(
                "Failed to download %s: reported at %s but not found in %s",
                filename,
                dest,
                models_dir,
            )
            failed.append(filename)
            continue

        log.info("Downloaded %s → %s", filename, dest)

    if failed:
        log.error("Could not download: %s", failed)
        return False

    log.info("All model artefacts ready in %s", models_dir)
    return True