#!/usr/bin/env python3
"""
Script to download the RoBERTa phishing content detector model from Hugging Face
"""

import logging
import os
import sys
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Defaults used when download_model() is called without arguments.
DEFAULT_MODEL_NAME = "songhieng/roberta-phishing-content-detector-5.0"
DEFAULT_LOCAL_MODEL_PATH = "models/roberta-phishing-detector"


def download_model(
    model_name=DEFAULT_MODEL_NAME,
    local_model_path=DEFAULT_LOCAL_MODEL_PATH,
):
    """Download the RoBERTa phishing detector model and verify it loads.

    The repository snapshot is fetched into ``local_model_path`` and then the
    tokenizer and sequence-classification model are loaded from that directory
    with ``local_files_only=True`` to confirm the download is usable.

    Args:
        model_name: Hugging Face repository id to download.
        local_model_path: Directory the model files are written to. A relative
            path is resolved against the current working directory.

    Returns:
        True when the download and the load check both succeed, False when
        any step (directory creation, download, or loading) raises.
    """
    try:
        # Directory creation lives inside the try so that a filesystem error
        # is reported through the same True/False contract as other failures.
        Path(local_model_path).mkdir(parents=True, exist_ok=True)

        logger.info("Downloading model: %s", model_name)

        # Download the model files using snapshot_download
        # NOTE(review): local_dir_use_symlinks appears to be deprecated in
        # newer huggingface_hub releases -- confirm against the pinned version
        # before removing it.
        snapshot_download(
            repo_id=model_name,
            local_dir=local_model_path,
            local_dir_use_symlinks=False,
        )

        logger.info("Model downloaded successfully using snapshot_download")

        # Verify the model can be loaded
        logger.info("Verifying model loading...")
        tokenizer = AutoTokenizer.from_pretrained(
            local_model_path, local_files_only=True
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            local_model_path, local_files_only=True
        )

        logger.info("Model verification successful!")
        logger.info("Model saved to: %s", os.path.abspath(local_model_path))

        # Print model info
        logger.info("Model config: %s", model.config)
        logger.info("Tokenizer vocab size: %s", len(tokenizer))

        return True

    except Exception as exc:
        # Top-level boundary of the script: log the message together with the
        # traceback so the failing step can be identified, then report failure.
        logger.exception("Failed to download model: %s", exc)
        return False


if __name__ == "__main__":
    # sys.exit is used instead of the site-provided exit(), which is meant for
    # interactive sessions and is not guaranteed to exist in every runtime.
    if not download_model():
        sys.exit(1)
    logger.info("Model download completed successfully!")