| 0
|
| """
|
| Model Setup Script.
|
|
|
| Downloads and caches all required ML models:
|
| - IndicBERT (scam detection)
|
| - spaCy (NER)
|
| - Sentence Transformers (embeddings)
|
|
|
| Run this script after installing dependencies to pre-download models.
|
| """
|
|
|
| import sys
|
| import os
|
| import time
|
| import subprocess
|
| from typing import Tuple, Optional
|
|
|
|
|
def download_indicbert(token: Optional[str] = None) -> Tuple[bool, Optional[float]]:
    """
    Fetch the IndicBERT tokenizer and model so they are cached locally.

    Once both are loaded, one short sentence is run through the model as a
    smoke test.

    Args:
        token: HuggingFace access token for the gated repository. When None,
            the HUGGINGFACE_TOKEN environment variable is read instead.

    Returns:
        (True, seconds_to_load) on success, (False, None) on any failure.
    """
    print("Downloading IndicBERT...")
    started_at = time.time()

    # Only consult the environment when the caller supplied nothing.
    token = os.getenv("HUGGINGFACE_TOKEN") if token is None else token

    try:
        from transformers import AutoModel, AutoTokenizer

        repo_id = "ai4bharat/indic-bert"

        # The `token` keyword is forwarded only when a non-empty token exists.
        auth_kwargs = {"token": token} if token else {}
        if auth_kwargs:
            print(" Using HuggingFace token for authentication...")

        print(" Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(repo_id, **auth_kwargs)

        print(" Loading model (this may take a few minutes on first run)...")
        model = AutoModel.from_pretrained(repo_id, **auth_kwargs)

        # The reported time covers tokenizer + model loading, not the smoke test.
        elapsed = time.time() - started_at
        print(f" IndicBERT ready (loaded in {elapsed:.2f}s)")

        # Smoke test: a failure here is reported by the handlers below.
        sample = tokenizer("Test message", return_tensors="pt", truncation=True, max_length=512)
        model.eval()
        model(**sample)

        return True, elapsed
    except ImportError as exc:
        print(f" [ERROR] transformers not installed: {exc}")
        print(" Run: pip install transformers")
        return False, None
    except Exception as exc:
        lowered = str(exc).lower()
        looks_gated = "gated repo" in lowered or "access" in lowered

        if not looks_gated:
            print(f" [ERROR] IndicBERT download failed: {exc}")
            return False, None

        # Authentication-style failure: walk the user through getting access.
        print(" [ERROR] IndicBERT requires HuggingFace authentication")
        for hint in (
            " This model is gated. To access it:",
            " 1. Request access at: https://huggingface.co/ai4bharat/indic-bert",
            " 2. Get your token from: https://huggingface.co/settings/tokens",
            " 3. Set environment variable: HUGGINGFACE_TOKEN=your_token_here",
            " 4. Or login with: huggingface-cli login",
        ):
            print(hint)
        token_status = "Provided" if token else "Not provided"
        print(f" [INFO] Current token status: {token_status}")
        return False, None
|
|
|
|
|
def download_spacy() -> Tuple[bool, Optional[float]]:
    """
    Ensure the spaCy ``en_core_web_sm`` model is installed and loadable.

    If the model already loads, its load time is reported and nothing is
    downloaded. Otherwise it is fetched with ``python -m spacy download``
    using the current interpreter, loaded, and run on a short test sentence.

    Returns:
        (True, seconds_to_load) on success, (False, None) on any failure.
    """
    print("Downloading spaCy model...")

    try:
        import spacy

        try:
            start_time = time.time()
            spacy.load("en_core_web_sm")
            load_time = time.time() - start_time
            print(f" spaCy model already installed (loads in {load_time:.2f}s)")
            return True, load_time
        except OSError:
            # spacy.load raises OSError when the model package is missing.
            print(" Model not found, downloading...")

        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
            capture_output=True,
            text=True,  # decode captured output so it can be shown on failure
        )
        print(" Model downloaded, testing load...")

        # The package was installed by a child process after this interpreter
        # started; refresh the import finders so it can be discovered.
        import importlib

        importlib.invalidate_caches()

        start_time = time.time()
        nlp = spacy.load("en_core_web_sm")
        load_time = time.time() - start_time

        # Explicit check instead of `assert`, which is stripped under -O.
        doc = nlp("Test message")
        if len(doc) == 0:
            raise RuntimeError("spaCy pipeline produced no tokens for the test message")

        print(f" spaCy ready (loads in {load_time:.2f}s)")
        return True, load_time

    except subprocess.CalledProcessError as e:
        print(f" [ERROR] spaCy download failed: {e}")
        # Surface the downloader's own error output; the exception text alone
        # only carries the command and exit status.
        downloader_stderr = (e.stderr or "").strip()
        if downloader_stderr:
            print(f" [ERROR] Downloader output: {downloader_stderr}")
        print(" Run manually: python -m spacy download en_core_web_sm")
        return False, None
    except ImportError as e:
        print(f" [ERROR] spacy not installed: {e}")
        print(" Run: pip install spacy")
        return False, None
    except Exception as e:
        print(f" [ERROR] spaCy setup failed: {e}")
        return False, None
|
|
|
|
|
def download_sentence_transformers() -> Tuple[bool, Optional[float]]:
    """
    Fetch the ``all-MiniLM-L6-v2`` sentence-transformers model into the cache.

    After loading, a short test sentence is encoded to confirm the model
    produces a non-empty embedding.

    Returns:
        (True, seconds_to_load) on success, (False, None) on any failure.
    """
    print("Downloading sentence-transformers...")
    began = time.time()

    try:
        from sentence_transformers import SentenceTransformer

        print(" Loading model (this may take a few minutes on first run)...")
        encoder = SentenceTransformer('all-MiniLM-L6-v2')

        # Timing stops before the smoke test, so it reflects loading only.
        elapsed = time.time() - began
        print(f" Embeddings model ready (loaded in {elapsed:.2f}s)")

        # Smoke test: an empty embedding is reported by the generic handler.
        vector = encoder.encode("Test message")
        assert len(vector) > 0

        return True, elapsed
    except ImportError as exc:
        print(f" [ERROR] sentence-transformers not installed: {exc}")
        print(" Run: pip install sentence-transformers")
        return False, None
    except Exception as exc:
        print(f" [ERROR] Sentence transformers download failed: {exc}")
        return False, None
|
|
|
|
|
def verify_models() -> Tuple[bool, dict]:
    """
    Reload every model and measure how long each one takes to load.

    IndicBERT is checked against a 10 second target and spaCy against a
    5 second target; sentence-transformers is timed without a target.

    Returns:
        (all_verified, load_times) where load_times maps 'indicbert',
        'spacy' and 'sentence_transformers' to seconds for each model that
        loaded. all_verified is False if any model failed to load or
        exceeded its target.
    """
    print("\nVerifying models...")
    timings = {}
    everything_ok = True

    # --- IndicBERT -------------------------------------------------------
    print(" Testing IndicBERT load time...")
    try:
        from transformers import AutoModel, AutoTokenizer

        hf_token = os.getenv("HUGGINGFACE_TOKEN")
        auth_kwargs = {}
        if hf_token:
            auth_kwargs["token"] = hf_token

        t0 = time.time()
        tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", **auth_kwargs)
        model = AutoModel.from_pretrained("ai4bharat/indic-bert", **auth_kwargs)
        elapsed = time.time() - t0
        timings['indicbert'] = elapsed

        if elapsed >= 10.0:
            print(f" [WARN] IndicBERT loads in {elapsed:.2f}s (exceeds 10s target)")
            everything_ok = False
        else:
            print(f" [OK] IndicBERT loads in {elapsed:.2f}s (< 10s target)")
    except Exception as exc:
        lowered = str(exc).lower()
        if "gated repo" in lowered or "access" in lowered:
            print(" [SKIP] IndicBERT requires HuggingFace authentication")
            print(" [INFO] Set HUGGINGFACE_TOKEN environment variable")
            print(" [INFO] Request access at: https://huggingface.co/ai4bharat/indic-bert")
        else:
            print(f" [FAIL] IndicBERT verification failed: {exc}")
        # NOTE(review): a skipped (gated) model is counted as unverified here,
        # same as a hard failure — confirm this is the intended behaviour.
        everything_ok = False

    # --- spaCy -----------------------------------------------------------
    print(" Testing spaCy load time...")
    try:
        import spacy

        t0 = time.time()
        pipeline = spacy.load("en_core_web_sm")
        elapsed = time.time() - t0
        timings['spacy'] = elapsed

        if elapsed >= 5.0:
            print(f" [WARN] spaCy loads in {elapsed:.2f}s (exceeds 5s target)")
            everything_ok = False
        else:
            print(f" [OK] spaCy loads in {elapsed:.2f}s (< 5s target)")
    except Exception as exc:
        print(f" [FAIL] spaCy verification failed: {exc}")
        everything_ok = False

    # --- sentence-transformers (no time target) ---------------------------
    print(" Testing sentence-transformers load time...")
    try:
        from sentence_transformers import SentenceTransformer

        t0 = time.time()
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        elapsed = time.time() - t0
        timings['sentence_transformers'] = elapsed
        print(f" [OK] Sentence transformers loads in {elapsed:.2f}s")
    except Exception as exc:
        print(f" [FAIL] Sentence transformers verification failed: {exc}")
        everything_ok = False

    return everything_ok, timings
|
|
|
|
|
def main():
    """
    Main entry point for model setup.

    Downloads IndicBERT, spaCy and sentence-transformers in turn, runs the
    verification pass, then prints a per-model summary.

    Returns:
        0 when every download succeeded and verification passed, 1 otherwise.
    """
    print("=" * 60)
    print("ScamShield AI - Model Setup")
    print("=" * 60)
    print()

    token = os.getenv("HUGGINGFACE_TOKEN")
    if token:
        # Never echo any part of the token: setup output often ends up in
        # CI logs or shared terminals, and the token is a credential.
        print("[INFO] HuggingFace token found (value hidden)")
    else:
        print("[INFO] HUGGINGFACE_TOKEN not set. IndicBERT may require authentication.")
    print()

    results = {}

    results['indicbert'] = download_indicbert(token=token)
    print()

    results['spacy'] = download_spacy()
    print()

    results['sentence_transformers'] = download_sentence_transformers()
    print()

    # Only the overall verdict is used here; per-model timings are already
    # printed by verify_models itself.
    all_verified, _ = verify_models()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)

    all_success = all(success for success, _ in results.values())

    for model_name, (success, load_time) in results.items():
        status = "[OK]" if success else "[FAIL]"
        # Compare against None so a legitimate 0.0 is not shown as "N/A".
        time_str = f"{load_time:.2f}s" if load_time is not None else "N/A"
        print(f" {status} {model_name}: {time_str}")

    if all_success and all_verified:
        print("\n[SUCCESS] All models downloaded and cached successfully!")
        print("=" * 60)
        return 0

    print("\n[ERROR] Some models failed to download or verify")
    print("=" * 60)
    return 1
|
|
|
|
|
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
|
|
|