BioRAG / startup.py
aseelflihan's picture
Pre-build models in Docker for fast startup
cfc2b8e
"""Pre-download all models and build index during Docker build."""
import os

# Set before the Hugging Face libraries below are imported.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Hub identifiers of the two models this script instantiates.
EMBEDDING_MODEL_ID = "NeuML/pubmedbert-base-embeddings"
NLI_MODEL_ID = "pritamdeka/PubMedBERT-MNLI-MedNLI"

print("=== PRE-DOWNLOADING MODELS ===")

# Step 1 of 3: instantiate the embedding model once; the instance itself is
# discarded, only the side effect of loading it matters here.
print("[1/3] Downloading embedding model...")
from sentence_transformers import SentenceTransformer

SentenceTransformer(EMBEDDING_MODEL_ID)
print(" Done.")

# Step 2 of 3: load the NLI tokenizer first, then the classifier, from the
# same hub id. Both results are discarded, as in step 1.
print("[2/3] Downloading NLI model...")
from transformers import AutoTokenizer, AutoModelForSequenceClassification

for _loader in (AutoTokenizer, AutoModelForSequenceClassification):
    _loader.from_pretrained(NLI_MODEL_ID)
print(" Done.")

# Step 3 of 3: load the samples described by the project config and hand them
# to the knowledge-base builder.
print("[3/3] Building FAISS index...")
from src.bio_rag.config import BioRAGConfig
from src.bio_rag.data_loader import load_diabetes_pubmedqa
from src.bio_rag.knowledge_base import KnowledgeBaseBuilder

config = BioRAGConfig()
samples = load_diabetes_pubmedqa(
    config.dataset_name,
    max_samples=config.max_samples,
)
sample_count = len(samples)
print(f" Filtered {sample_count} diabetes samples.")

kb = KnowledgeBaseBuilder(config)
# The returned object is not used further in this script; the call is made
# for its effect (presumably persisting the index — see load_or_build).
vs = kb.load_or_build(samples)
print(" FAISS index built and saved.")

print("=== ALL MODELS READY ===")