# Algopharma_backend / setup_models.py
# organic266-star
# Add AlgoPharma FastAPI backend and Docker deployment
# b34e73d
"""
AlgoPharma — Model Setup Script
================================
Run this ONCE after `uv sync` to download all required models to disk:
    uv run python setup_models.py
What it does:
1. Downloads the spaCy English pipeline (en_core_web_sm - 12 MB)
2. Pre-caches all HuggingFace models used at runtime:
- Drug NER : OpenMed-NER-PharmaDetect-ModernClinical-149M
- Disease NER: OpenMed-NER-DiseaseDetect-SuperClinical-184M
- Sentiment : cardiffnlp/twitter-roberta-base-sentiment-latest
- PII (en) : OpenMed-PII-SuperClinical-Small-44M-v1
- PII (hi) : OpenMed-PII-Hindi-SuperClinical-Small-44M-v1
- PII (te) : OpenMed-PII-Telugu-FastClinical-Small-82M-v1
All models are saved to the default HuggingFace cache
(~/.cache/huggingface/hub) so Celery workers pick them up instantly.
"""
import subprocess
import sys
import logging
# Ensure UTF-8 output on Windows: the default cp1252 console encoding cannot
# represent the box-drawing divider (─) or the status emoji (✅) logged below.
# The hasattr guard covers replaced stdout streams that expose `.encoding`
# but are not io.TextIOWrapper instances and therefore lack `.reconfigure`.
if (
    sys.stdout.encoding
    and sys.stdout.encoding.lower() != "utf-8"
    and hasattr(sys.stdout, "reconfigure")
):
    sys.stdout.reconfigure(encoding="utf-8")

# Send log records to stdout with a compact HH:MM:SS timestamp.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("setup_models")

# 60-column horizontal rule made of U+2500 (BOX DRAWINGS LIGHT HORIZONTAL).
DIVIDER = "─" * 60
# ── 1. spaCy model ────────────────────────────────────────────
# en_core_web_sm → 12 MB (tok2vec + tagger + parser + NER)
#   This is already the smallest production-grade English pipeline.
# en_core_web_md → 43 MB (adds word vectors)
# en_core_web_lg → 741 MB (larger vectors) ← do NOT use
SPACY_MODEL = "en_core_web_sm"
# ── 2. HuggingFace models ─────────────────────────────────────
# Each entry is a (repo_id, description, pipeline_task) tuple:
#   repo_id       - Hub repository id, passed as `model=` when building the pipeline
#   description   - human-readable label, only written to the log during setup
#   pipeline_task - task name passed as the first argument to the pipeline factory
HF_MODELS = [
    (
        "OpenMed/OpenMed-NER-PharmaDetect-ModernClinical-149M",
        "Drug NER – 149M params, CPU-friendly",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-NER-DiseaseDetect-SuperClinical-184M",
        "Disease NER – 184M params, CPU-friendly",
        "token-classification",
    ),
    (
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "Sentiment – 125M RoBERTa, medical-social-media fine-tuned",
        "sentiment-analysis",
    ),
    (
        "OpenMed/OpenMed-PII-SuperClinical-Small-44M-v1",
        "PII (English) – 44M DeBERTa, clinical de-identification",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-PII-Hindi-SuperClinical-Small-44M-v1",
        "PII (Hindi) – 44M, Hindi clinical de-identification",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-PII-Telugu-FastClinical-Small-82M-v1",
        "PII (Telugu) – 82M, Telugu clinical de-identification",
        "token-classification",
    ),
]
def _step(n: int, total: int, msg: str) -> None:
    """Log one progress line of the form ``[n/total] msg`` at INFO level.

    Args:
        n: 1-based index of the step being started.
        total: Total number of steps in the run.
        msg: Short label for the step.
    """
    progress_line = f"[{n}/{total}] {msg}"
    logger.info(progress_line)
def download_spacy(model: str) -> bool:
    """Download a spaCy model via subprocess (handles the pip-install step).

    Runs ``<current interpreter> -m spacy download <model>`` as a child
    process. The child's output is not captured, so whatever it prints goes
    straight to this process's console.

    Args:
        model: Name of the spaCy pipeline package, e.g. ``"en_core_web_sm"``.

    Returns:
        True if the download command exited with status 0, False otherwise.
    """
    logger.info(DIVIDER)
    logger.info(f"Downloading spaCy model: {model}")
    result = subprocess.run(
        [sys.executable, "-m", "spacy", "download", model],
        capture_output=False,
        check=False,  # a non-zero exit is reported through the return value
    )
    if result.returncode != 0:
        logger.error(f"❌ spaCy download failed for {model}")
        return False
    logger.info(f"✅ spaCy {model} ready")
    return True
def download_hf_model(repo_id: str, description: str, task: str) -> bool:
    """Warm the HuggingFace cache for a model by instantiating its pipeline.

    After the pipeline is built, one short sentence is run through it as a
    smoke test that the model is functional on CPU.

    Args:
        repo_id: HuggingFace Hub repository id, passed as ``model=``.
        description: Human-readable summary; only written to the log.
        task: Pipeline task name, e.g. ``"token-classification"``.

    Returns:
        True if the pipeline was built and the smoke test ran without raising;
        False otherwise. Failures are logged and never propagated.
    """
    logger.info(DIVIDER)
    logger.info(f"Downloading: {repo_id}")
    logger.info(f" → {description}")

    # Imported inside the function so a missing transformers install is
    # reported as a failed step (with an accurate message) instead of being
    # mislabelled as a download failure.
    try:
        from transformers import pipeline as hf_pipeline
    except ImportError as exc:
        logger.error(f"❌ Cannot download {repo_id}: transformers is not importable: {exc}")
        return False

    try:
        # force_download is not requested, so re-runs are instant cache hits.
        pipe = hf_pipeline(
            task,
            model=repo_id,
            device=-1,  # CPU only — no CUDA dependency
        )
        # Quick smoke test to confirm the model is functional.
        sample_text = (
            "AlgoPharma setup test"
            if task == "token-classification"
            else "This is a test"
        )
        pipe(sample_text)
        # Drop the reference so the model can be reclaimed before the next load.
        del pipe
    except Exception as e:  # boundary: one bad model must not abort the whole run
        logger.error(f"❌ Failed to download {repo_id}: {e}")
        return False

    logger.info(f"✅ {repo_id} cached & verified")
    return True
def main() -> None:
    """Download the spaCy pipeline and every HuggingFace model, then summarize.

    Every download is attempted even when an earlier one reported failure.
    The names of failed models are collected and listed in a closing summary.
    Failures are only logged: this function does not call ``sys.exit``.
    """
    total_steps = 1 + len(HF_MODELS)
    failures: list[str] = []

    print()
    print("=" * 60)
    print(" AlgoPharma — Model Setup")
    print(" Run once after: uv sync")
    print("=" * 60)
    print()

    # Step 1: spaCy
    _step(1, total_steps, f"spaCy — {SPACY_MODEL}")
    if not download_spacy(SPACY_MODEL):
        failures.append(SPACY_MODEL)

    # Steps 2+: HuggingFace models (numbering continues after the spaCy step)
    for i, (repo_id, desc, task) in enumerate(HF_MODELS, start=2):
        _step(i, total_steps, repo_id)
        if not download_hf_model(repo_id, desc, task):
            failures.append(repo_id)

    print()
    print(DIVIDER)
    if failures:
        logger.warning(f"⚠️ {len(failures)} model(s) failed to download:")
        for failed_name in failures:
            logger.warning(f" • {failed_name}")
        logger.warning("Re-run this script or download manually.")
    else:
        logger.info(f"✅ All {total_steps} models downloaded and cached.")
        logger.info("You can now start the MCP server and Celery worker.")
    print(DIVIDER)
# Run the setup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()