Spaces:
Build error
Build error
File size: 5,620 Bytes
b34e73d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | """
AlgoPharma — Model Setup Script
================================
Run this ONCE after `uv sync` to download all required models to disk.
uv run python setup_models.py
What it does:
1. Downloads the spaCy English pipeline (en_core_web_sm - 12 MB)
2. Pre-caches all HuggingFace models used at runtime:
- Drug NER : OpenMed-NER-PharmaDetect-ModernClinical-149M
- Disease NER: OpenMed-NER-DiseaseDetect-SuperClinical-184M
- Sentiment : cardiffnlp/twitter-roberta-base-sentiment-latest
- PII (en) : OpenMed-PII-SuperClinical-Small-44M-v1
- PII (hi) : OpenMed-PII-Hindi-SuperClinical-Small-44M-v1
- PII (te) : OpenMed-PII-Telugu-FastClinical-Small-82M-v1
All models are saved to the default HuggingFace cache
(~/.cache/huggingface/hub) so Celery workers pick them up instantly.
"""
import subprocess
import sys
import logging
# Ensure UTF-8 output on Windows: the default cp1252 console codec cannot
# encode the non-ASCII symbols this script prints.
# The hasattr() guard skips streams that cannot be reconfigured (stdout set
# to None, or replaced by an object that is not an io.TextIOWrapper), so the
# script no longer dies with AttributeError before doing any work.
if (
    hasattr(sys.stdout, "reconfigure")
    and sys.stdout.encoding
    and sys.stdout.encoding.lower() != "utf-8"
):
    sys.stdout.reconfigure(encoding="utf-8")
# Root logging setup: INFO and above, written to stdout as
# "HH:MM:SS | LEVEL | message".
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Script-wide logger, registered under the fixed name "setup_models".
logger = logging.getLogger("setup_models")
# Horizontal rule (60 box-drawing characters) emitted between setup steps.
DIVIDER = "─" * 60

# ── 1. spaCy model ────────────────────────────────────────────
# en_core_web_sm — 12 MB (tok2vec + tagger + parser + NER)
#   This is already the smallest production-grade English pipeline.
# en_core_web_md — 43 MB (adds word vectors)
# en_core_web_lg — 741 MB (larger vectors) — do NOT use
SPACY_MODEL = "en_core_web_sm"
# ── 2. HuggingFace models ─────────────────────────────────────
# Every HuggingFace model pre-cached by this script, in download order.
# Each entry is (repo_id, description, pipeline_task):
#   repo_id       - HuggingFace Hub repository passed as `model=` to pipeline()
#   description   - human-readable summary, only used in log output
#   pipeline_task - transformers pipeline task name used to load the model
HF_MODELS: list[tuple[str, str, str]] = [
    (
        "OpenMed/OpenMed-NER-PharmaDetect-ModernClinical-149M",
        "Drug NER — 149M params, CPU-friendly",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-NER-DiseaseDetect-SuperClinical-184M",
        "Disease NER — 184M params, CPU-friendly",
        "token-classification",
    ),
    (
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "Sentiment — 125M RoBERTa, medical-social-media fine-tuned",
        "sentiment-analysis",
    ),
    (
        "OpenMed/OpenMed-PII-SuperClinical-Small-44M-v1",
        "PII (English) — 44M DeBERTa, clinical de-identification",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-PII-Hindi-SuperClinical-Small-44M-v1",
        "PII (Hindi) — 44M, Hindi clinical de-identification",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-PII-Telugu-FastClinical-Small-82M-v1",
        "PII (Telugu) — 82M, Telugu clinical de-identification",
        "token-classification",
    ),
]
def _step(n: int, total: int, msg: str) -> None:
    """Log one progress line of the form ``[n/total] msg`` at INFO level."""
    progress_line = f"[{n}/{total}] {msg}"
    logger.info(progress_line)
def download_spacy(model: str) -> bool:
    """Download a spaCy pipeline package via ``python -m spacy download``.

    The command runs in a child process using the current interpreter
    (``sys.executable``), which lets the spaCy CLI handle the pip-install
    step. The child's output is not captured, so it streams straight to the
    terminal.

    Args:
        model: Name of the spaCy pipeline package, e.g. ``"en_core_web_sm"``.

    Returns:
        True if the child process exited with status 0, False otherwise.
    """
    logger.info(DIVIDER)
    logger.info("Downloading spaCy model: %s", model)
    result = subprocess.run(
        [sys.executable, "-m", "spacy", "download", model],
        capture_output=False,
        check=False,  # a non-zero exit is reported through the return value
    )
    if result.returncode != 0:
        logger.error("❌ spaCy download failed for %s", model)
        return False
    logger.info("✅ spaCy %s ready", model)
    return True
def download_hf_model(repo_id: str, description: str, task: str) -> bool:
    """Warm the HuggingFace cache for a model by instantiating its pipeline.

    Builds a CPU-only ``transformers`` pipeline for ``repo_id`` and runs one
    short sample sentence through it as a smoke test.

    Args:
        repo_id: HuggingFace Hub repository id of the model.
        description: Human-readable summary; only written to the log.
        task: ``transformers`` pipeline task name used to load the model.

    Returns:
        True if the pipeline was created and the smoke test ran, False if
        any exception was raised along the way (the exception is logged,
        not propagated).
    """
    logger.info(DIVIDER)
    logger.info("Downloading: %s", repo_id)
    logger.info(" → %s", description)
    try:
        # Imported inside the try block so a missing or broken transformers
        # install is logged as a failure for this model rather than raised.
        from transformers import pipeline as hf_pipeline

        # No force_download override, so re-runs are instant cache hits.
        pipe = hf_pipeline(
            task,
            model=repo_id,
            device=-1,  # CPU only — no CUDA dependency
        )
        # Quick smoke test to confirm the model is functional.
        if task == "token-classification":
            pipe("AlgoPharma setup test")
        else:
            pipe("This is a test")
        del pipe  # drop the reference once the smoke test has passed
    except Exception as e:  # boundary: any failure marks this model as failed
        logger.error("❌ Failed to download %s: %s", repo_id, e)
        return False
    else:
        logger.info("✅ %s cached & verified", repo_id)
        return True
def main() -> None:
    """Download the spaCy model and every entry of ``HF_MODELS``, then report.

    Step 1 downloads ``SPACY_MODEL``; steps 2 onward download the
    HuggingFace models in list order. A failed step does not stop the run:
    its name is collected and listed in the final summary. Failures are only
    logged — the function returns None either way.
    """
    total_steps = 1 + len(HF_MODELS)
    failures: list[str] = []

    print()
    print("=" * 60)
    print(" AlgoPharma — Model Setup")
    print(" Run once after: uv sync")
    print("=" * 60)
    print()

    # Step 1: spaCy
    _step(1, total_steps, f"spaCy — {SPACY_MODEL}")
    if not download_spacy(SPACY_MODEL):
        failures.append(SPACY_MODEL)

    # Steps 2+: HuggingFace models (numbering continues after the spaCy step)
    for step, (repo_id, desc, task) in enumerate(HF_MODELS, start=2):
        _step(step, total_steps, repo_id)
        if not download_hf_model(repo_id, desc, task):
            failures.append(repo_id)

    print()
    print(DIVIDER)
    if failures:
        logger.warning("⚠️ %d model(s) failed to download:", len(failures))
        for name in failures:
            logger.warning(" • %s", name)
        logger.warning("Re-run this script or download manually.")
    else:
        logger.info("✅ All %d models downloaded and cached.", total_steps)
        logger.info("You can now start the MCP server and Celery worker.")
    print(DIVIDER)


if __name__ == "__main__":
    main()
|