# Page-capture residue (not part of the script): "Spaces: Build error / Build error".
"""
AlgoPharma — Model Setup Script
================================
Run this ONCE after `uv sync` to download all required models to disk:

    uv run python setup_models.py

What it does:
  1. Downloads the spaCy English pipeline (en_core_web_sm - 12 MB)
  2. Pre-caches all HuggingFace models used at runtime:
       - Drug NER   : OpenMed-NER-PharmaDetect-ModernClinical-149M
       - Disease NER: OpenMed-NER-DiseaseDetect-SuperClinical-184M
       - Sentiment  : cardiffnlp/twitter-roberta-base-sentiment-latest
       - PII (en)   : OpenMed-PII-SuperClinical-Small-44M-v1
       - PII (hi)   : OpenMed-PII-Hindi-SuperClinical-Small-44M-v1
       - PII (te)   : OpenMed-PII-Telugu-FastClinical-Small-82M-v1

The HuggingFace models are saved to the default HuggingFace cache
(~/.cache/huggingface/hub) so Celery workers pick them up instantly.
"""
import logging
import subprocess
import sys
# Windows consoles often default to cp1252, which cannot encode the
# box-drawing and symbol glyphs this script logs, so switch stdout to UTF-8.
# The hasattr() guard covers replaced stdout objects that expose an
# `encoding` attribute but no `reconfigure()` method.
if (
    sys.stdout.encoding
    and sys.stdout.encoding.lower() != "utf-8"
    and hasattr(sys.stdout, "reconfigure")
):
    sys.stdout.reconfigure(encoding="utf-8")

# Log to stdout (StreamHandler defaults to stderr) with a compact
# "HH:MM:SS | LEVEL | message" layout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("setup_models")

# 60-character horizontal rule used to separate sections of the output.
DIVIDER = "─" * 60
# ── 1. spaCy model ────────────────────────────────────────────
# en_core_web_sm — 12 MB (tok2vec + tagger + parser + NER)
# This is already the smallest production-grade English pipeline.
# en_core_web_md — 43 MB (adds word vectors)
# en_core_web_lg — 741 MB (larger vectors) — do NOT use
SPACY_MODEL = "en_core_web_sm"
# ── 2. HuggingFace models ─────────────────────────────────────
# Each entry is (repo_id, description, pipeline_task):
#   repo_id       — Hub repository passed to the pipeline as `model=`
#   description   — human-readable note, only written to the log
#   pipeline_task — task name passed to `transformers.pipeline`
HF_MODELS: list[tuple[str, str, str]] = [
    (
        "OpenMed/OpenMed-NER-PharmaDetect-ModernClinical-149M",
        "Drug NER — 149M params, CPU-friendly",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-NER-DiseaseDetect-SuperClinical-184M",
        "Disease NER — 184M params, CPU-friendly",
        "token-classification",
    ),
    (
        # NOTE(review): the repo name points at a general Twitter sentiment
        # model; confirm the "medical-social-media" wording against the
        # model card before relying on it.
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "Sentiment — 125M RoBERTa, medical-social-media fine-tuned",
        "sentiment-analysis",
    ),
    (
        "OpenMed/OpenMed-PII-SuperClinical-Small-44M-v1",
        "PII (English) — 44M DeBERTa, clinical de-identification",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-PII-Hindi-SuperClinical-Small-44M-v1",
        "PII (Hindi) — 44M, Hindi clinical de-identification",
        "token-classification",
    ),
    (
        "OpenMed/OpenMed-PII-Telugu-FastClinical-Small-82M-v1",
        "PII (Telugu) — 82M, Telugu clinical de-identification",
        "token-classification",
    ),
]
def _step(n: int, total: int, msg: str) -> None:
    """Log a progress line of the form ``[n/total] msg`` at INFO level.

    Args:
        n: 1-based index of the step being started.
        total: Total number of steps in the run.
        msg: Short label for the step.
    """
    logger.info("[%s/%s] %s", n, total, msg)
def download_spacy(model: str) -> bool:
    """Download a spaCy pipeline by running ``python -m spacy download``.

    The command runs in a child process using the current interpreter
    (``sys.executable``). Its output is not captured, so download progress
    is written straight to the console.

    Args:
        model: Name of the spaCy pipeline package, e.g. ``en_core_web_sm``.

    Returns:
        True if the child process exited with status 0; False if it exited
        with a non-zero status or could not be started at all.
    """
    logger.info(DIVIDER)
    logger.info("Downloading spaCy model: %s", model)
    try:
        result = subprocess.run(
            [sys.executable, "-m", "spacy", "download", model],
            check=False,  # the return code is inspected below instead
        )
    except OSError as exc:
        # The interpreter could not be launched; report it as a failed
        # download so the caller's failure list stays accurate.
        logger.error("❌ spaCy download failed for %s: %s", model, exc)
        return False

    if result.returncode != 0:
        logger.error("❌ spaCy download failed for %s", model)
        return False

    logger.info("✅ spaCy %s ready", model)
    return True
def download_hf_model(repo_id: str, description: str, task: str) -> bool:
    """Warm the HuggingFace cache for a model by instantiating its pipeline.

    Builds a ``transformers`` pipeline for ``repo_id`` on CPU and runs one
    short sample through it as a smoke test.

    Args:
        repo_id: Hub repository id passed to the pipeline as ``model=``.
        description: Human-readable note; only written to the log.
        task: Pipeline task name, e.g. ``"token-classification"``.

    Returns:
        True if the pipeline was built and the smoke test ran without
        raising; False otherwise (the error is logged, not re-raised).
    """
    logger.info(DIVIDER)
    logger.info("Downloading: %s", repo_id)
    logger.info(" → %s", description)
    try:
        # Imported here so a missing or broken transformers install is
        # reported as a failed model rather than aborting the whole script.
        from transformers import pipeline as hf_pipeline

        # No force_download is requested, so the library's default caching
        # applies and re-runs can reuse files already on disk.
        pipe = hf_pipeline(
            task,
            model=repo_id,
            device=-1,  # CPU only — no CUDA dependency
        )

        # Quick smoke test to confirm the model is functional.
        if task == "token-classification":
            sample = "AlgoPharma setup test"
        else:
            sample = "This is a test"
        pipe(sample)

        # Drop the reference before the next model is loaded.
        del pipe
    except Exception as e:
        # Deliberate best-effort: any failure is logged and reported via the
        # return value so the remaining models are still attempted.
        logger.error("❌ Failed to download %s: %s", repo_id, e)
        return False

    logger.info("✅ %s cached & verified", repo_id)
    return True
def main() -> None:
    """Download the spaCy pipeline and every HuggingFace model, then report.

    Runs the spaCy download first, then each ``HF_MODELS`` entry in order.
    A failed download is recorded and the run continues; the names of all
    failed models are listed in the closing summary.
    """
    total_steps = 1 + len(HF_MODELS)  # one spaCy step + one per HF model
    failures: list[str] = []

    print()
    print("=" * 60)
    print(" AlgoPharma — Model Setup")
    print(" Run once after: uv sync")
    print("=" * 60)
    print()

    # Step 1: spaCy
    _step(1, total_steps, f"spaCy — {SPACY_MODEL}")
    if not download_spacy(SPACY_MODEL):
        failures.append(SPACY_MODEL)

    # Steps 2+: HuggingFace models
    for i, (repo_id, desc, task) in enumerate(HF_MODELS, start=2):
        _step(i, total_steps, repo_id)
        if not download_hf_model(repo_id, desc, task):
            failures.append(repo_id)

    print()
    print(DIVIDER)
    if failures:
        # NOTE(review): the function returns normally here, so the process
        # exit status stays 0 even when downloads failed — confirm whether
        # automation around this script needs a non-zero status.
        logger.warning("⚠️ %s model(s) failed to download:", len(failures))
        for name in failures:
            logger.warning(" • %s", name)
        logger.warning("Re-run this script or download manually.")
    else:
        logger.info("✅ All %s models downloaded and cached.", total_steps)
        logger.info("You can now start the MCP server and Celery worker.")
    print(DIVIDER)
# Run the setup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()