# Spaces: organicml / Algopharma_backend
# Build error
# File size: 5,620 Bytes
# b34e73d

"""
AlgoPharma — Model Setup Script
================================
Run this ONCE after `uv sync` to download all required models to disk.

    uv run python setup_models.py

What it does:
  1. Downloads the spaCy English pipeline (en_core_web_sm - 12 MB)
  2. Pre-caches all HuggingFace models used at runtime:
       - Drug NER   : OpenMed-NER-PharmaDetect-ModernClinical-149M
       - Disease NER: OpenMed-NER-DiseaseDetect-SuperClinical-184M
       - Sentiment  : cardiffnlp/twitter-roberta-base-sentiment-latest
       - PII (en)   : OpenMed-PII-SuperClinical-Small-44M-v1
       - PII (hi)   : OpenMed-PII-Hindi-SuperClinical-Small-44M-v1
       - PII (te)   : OpenMed-PII-Telugu-FastClinical-Small-82M-v1

All models are saved to the default HuggingFace cache
(~/.cache/huggingface/hub) so Celery workers pick them up instantly.
"""

import subprocess
import sys
import logging

# Ensure UTF-8 output on Windows (cp1252 can't handle ─ and ✅).
# sys.stdout may be None (no console attached) or a replacement stream that
# has no reconfigure() method, so both attributes are probed defensively
# instead of assuming a standard text wrapper; otherwise the script would
# die with AttributeError before doing any work.
if (
    hasattr(sys.stdout, "reconfigure")
    and (getattr(sys.stdout, "encoding", None) or "utf-8").lower() != "utf-8"
):
    sys.stdout.reconfigure(encoding="utf-8")

# Log records go to stdout, the same stream the script's print() calls use.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
# Named logger shared by every function in this script.
logger = logging.getLogger("setup_models")

# Horizontal rule (60 characters wide) used to separate sections of output.
DIVIDER = 60 * "─"

# ── 1. spaCy model ────────────────────────────────────────────
# Size comparison of the English pipelines:
#   en_core_web_sm  →  12 MB  (tok2vec + tagger + parser + NER)   ← used here
#   en_core_web_md  →  43 MB  (adds word vectors)
#   en_core_web_lg  → 741 MB  (larger vectors)                    ← do NOT use
# The small pipeline is already the smallest production-grade English option.
SPACY_MODEL = "en_core_web_sm"

# ── 2. HuggingFace models ─────────────────────────────────────
# Each entry is a 3-tuple: (repo_id, description, pipeline_task).
#   repo_id        – Hub repository passed to the pipeline as ``model``
#   description    – human-readable summary, only used in log output
#   pipeline_task  – task name handed to the transformers pipeline factory
HF_MODELS = [
    # Named-entity recognition (drugs, diseases)
    ("OpenMed/OpenMed-NER-PharmaDetect-ModernClinical-149M",
     "Drug NER – 149M params, CPU-friendly",
     "token-classification"),
    ("OpenMed/OpenMed-NER-DiseaseDetect-SuperClinical-184M",
     "Disease NER – 184M params, CPU-friendly",
     "token-classification"),
    # Sentiment
    # NOTE(review): the repo name indicates a Twitter-trained model; confirm
    # the "medical" wording in this description against the model card.
    ("cardiffnlp/twitter-roberta-base-sentiment-latest",
     "Sentiment – 125M RoBERTa, medical-social-media fine-tuned",
     "sentiment-analysis"),
    # PII detection, one model per language (English, Hindi, Telugu)
    ("OpenMed/OpenMed-PII-SuperClinical-Small-44M-v1",
     "PII (English) – 44M DeBERTa, clinical de-identification",
     "token-classification"),
    ("OpenMed/OpenMed-PII-Hindi-SuperClinical-Small-44M-v1",
     "PII (Hindi) – 44M, Hindi clinical de-identification",
     "token-classification"),
    ("OpenMed/OpenMed-PII-Telugu-FastClinical-Small-82M-v1",
     "PII (Telugu) – 82M, Telugu clinical de-identification",
     "token-classification"),
]


def _step(n: int, total: int, msg: str) -> None:
    """Log a progress line of the form ``[n/total] msg`` at INFO level."""
    progress = f"{n}/{total}"
    logger.info("[%s] %s", progress, msg)


def download_spacy(model: str) -> bool:
    """Fetch a spaCy pipeline by running ``python -m spacy download`` as a child process.

    The command is run with the interpreter executing this script
    (``sys.executable``); its output is not captured, so it streams straight
    to the console.

    Args:
        model: Name of the spaCy pipeline package to download.

    Returns:
        True when the child process exits with status 0, False otherwise.
    """
    # NOTE(review): spaCy's download command performs a pip install of the
    # pipeline package — confirm pip is available in the uv-managed environment.
    logger.info(DIVIDER)
    logger.info(f"Downloading spaCy model: {model}")

    command = [sys.executable, "-m", "spacy", "download", model]
    completed = subprocess.run(command, capture_output=False)

    if completed.returncode != 0:
        logger.error(f"❌ spaCy download failed for {model}")
        return False

    logger.info(f"✅ spaCy {model} ready")
    return True


def download_hf_model(repo_id: str, description: str, task: str) -> bool:
    """Build the transformers pipeline for one model and run a sample through it.

    Constructing the pipeline is what pulls the model files; the sample
    inference afterwards confirms the loaded model actually runs.

    Args:
        repo_id: Hub repository identifier, passed to the pipeline as ``model``.
        description: Human-readable summary; only written to the log.
        task: Pipeline task name (e.g. ``"token-classification"``).

    Returns:
        True when the pipeline was built and the sample inference completed;
        False when any exception was raised along the way (the error is logged,
        not propagated).
    """
    logger.info(DIVIDER)
    logger.info(f"Downloading: {repo_id}")
    logger.info(f"  → {description}")

    # Token-classification models get a domain-flavoured sample; every other
    # task gets a generic sentence.
    sample_text = (
        "AlgoPharma setup test"
        if task == "token-classification"
        else "This is a test"
    )

    try:
        # Imported lazily so a missing transformers install is reported as a
        # per-model failure rather than aborting the whole script.
        from transformers import pipeline as build_pipeline

        # No forced re-download is requested: the pipeline is created with
        # the library's default caching behaviour.
        model_pipeline = build_pipeline(
            task,
            model=repo_id,
            device=-1,  # CPU only — no CUDA dependency
        )
        model_pipeline(sample_text)  # smoke test
        del model_pipeline  # drop the reference before the next model loads
        logger.info(f"✅ {repo_id} cached & verified")
        return True
    except Exception as exc:
        logger.error(f"❌ Failed to download {repo_id}: {exc}")
        return False


def main() -> int:
    """Download every required model and report the overall outcome.

    Prints a banner, downloads the spaCy pipeline named by ``SPACY_MODEL``,
    then processes each entry of ``HF_MODELS`` in order. A failed download
    does not stop the run: the model name is recorded and listed in the
    final summary so one bad model does not block the rest.

    Returns:
        ``0`` when every model was set up, ``1`` when at least one failed.
        The entry-point guard hands this value to ``sys.exit`` so callers
        such as CI jobs or image builds can detect a partial setup instead
        of always seeing a successful exit status.
    """
    total_steps = 1 + len(HF_MODELS)
    failures: list[str] = []

    print()
    print("=" * 60)
    print("  AlgoPharma — Model Setup")
    print("  Run once after: uv sync")
    print("=" * 60)
    print()

    # Step 1: spaCy
    _step(1, total_steps, f"spaCy — {SPACY_MODEL}")
    if not download_spacy(SPACY_MODEL):
        failures.append(SPACY_MODEL)

    # Steps 2+: HuggingFace models (numbering continues after the spaCy step)
    for i, (repo_id, desc, task) in enumerate(HF_MODELS, start=2):
        _step(i, total_steps, repo_id)
        if not download_hf_model(repo_id, desc, task):
            failures.append(repo_id)

    print()
    print(DIVIDER)
    if failures:
        logger.warning(f"⚠️  {len(failures)} model(s) failed to download:")
        for name in failures:
            logger.warning(f"   • {name}")
        logger.warning("Re-run this script or download manually.")
    else:
        logger.info(f"✅ All {total_steps} models downloaded and cached.")
        logger.info("You can now start the MCP server and Celery worker.")
    print(DIVIDER)

    # Non-zero status signals that at least one download failed.
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())