"""Bake the NER-chain models into the Docker image at build time.

Run as a Dockerfile RUN step so the ~1 GB of HF weights + spaCy + skillNer's
nltk/EMSI assets are present in the image layers. The first resume parse in
production is then fast and offline-safe instead of triggering a ~1 GB
download on an ephemeral disk.

Each load — including the import of the library that performs it — is wrapped
so one flaky download or broken import cannot fail the whole image build — a
skipped model simply falls back to a lazy download at runtime (slower first
parse, but the deploy still succeeds).

Model IDs mirror apps/accounts/ner/{nucha,jobbert,sbert,skillner}.py exactly.
No Django/DB needed.
"""

from __future__ import annotations

from collections.abc import Callable


def _try(label: str, fn: Callable[[], object]) -> None:
    """Run ``fn`` and report the outcome on stdout without ever raising.

    Args:
        label: Short model name used in the progress message.
        fn: Zero-argument callable that performs the load; its return value
            is discarded.
    """
    try:
        fn()
    except Exception as e:  # noqa: BLE001 - intentional: never fail the build
        print(f"prefetch SKIPPED {label}: {e}")
    else:
        print(f"prefetch ok: {label}")


def _prefetch_hf_ner(model_id: str) -> None:
    """Instantiate a Hugging Face NER pipeline for ``model_id``."""
    # Imported here rather than in main() so that an import failure is caught
    # by _try and only skips this model instead of aborting the whole script.
    from transformers import pipeline

    pipeline(task="ner", model=model_id, aggregation_strategy="simple")


def _prefetch_sbert() -> None:
    """Instantiate the sentence-embedding model."""
    from sentence_transformers import SentenceTransformer

    SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def _prefetch_skillner() -> None:
    """Instantiate skillNer's extractor on top of the small English spaCy model."""
    # NOTE(review): importing skillNer presumably loads (and may fetch) its
    # nltk/EMSI assets at import time — confirm. Keeping the imports inside
    # the guarded callable covers that case either way.
    import spacy
    from skillNer.general_params import SKILL_DB
    from skillNer.skill_extractor_class import SkillExtractor
    from spacy.matcher import PhraseMatcher

    SkillExtractor(spacy.load("en_core_web_sm"), SKILL_DB, PhraseMatcher)


def main() -> None:
    """Attempt every model load in turn; a failure in one never blocks the rest."""
    _try("nucha", lambda: _prefetch_hf_ner("Nucha/Nucha_ITSkillNER_BERT"))
    _try("jobbert", lambda: _prefetch_hf_ner("jjzha/jobbert_skill_extraction"))
    _try("sbert", _prefetch_sbert)
    _try("skillner", _prefetch_skillner)
    print("prefetch complete")


if __name__ == "__main__":
    main()