gapguide-api / scripts /prefetch_models.py
arifRB's picture
Deploy GapGuide backend (Docker)
ffd36e0 verified
Raw
History Blame Contribute Delete
1.83 kB
"""Bake the NER-chain models into the Docker image at build time.
Run as a Dockerfile RUN step so the ~1 GB of HF weights + spaCy + skillNer's
nltk/EMSI assets are present in the image layers. The first resume parse in
production is then fast and offline-safe instead of triggering a ~1 GB download
on an ephemeral disk.
Each load is wrapped so one flaky download cannot fail the whole image build —
a skipped model simply falls back to a lazy download at runtime (slower first
parse, but the deploy still succeeds). Model IDs mirror
apps/accounts/ner/{nucha,jobbert,sbert,skillner}.py exactly. No Django/DB needed.
"""
from __future__ import annotations
def _try(label: str, fn) -> bool:
    """Run one prefetch step and report its outcome instead of raising.

    Args:
        label: Short name of the model/asset; used only in the log line.
        fn: Zero-argument callable that performs the load. Its return value
            is discarded.

    Returns:
        True if ``fn`` completed, False if it raised and the step was
        skipped. Callers that ignore the result behave exactly as before.
    """
    try:
        fn()
    except Exception as e:  # noqa: BLE001 - intentional: never fail the build
        # str(e) alone is frequently empty or ambiguous (OSError() -> "",
        # KeyError("k") -> "'k'"), so include the exception type to keep a
        # swallowed failure diagnosable from the build log.
        print(f"prefetch SKIPPED {label}: {type(e).__name__}: {e}")
        return False
    else:
        print(f"prefetch ok: {label}")
        return True
def main() -> None:
    """Warm the caches for every model used by the NER chain.

    Stages run in a fixed order: nucha, jobbert, sbert, skillner. Each stage
    performs its own third-party imports *inside* the callable handed to
    ``_try``, so a failure while importing one library is reported as a
    skipped stage rather than aborting the remaining stages and the image
    build.

    Trade-off: a package that is missing from the image is now logged as
    "SKIPPED" instead of crashing the build, matching the module's stated
    "never fail the build" policy. Check the build log for skipped stages.
    """

    def _hf_ner(model_id: str):
        """Return a loader that builds a Hugging Face NER pipeline."""

        def _load() -> None:
            from transformers import pipeline

            pipeline(
                task="ner",
                model=model_id,
                aggregation_strategy="simple",
            )

        return _load

    def _sbert() -> None:
        from sentence_transformers import SentenceTransformer

        SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def _skillner() -> None:
        # NOTE(review): skillNer appears to fetch its skill DB / nltk data as
        # a side effect of being imported - confirm. Keeping these imports
        # inside the guarded stage means such a fetch cannot break the build.
        import spacy
        from skillNer.general_params import SKILL_DB
        from skillNer.skill_extractor_class import SkillExtractor
        from spacy.matcher import PhraseMatcher

        SkillExtractor(spacy.load("en_core_web_sm"), SKILL_DB, PhraseMatcher)

    stages = (
        ("nucha", _hf_ner("Nucha/Nucha_ITSkillNER_BERT")),
        ("jobbert", _hf_ner("jjzha/jobbert_skill_extraction")),
        ("sbert", _sbert),
        ("skillner", _skillner),
    )
    for label, load in stages:
        _try(label, load)

    print("prefetch complete")
# Entry point: prefetch only when executed as a script (e.g. from a Dockerfile
# RUN step); importing this module merely defines the functions above.
if __name__ == "__main__":
    main()