Spaces:

arifRB
/

gapguide-api

Sleeping

App Files Files Community

gapguide-api / scripts /prefetch_models.py

arifRB

Deploy GapGuide backend (Docker)

ffd36e0 verified 15 days ago

Raw

History Blame Contribute Delete

1.83 kB

	"""Bake the NER-chain models into the Docker image at build time.

	Run as a Dockerfile RUN step so the ~1 GB of HF weights + spaCy + skillNer's
	nltk/EMSI assets are present in the image layers. The first resume parse in
	production is then fast and offline-safe instead of triggering a ~1 GB download
	on an ephemeral disk.

	Each load is wrapped so one flaky download cannot fail the whole image build —
	a skipped model simply falls back to a lazy download at runtime (slower first
	parse, but the deploy still succeeds). Model IDs mirror
	apps/accounts/ner/{nucha,jobbert,sbert,skillner}.py exactly. No Django/DB needed.
	"""
	from __future__ import annotations


	def _try(label, fn):
	    """Run one prefetch step and report the outcome instead of raising.

	    Args:
	        label: Short name of the model/asset; used only in the log line.
	        fn: Zero-argument callable that performs the load/download.

	    Prints ``prefetch ok: <label>`` when ``fn`` returns. Any ``Exception``
	    raised inside the step is swallowed and reported as
	    ``prefetch SKIPPED <label>: <ExceptionType>: <message>`` so the caller
	    can carry on with the next step.
	    """
	    try:
	        fn()
	        print(f"prefetch ok: {label}")
	    except Exception as e:  # noqa: BLE001 - intentional: never fail the build
	        # Include the exception class: str(e) alone can be empty or cryptic
	        # (e.g. a bare KeyError), which makes a skipped step hard to diagnose.
	        print(f"prefetch SKIPPED {label}: {type(e).__name__}: {e}")


	def main() -> None:
	    """Prefetch every model/asset of the NER chain, one guarded step each.

	    Steps run in this order: the two Hugging Face NER pipelines ("nucha",
	    "jobbert"), the sentence-transformers encoder ("sbert"), then skillNer's
	    extractor built on spaCy's ``en_core_web_sm`` ("skillner"). Every step is
	    handed to ``_try``, so a failing step is logged and the remaining steps
	    still run.

	    The third-party imports live inside the step callables on purpose: an
	    import that fails (missing package, or a module that raises while it is
	    being imported) is then reported as a skipped step instead of aborting
	    the run before the later steps are attempted.
	    """

	    def _ner_pipeline(model_id):
	        # Imported lazily so a broken transformers install only skips the
	        # steps that actually need it.
	        from transformers import pipeline

	        return pipeline(
	            task="ner",
	            model=model_id,
	            aggregation_strategy="simple",
	        )

	    def _sbert():
	        from sentence_transformers import SentenceTransformer

	        return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

	    def _skillner():
	        # Same import order as before; only the scope changed, so any
	        # import-time failure is caught by the guard around this step.
	        import spacy
	        from skillNer.general_params import SKILL_DB
	        from skillNer.skill_extractor_class import SkillExtractor
	        from spacy.matcher import PhraseMatcher

	        return SkillExtractor(spacy.load("en_core_web_sm"), SKILL_DB, PhraseMatcher)

	    _try("nucha", lambda: _ner_pipeline("Nucha/Nucha_ITSkillNER_BERT"))
	    _try("jobbert", lambda: _ner_pipeline("jjzha/jobbert_skill_extraction"))
	    _try("sbert", _sbert)
	    _try("skillner", _skillner)

	    print("prefetch complete")


	# Script entry point: run the prefetch only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	main()