Spaces:

arifRB
/

gapguide-api

Sleeping

App Files Files Community

gapguide-api / scripts /build_skill_embeddings.py

arifRB

Deploy GapGuide backend (Docker)

ffd36e0 verified 15 days ago

Raw

History Blame Contribute Delete

3.09 kB

	"""Populate the SkillEmbedding table with SBERT vectors for every catalog Skill.

	Idempotent. Re-run after any `seed_initial_skills` change to refresh the
	embeddings. One-shot script (not a management command) mirroring
	`parse_onet_dump.py` style.

	Usage:
	python backend/scripts/build_skill_embeddings.py

	Takes ~15 s for ~70 skills (model load dominates — the encode step itself
	is <1 s). Writes via update_or_create keyed on Skill so existing rows get
	overwritten with the new vector.

	Requires:
	* pgvector extension installed on the DB + migration 0003 applied.
	* sentence-transformers + torch in the venv (requirements.txt).
	* all-MiniLM-L6-v2 on disk or network accessible for first-call download.
	"""
	from __future__ import annotations

	import os
	import sys
	from pathlib import Path

	# Let this script run from anywhere — set up the backend module path first.
	# BACKEND_DIR is the parent of the directory that contains this file; putting
	# it at the front of sys.path lets the `config` and `apps` packages imported
	# below resolve regardless of the current working directory.
	BACKEND_DIR = Path(__file__).resolve().parent.parent
	sys.path.insert(0, str(BACKEND_DIR))

	# setdefault leaves an already-exported DJANGO_SETTINGS_MODULE untouched and
	# only falls back to "config.settings" when the variable is unset.
	os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")

	# Django has to be configured before any model module is imported, which is
	# why the imports below come after executable statements (hence the E402
	# suppressions).
	import django # noqa: E402
	django.setup()

	from django.db import transaction # noqa: E402

	from apps.skills.models import Skill, SkillEmbedding # noqa: E402


	# Model identifier handed to SentenceTransformer in main(). Only the segment
	# after the last "/" is written to the SkillEmbedding "model_name" field.
	MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


	def _encoding_text(skill: Skill) -> str:
	    """Build the text that is embedded for ``skill``.

	    The result is ``skill_name + " — " + context`` where ``context`` is the
	    stripped description, or the stripped category when the description is
	    missing or blank. Pairing the name with a short semantic bridge keeps
	    ambiguous names in the right neighbourhood: "Python" embedded alone sits
	    near "python snake" and "Monty Python" in SBERT space, whereas
	    "Python — Programming" stays inside the tech cluster.

	    Args:
	        skill: Object exposing ``skill_name``, ``description`` and
	            ``category`` attributes. ``description`` and ``category`` may be
	            ``None``, empty, or whitespace-only.

	    Returns:
	        ``"<skill_name> — <context>"`` when a non-blank context exists,
	        otherwise just ``skill_name``.
	    """
	    # Both optional fields are coerced with ``or ""`` so a NULL category no
	    # longer raises AttributeError when the description is also empty.
	    description = (skill.description or "").strip()
	    context = description or (skill.category or "").strip()
	    if not context:
	        return skill.skill_name
	    return f"{skill.skill_name} — {context}"


	def main() -> int:
	    """Embed every catalog skill and upsert the vectors into SkillEmbedding.

	    Loads all ``Skill`` rows ordered by id, encodes the text produced by
	    ``_encoding_text`` for each one with the ``MODEL_NAME`` model, and writes
	    one ``SkillEmbedding`` per skill via ``update_or_create`` inside a single
	    transaction. Progress and a created/updated summary are printed to
	    stdout.

	    Returns:
	        Exit status for ``sys.exit``: ``1`` when the catalog has no skills
	        (nothing is loaded or written), ``0`` after a successful run.
	    """
	    # Lazy import so a failure here is visible, not an ImportError at the top.
	    from sentence_transformers import SentenceTransformer

	    skills = list(Skill.objects.all().order_by("id"))
	    if not skills:
	        print("No skills in the catalog — run seed_initial_skills first.")
	        return 1

	    print(f"Loading {MODEL_NAME}…")
	    model = SentenceTransformer(MODEL_NAME)

	    texts = [_encoding_text(s) for s in skills]
	    print(f"Encoding {len(texts)} skills…")
	    vectors = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)

	    # Loop-invariant: the stored model name is MODEL_NAME without its
	    # "<organisation>/" prefix, so compute it once instead of once per row.
	    stored_model_name = MODEL_NAME.rsplit("/", 1)[-1]

	    created = 0
	    updated = 0
	    # One transaction for the whole batch: a failure part-way through rolls
	    # back every write instead of leaving a half-refreshed table.
	    with transaction.atomic():
	        for skill, text, vec in zip(skills, texts, vectors):
	            _, was_created = SkillEmbedding.objects.update_or_create(
	                skill=skill,
	                defaults={
	                    # .tolist() converts the encoder's per-row output into a
	                    # plain Python list before it is handed to the ORM.
	                    "embedding": vec.tolist(),
	                    "source_text": text,
	                    "model_name": stored_model_name,
	                },
	            )
	            if was_created:
	                created += 1
	            else:
	                updated += 1

	    print(f"Done: {created} created, {updated} updated, "
	          f"{len(skills)} total.")
	    return 0


	# Script entry point: main()'s return value becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())