# gapguide-api/scripts/build_skill_embeddings.py
# Source: "Deploy GapGuide backend (Docker)" (commit ffd36e0, arifRB)
"""Populate the SkillEmbedding table with SBERT vectors for every catalog Skill.
Idempotent. Re-run after any `seed_initial_skills` change to refresh the
embeddings. One-shot script (not a management command) mirroring
`parse_onet_dump.py` style.
Usage:
python backend/scripts/build_skill_embeddings.py
Takes ~15 s for ~70 skills (model load dominates — the encode step itself
is <1 s). Writes via update_or_create keyed on Skill so existing rows get
overwritten with the new vector.
Requires:
* pgvector extension installed on the DB + migration 0003 applied.
* sentence-transformers + torch in the venv (requirements.txt).
* all-MiniLM-L6-v2 on disk or network accessible for first-call download.
"""
from __future__ import annotations
import os
import sys
from pathlib import Path

# Let this script run from anywhere — set up the backend module path first.
# BACKEND_DIR is two levels up from this file (the parent of the directory
# that contains the script).
BACKEND_DIR = Path(__file__).resolve().parent.parent
# Put it at the front of sys.path so the imports further down resolve no
# matter what the current working directory is (presumably `config` and
# `apps` live directly under BACKEND_DIR — confirm against the repo layout).
sys.path.insert(0, str(BACKEND_DIR))
# setdefault: a DJANGO_SETTINGS_MODULE already present in the environment wins.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")

# Order matters from here on: Django is imported and set up only after the
# path and settings variable are in place, and the model imports come only
# after django.setup(). Hence the late imports and the E402 suppressions.
import django  # noqa: E402
django.setup()
from django.db import transaction  # noqa: E402
from apps.skills.models import Skill, SkillEmbedding  # noqa: E402

# Model identifier handed to SentenceTransformer in main(); the segment after
# the last "/" is what main() stores in SkillEmbedding.model_name.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def _encoding_text(skill: Skill) -> str:
"""What we embed for each skill.
Use ``skill_name + " — " + (description or category)`` so the embedding
carries both the exact name AND a short semantic bridge. "Python"
embedded alone sits near "python snake" and "Monty Python" in SBERT
space; "Python — Programming" stays inside the tech cluster.
"""
context = (skill.description or "").strip() or skill.category.strip()
if context:
return f"{skill.skill_name}{context}"
return skill.skill_name
def main() -> int:
    """Embed every catalog Skill and upsert the vectors into SkillEmbedding.

    Loads all Skill rows ordered by id, encodes one text per skill with the
    SentenceTransformer named by MODEL_NAME (normalized embeddings), and
    writes one SkillEmbedding per skill via update_or_create inside a single
    transaction.

    Returns:
        1 if the Skill table is empty (nothing is loaded or written),
        0 once every skill has been written.
    """
    # Deferred import: a missing sentence-transformers install then fails
    # here, when main() runs, rather than while the module is being imported.
    from sentence_transformers import SentenceTransformer

    catalog = list(Skill.objects.all().order_by("id"))
    if not catalog:
        print("No skills in the catalog — run seed_initial_skills first.")
        return 1

    print(f"Loading {MODEL_NAME}…")
    encoder = SentenceTransformer(MODEL_NAME)

    sources = [_encoding_text(entry) for entry in catalog]
    print(f"Encoding {len(sources)} skills…")
    embeddings = encoder.encode(
        sources,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    # Only the part after the last "/" is stored alongside each vector.
    short_model_name = MODEL_NAME.rpartition("/")[2]

    # Tally keyed by update_or_create's "created" flag.
    tally = {True: 0, False: 0}
    with transaction.atomic():
        for entry, source, embedding in zip(catalog, sources, embeddings):
            row_values = {
                "embedding": embedding.tolist(),
                "source_text": source,
                "model_name": short_model_name,
            }
            _, was_created = SkillEmbedding.objects.update_or_create(
                skill=entry,
                defaults=row_values,
            )
            tally[bool(was_created)] += 1

    n_created, n_updated = tally[True], tally[False]
    print(f"Done: {n_created} created, {n_updated} updated, "
          f"{len(catalog)} total.")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())