"""Populate the SkillEmbedding table with SBERT vectors for every catalog Skill. Idempotent. Re-run after any `seed_initial_skills` change to refresh the embeddings. One-shot script (not a management command) mirroring `parse_onet_dump.py` style. Usage: python backend/scripts/build_skill_embeddings.py Takes ~15 s for ~70 skills (model load dominates — the encode step itself is <1 s). Writes via update_or_create keyed on Skill so existing rows get overwritten with the new vector. Requires: * pgvector extension installed on the DB + migration 0003 applied. * sentence-transformers + torch in the venv (requirements.txt). * all-MiniLM-L6-v2 on disk or network accessible for first-call download. """ from __future__ import annotations import os import sys from pathlib import Path # Let this script run from anywhere — set up the backend module path first. BACKEND_DIR = Path(__file__).resolve().parent.parent sys.path.insert(0, str(BACKEND_DIR)) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings") import django # noqa: E402 django.setup() from django.db import transaction # noqa: E402 from apps.skills.models import Skill, SkillEmbedding # noqa: E402 MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" def _encoding_text(skill: Skill) -> str: """What we embed for each skill. Use ``skill_name + " — " + (description or category)`` so the embedding carries both the exact name AND a short semantic bridge. "Python" embedded alone sits near "python snake" and "Monty Python" in SBERT space; "Python — Programming" stays inside the tech cluster. """ context = (skill.description or "").strip() or skill.category.strip() if context: return f"{skill.skill_name} — {context}" return skill.skill_name def main() -> int: # Lazy import so a failure here is visible, not an ImportError at the top. from sentence_transformers import SentenceTransformer skills = list(Skill.objects.all().order_by("id")) if not skills: print("No skills in the catalog — run seed_initial_skills first.") return 1 print(f"Loading {MODEL_NAME}…") model = SentenceTransformer(MODEL_NAME) texts = [_encoding_text(s) for s in skills] print(f"Encoding {len(texts)} skills…") vectors = model.encode(texts, normalize_embeddings=True, show_progress_bar=False) created = 0 updated = 0 with transaction.atomic(): for skill, text, vec in zip(skills, texts, vectors): _, was_created = SkillEmbedding.objects.update_or_create( skill=skill, defaults={ "embedding": vec.tolist(), "source_text": text, "model_name": MODEL_NAME.rsplit("/", 1)[-1], }, ) if was_created: created += 1 else: updated += 1 print(f"Done: {created} created, {updated} updated, " f"{len(skills)} total.") return 0 if __name__ == "__main__": sys.exit(main())