Spaces:
Sleeping
Sleeping
| """Populate the SkillEmbedding table with SBERT vectors for every catalog Skill. | |
| Idempotent. Re-run after any `seed_initial_skills` change to refresh the | |
| embeddings. One-shot script (not a management command) mirroring | |
| `parse_onet_dump.py` style. | |
| Usage: | |
| python backend/scripts/build_skill_embeddings.py | |
| Takes ~15 s for ~70 skills (model load dominates — the encode step itself | |
| is <1 s). Writes via update_or_create keyed on Skill so existing rows get | |
| overwritten with the new vector. | |
| Requires: | |
| * pgvector extension installed on the DB + migration 0003 applied. | |
| * sentence-transformers + torch in the venv (requirements.txt). | |
| * all-MiniLM-L6-v2 on disk or network accessible for first-call download. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import sys | |
| from pathlib import Path | |
# Let this script run from anywhere — set up the backend module path first.
# BACKEND_DIR is the directory two levels above this file; it is pushed to
# the front of sys.path so that `config.settings` and `apps.*` (presumably
# located there — verify against the repo layout) resolve on import.
BACKEND_DIR = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(BACKEND_DIR))
# setdefault: a DJANGO_SETTINGS_MODULE already present in the environment
# takes precedence over this fallback.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
import django  # noqa: E402
# Initialise Django before the ORM/model imports that follow.
django.setup()
from django.db import transaction  # noqa: E402
from apps.skills.models import Skill, SkillEmbedding  # noqa: E402
# Model identifier handed to SentenceTransformer in main(); only the part
# after the last "/" is written to SkillEmbedding.model_name.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def _encoding_text(skill: Skill) -> str:
    """Build the text that gets embedded for one skill.

    The result is ``"<skill_name> — <context>"``, where the context is the
    skill's stripped description or, when that is empty, its stripped
    category. Pairing the name with a short semantic bridge keeps ambiguous
    names in the right neighbourhood: "Python" embedded alone sits near
    "python snake" and "Monty Python" in SBERT space, while
    "Python — Programming" stays inside the tech cluster.

    Args:
        skill: Catalog entry; ``skill_name``, ``description`` and
            ``category`` are read. ``description`` and ``category`` may be
            ``None`` or blank.

    Returns:
        The name joined to its context with " — ", or the bare name when
        neither a description nor a category is available.
    """
    # Treat None like "" for BOTH fields so a skill without a category falls
    # back to its bare name instead of raising AttributeError on .strip().
    description = (skill.description or "").strip()
    context = description or (skill.category or "").strip()
    return f"{skill.skill_name} — {context}" if context else skill.skill_name
def main() -> int:
    """Encode every catalog Skill and upsert its SkillEmbedding row.

    Loads all Skill rows ordered by id, encodes their texts in one batch
    with the model named by MODEL_NAME (normalized embeddings requested),
    and writes one SkillEmbedding per skill via update_or_create inside a
    single transaction.

    Returns:
        1 when the catalog is empty (nothing is loaded or written),
        0 after all rows have been written.
    """
    # Imported inside main() rather than at module scope, so a missing
    # sentence-transformers install fails here, not while the module loads.
    from sentence_transformers import SentenceTransformer

    catalog = list(Skill.objects.all().order_by("id"))
    if not catalog:
        print("No skills in the catalog — run seed_initial_skills first.")
        return 1

    print(f"Loading {MODEL_NAME}…")
    encoder = SentenceTransformer(MODEL_NAME)

    source_texts = [_encoding_text(entry) for entry in catalog]
    print(f"Encoding {len(source_texts)} skills…")
    embeddings = encoder.encode(
        source_texts,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    # Stored model name is the identifier without its organisation prefix.
    short_model_name = MODEL_NAME.rsplit("/", 1)[-1]

    # Keyed by update_or_create's "created" flag: True -> new, False -> updated.
    tally = {True: 0, False: 0}
    with transaction.atomic():
        for entry, source_text, embedding in zip(catalog, source_texts, embeddings):
            _, was_created = SkillEmbedding.objects.update_or_create(
                skill=entry,
                defaults={
                    "embedding": embedding.tolist(),
                    "source_text": source_text,
                    "model_name": short_model_name,
                },
            )
            tally[was_created] += 1

    print(
        f"Done: {tally[True]} created, {tally[False]} updated, "
        f"{len(catalog)} total."
    )
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())