"""Populate the SkillEmbedding table with SBERT vectors for every catalog Skill.

Idempotent. Re-run after any `seed_initial_skills` change to refresh the
embeddings. One-shot script (not a management command) mirroring
`parse_onet_dump.py` style.

Usage:
    python backend/scripts/build_skill_embeddings.py

Takes ~15 s for ~70 skills (model load dominates — the encode step itself
is <1 s). Writes via update_or_create keyed on Skill so existing rows get
overwritten with the new vector.

Requires:
  * pgvector extension installed on the DB + migration 0003 applied.
  * sentence-transformers + torch in the venv (requirements.txt).
  * all-MiniLM-L6-v2 already cached on disk, or network access for the
    first-call download.
"""
from __future__ import annotations

import os
import sys
from pathlib import Path

# Let this script run from anywhere: put the backend directory (the parent of
# this file's directory) at the front of sys.path so the `config` and `apps`
# packages resolve regardless of the current working directory.
BACKEND_DIR = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(BACKEND_DIR))

# setdefault keeps any DJANGO_SETTINGS_MODULE already set in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")

# Django must be configured before the model modules are imported, which is
# why the imports below follow executable code (hence the E402 suppressions).
import django  # noqa: E402
django.setup()

from django.db import transaction  # noqa: E402

from apps.skills.models import Skill, SkillEmbedding  # noqa: E402


# Model identifier passed to SentenceTransformer(); only the part after the
# last "/" is stored in SkillEmbedding.model_name.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


def _encoding_text(skill: Skill) -> str:
    """Build the text that gets embedded for ``skill``.

    The result is ``skill_name + " — " + context``, where ``context`` is the
    stripped description, falling back to the stripped category when the
    description is missing or blank. Pairing the name with a short semantic
    bridge keeps ambiguous names in the right neighbourhood: "Python"
    embedded alone sits near "python snake" and "Monty Python" in SBERT
    space; "Python — Programming" stays inside the tech cluster.

    Args:
        skill: Catalog entry; its ``skill_name``, ``description`` and
            ``category`` attributes are read.

    Returns:
        ``"<skill_name> — <context>"``, or ``skill_name`` unchanged when
        both description and category are missing or whitespace-only.
    """
    # Treat None like "" for BOTH fields, so a skill with no category falls
    # back to the bare name instead of raising AttributeError on .strip().
    context = (skill.description or "").strip() or (skill.category or "").strip()
    if context:
        return f"{skill.skill_name} — {context}"
    return skill.skill_name


def main() -> int:
    """Embed every catalog Skill and upsert one SkillEmbedding row per skill.

    Returns:
        Process exit code: 1 when the Skill table is empty (nothing is
        encoded or written), 0 once every row has been created or refreshed.
    """
    # Deferred import: if sentence-transformers is missing, the failure
    # surfaces here inside main() rather than at module import time.
    from sentence_transformers import SentenceTransformer

    catalog = list(Skill.objects.all().order_by("id"))
    if not catalog:
        print("No skills in the catalog — run seed_initial_skills first.")
        return 1

    print(f"Loading {MODEL_NAME}…")
    encoder = SentenceTransformer(MODEL_NAME)

    texts = list(map(_encoding_text, catalog))
    print(f"Encoding {len(texts)} skills…")
    vectors = encoder.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    # Only the final path segment of MODEL_NAME is stored with each row.
    stored_model_name = MODEL_NAME.rsplit("/", 1)[-1]
    n_created = 0
    n_updated = 0
    # A single transaction wraps the whole batch of upserts.
    with transaction.atomic():
        for skill, source_text, vector in zip(catalog, texts, vectors):
            row_values = {
                "embedding": vector.tolist(),
                "source_text": source_text,
                "model_name": stored_model_name,
            }
            _row, is_new = SkillEmbedding.objects.update_or_create(
                skill=skill,
                defaults=row_values,
            )
            if is_new:
                n_created += 1
            else:
                n_updated += 1

    summary = (
        f"Done: {n_created} created, {n_updated} updated, "
        f"{len(catalog)} total."
    )
    print(summary)
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())