# skill_extraction.py
"""Extract skills from free text via sentence-embedding similarity.

Module-level setup (runs at import): loads the spaCy sentence splitter,
the MiniLM sentence-embedding model, and a skill vocabulary from
``skills_vocab.json``, then precomputes a normalized embedding for every
skill in the vocabulary.
"""
import json

import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
from spacy.cli import download

# Ensure the spaCy model is available; download it on first run.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Embedding model. Embeddings are normalized, so a plain dot product
# below is the cosine similarity.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load skills from JSON (expects an object with a "skills" array).
with open("skills_vocab.json", "r", encoding="utf-8") as f:
    SKILL_VOCAB = json.load(f)["skills"]

# Precompute one normalized embedding per skill, shape (n_skills, dim).
skill_embeddings = model.encode(
    SKILL_VOCAB, convert_to_numpy=True, normalize_embeddings=True
)

# Map skill -> embedding for quick lookup.
SKILL_TO_EMB = {skill: emb for skill, emb in zip(SKILL_VOCAB, skill_embeddings)}


def extract_skills(text, threshold=0.50):
    """Extract skills from *text*.

    Splits the text into sentences, embeds each sentence, and reports every
    skill whose best cosine similarity against any sentence reaches
    *threshold*.

    Args:
        text: Free-form input text.
        threshold: Minimum cosine similarity for a skill to be reported.
            The raw (unrounded) similarity is compared against this value.

    Returns:
        List of ``(skill, confidence, evidence_sentence)`` tuples sorted by
        confidence, descending. Confidence is the best similarity rounded to
        2 decimals; evidence is the best-matching sentence.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        return []

    sentence_embeddings = model.encode(
        sentences, convert_to_numpy=True, normalize_embeddings=True
    )

    # One matrix product yields every (sentence, skill) similarity at once,
    # replacing the original per-skill Python loop of np.dot calls.
    sim_matrix = sentence_embeddings @ skill_embeddings.T  # (n_sent, n_skill)
    best_sentence = np.argmax(sim_matrix, axis=0)          # best sentence per skill
    best_sim = sim_matrix[best_sentence, np.arange(sim_matrix.shape[1])]

    # Keep skills whose best raw similarity clears the threshold; round the
    # stored confidence only (matches the original threshold semantics).
    results = [
        (SKILL_VOCAB[j], round(float(best_sim[j]), 2), sentences[int(best_sentence[j])])
        for j in range(len(SKILL_VOCAB))
        if float(best_sim[j]) >= threshold
    ]
    results.sort(key=lambda x: x[1], reverse=True)
    return results


def get_skill_embedding(skill_name):
    """Return the embedding vector for a skill, or None if not in vocab."""
    return SKILL_TO_EMB.get(skill_name)